{ "best_metric": 0.2571257948875427, "best_model_checkpoint": "./model_outputs/checkpoint-9250", "epoch": 1.922687715037442, "eval_steps": 50, "global_step": 9500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00020238818053025704, "grad_norm": 4.267368316650391, "learning_rate": 4e-05, "loss": 2.089, "step": 1 }, { "epoch": 0.0004047763610605141, "grad_norm": 4.084916114807129, "learning_rate": 8e-05, "loss": 2.0306, "step": 2 }, { "epoch": 0.0006071645415907711, "grad_norm": 1.5448576211929321, "learning_rate": 0.00012, "loss": 1.8954, "step": 3 }, { "epoch": 0.0008095527221210282, "grad_norm": null, "learning_rate": 0.00012, "loss": 2.0132, "step": 4 }, { "epoch": 0.0010119409026512851, "grad_norm": null, "learning_rate": 0.00012, "loss": 1.9737, "step": 5 }, { "epoch": 0.0012143290831815423, "grad_norm": 22.60654067993164, "learning_rate": 0.00016, "loss": 2.0251, "step": 6 }, { "epoch": 0.0014167172637117992, "grad_norm": 1.1628669500350952, "learning_rate": 0.0002, "loss": 1.7159, "step": 7 }, { "epoch": 0.0016191054442420564, "grad_norm": 0.7777935862541199, "learning_rate": 0.00019999999494152464, "loss": 1.6294, "step": 8 }, { "epoch": 0.0018214936247723133, "grad_norm": 0.9375593066215515, "learning_rate": 0.0001999999797660991, "loss": 1.5124, "step": 9 }, { "epoch": 0.0020238818053025702, "grad_norm": 0.7026050090789795, "learning_rate": 0.00019999995447372488, "loss": 1.462, "step": 10 }, { "epoch": 0.002226269985832827, "grad_norm": 0.6454700231552124, "learning_rate": 0.00019999991906440454, "loss": 1.4071, "step": 11 }, { "epoch": 0.0024286581663630845, "grad_norm": 0.7032003998756409, "learning_rate": 0.0001999998735381417, "loss": 1.3653, "step": 12 }, { "epoch": 0.0026310463468933415, "grad_norm": 0.5892922282218933, "learning_rate": 0.00019999981789494092, "loss": 1.435, "step": 13 }, { "epoch": 0.0028334345274235984, "grad_norm": 0.5047839283943176, "learning_rate": 
0.00019999975213480785, "loss": 1.354, "step": 14 }, { "epoch": 0.0030358227079538553, "grad_norm": 0.5063280463218689, "learning_rate": 0.00019999967625774917, "loss": 1.3813, "step": 15 }, { "epoch": 0.0032382108884841127, "grad_norm": 0.4734111726284027, "learning_rate": 0.00019999959026377253, "loss": 1.3379, "step": 16 }, { "epoch": 0.0034405990690143697, "grad_norm": 0.5824829339981079, "learning_rate": 0.0001999994941528866, "loss": 1.3633, "step": 17 }, { "epoch": 0.0036429872495446266, "grad_norm": 0.880375325679779, "learning_rate": 0.00019999938792510116, "loss": 1.3669, "step": 18 }, { "epoch": 0.0038453754300748835, "grad_norm": 0.7805163264274597, "learning_rate": 0.00019999927158042695, "loss": 1.311, "step": 19 }, { "epoch": 0.0040477636106051405, "grad_norm": 0.6171135902404785, "learning_rate": 0.00019999914511887568, "loss": 1.2861, "step": 20 }, { "epoch": 0.004250151791135397, "grad_norm": 0.837437093257904, "learning_rate": 0.00019999900854046022, "loss": 1.2161, "step": 21 }, { "epoch": 0.004452539971665654, "grad_norm": 0.6754611134529114, "learning_rate": 0.0001999988618451943, "loss": 1.2434, "step": 22 }, { "epoch": 0.004654928152195912, "grad_norm": 0.6884850263595581, "learning_rate": 0.0001999987050330929, "loss": 1.2502, "step": 23 }, { "epoch": 0.004857316332726169, "grad_norm": 0.7658131718635559, "learning_rate": 0.00019999853810417174, "loss": 1.2025, "step": 24 }, { "epoch": 0.005059704513256426, "grad_norm": 0.6967308521270752, "learning_rate": 0.00019999836105844777, "loss": 1.2064, "step": 25 }, { "epoch": 0.005262092693786683, "grad_norm": 0.7907235026359558, "learning_rate": 0.0001999981738959389, "loss": 1.1801, "step": 26 }, { "epoch": 0.00546448087431694, "grad_norm": 0.6149998903274536, "learning_rate": 0.00019999797661666407, "loss": 1.1459, "step": 27 }, { "epoch": 0.005666869054847197, "grad_norm": 0.6044988036155701, "learning_rate": 0.00019999776922064323, "loss": 1.1821, "step": 28 }, { "epoch": 
0.005869257235377454, "grad_norm": 0.687698245048523, "learning_rate": 0.00019999755170789735, "loss": 1.1662, "step": 29 }, { "epoch": 0.006071645415907711, "grad_norm": 0.5864324569702148, "learning_rate": 0.0001999973240784485, "loss": 1.1516, "step": 30 }, { "epoch": 0.006274033596437968, "grad_norm": 0.628131628036499, "learning_rate": 0.00019999708633231962, "loss": 1.1252, "step": 31 }, { "epoch": 0.0064764217769682254, "grad_norm": 0.5752449035644531, "learning_rate": 0.0001999968384695348, "loss": 1.0934, "step": 32 }, { "epoch": 0.006678809957498482, "grad_norm": 0.5566519498825073, "learning_rate": 0.00019999658049011916, "loss": 1.0986, "step": 33 }, { "epoch": 0.006881198138028739, "grad_norm": 0.5842620730400085, "learning_rate": 0.0001999963123940987, "loss": 1.1155, "step": 34 }, { "epoch": 0.007083586318558996, "grad_norm": 0.6081081628799438, "learning_rate": 0.00019999603418150065, "loss": 1.0845, "step": 35 }, { "epoch": 0.007285974499089253, "grad_norm": 0.6527591347694397, "learning_rate": 0.0001999957458523531, "loss": 1.0682, "step": 36 }, { "epoch": 0.00748836267961951, "grad_norm": 0.5430381894111633, "learning_rate": 0.0001999954474066852, "loss": 1.1304, "step": 37 }, { "epoch": 0.007690750860149767, "grad_norm": 0.6899943947792053, "learning_rate": 0.0001999951388445272, "loss": 1.0612, "step": 38 }, { "epoch": 0.007893139040680024, "grad_norm": 0.5707619190216064, "learning_rate": 0.00019999482016591028, "loss": 1.1377, "step": 39 }, { "epoch": 0.008095527221210281, "grad_norm": 0.583000898361206, "learning_rate": 0.00019999449137086668, "loss": 1.1237, "step": 40 }, { "epoch": 0.008297915401740538, "grad_norm": 0.5590953826904297, "learning_rate": 0.00019999415245942968, "loss": 1.0657, "step": 41 }, { "epoch": 0.008500303582270795, "grad_norm": 0.5293363928794861, "learning_rate": 0.00019999380343163354, "loss": 1.1134, "step": 42 }, { "epoch": 0.008702691762801052, "grad_norm": 0.5645998120307922, "learning_rate": 
0.0001999934442875136, "loss": 1.0908, "step": 43 }, { "epoch": 0.008905079943331309, "grad_norm": 0.5520191192626953, "learning_rate": 0.0001999930750271062, "loss": 1.0689, "step": 44 }, { "epoch": 0.009107468123861567, "grad_norm": 0.594061017036438, "learning_rate": 0.0001999926956504487, "loss": 1.1155, "step": 45 }, { "epoch": 0.009309856304391824, "grad_norm": 0.5787252187728882, "learning_rate": 0.00019999230615757942, "loss": 1.0742, "step": 46 }, { "epoch": 0.009512244484922081, "grad_norm": 0.5889455676078796, "learning_rate": 0.00019999190654853785, "loss": 1.0397, "step": 47 }, { "epoch": 0.009714632665452338, "grad_norm": 0.5654120445251465, "learning_rate": 0.00019999149682336435, "loss": 1.0794, "step": 48 }, { "epoch": 0.009917020845982595, "grad_norm": 0.6551898121833801, "learning_rate": 0.0001999910769821004, "loss": 1.0397, "step": 49 }, { "epoch": 0.010119409026512852, "grad_norm": 0.5888538360595703, "learning_rate": 0.00019999064702478853, "loss": 1.0434, "step": 50 }, { "epoch": 0.010119409026512852, "eval_loss": 1.1003731489181519, "eval_runtime": 0.7794, "eval_samples_per_second": 6.415, "eval_steps_per_second": 1.283, "step": 50 }, { "epoch": 0.010321797207043109, "grad_norm": 0.581838846206665, "learning_rate": 0.00019999020695147214, "loss": 0.9748, "step": 51 }, { "epoch": 0.010524185387573366, "grad_norm": 0.5969087481498718, "learning_rate": 0.00019998975676219582, "loss": 1.0262, "step": 52 }, { "epoch": 0.010726573568103623, "grad_norm": 0.691875696182251, "learning_rate": 0.00019998929645700505, "loss": 0.9956, "step": 53 }, { "epoch": 0.01092896174863388, "grad_norm": 0.522814929485321, "learning_rate": 0.00019998882603594647, "loss": 1.0366, "step": 54 }, { "epoch": 0.011131349929164137, "grad_norm": 0.5945473313331604, "learning_rate": 0.00019998834549906765, "loss": 0.9809, "step": 55 }, { "epoch": 0.011333738109694394, "grad_norm": 0.4513246715068817, "learning_rate": 0.00019998785484641717, "loss": 1.0609, "step": 56 }, { 
"epoch": 0.01153612629022465, "grad_norm": 0.5156210064888, "learning_rate": 0.0001999873540780447, "loss": 1.0375, "step": 57 }, { "epoch": 0.011738514470754908, "grad_norm": 0.5518094897270203, "learning_rate": 0.00019998684319400093, "loss": 1.1117, "step": 58 }, { "epoch": 0.011940902651285164, "grad_norm": 0.5378175973892212, "learning_rate": 0.00019998632219433749, "loss": 1.0078, "step": 59 }, { "epoch": 0.012143290831815421, "grad_norm": 0.5768158435821533, "learning_rate": 0.00019998579107910713, "loss": 1.1274, "step": 60 }, { "epoch": 0.012345679012345678, "grad_norm": 0.5991278290748596, "learning_rate": 0.00019998524984836356, "loss": 1.018, "step": 61 }, { "epoch": 0.012548067192875935, "grad_norm": 0.5535243153572083, "learning_rate": 0.00019998469850216152, "loss": 1.0184, "step": 62 }, { "epoch": 0.012750455373406194, "grad_norm": 0.5592597723007202, "learning_rate": 0.00019998413704055686, "loss": 1.0218, "step": 63 }, { "epoch": 0.012952843553936451, "grad_norm": 0.5260300040245056, "learning_rate": 0.0001999835654636063, "loss": 1.0673, "step": 64 }, { "epoch": 0.013155231734466708, "grad_norm": 0.5915527939796448, "learning_rate": 0.00019998298377136772, "loss": 0.9527, "step": 65 }, { "epoch": 0.013357619914996965, "grad_norm": 0.6970841288566589, "learning_rate": 0.00019998239196389995, "loss": 1.0103, "step": 66 }, { "epoch": 0.013560008095527222, "grad_norm": 0.5182502865791321, "learning_rate": 0.00019998179004126286, "loss": 0.9288, "step": 67 }, { "epoch": 0.013762396276057479, "grad_norm": 0.8228877186775208, "learning_rate": 0.00019998117800351734, "loss": 0.9993, "step": 68 }, { "epoch": 0.013964784456587736, "grad_norm": 0.5726291537284851, "learning_rate": 0.00019998055585072533, "loss": 0.9788, "step": 69 }, { "epoch": 0.014167172637117992, "grad_norm": 0.8187235593795776, "learning_rate": 0.00019997992358294976, "loss": 1.004, "step": 70 }, { "epoch": 0.01436956081764825, "grad_norm": 0.6061872839927673, "learning_rate": 
0.00019997928120025463, "loss": 1.0284, "step": 71 }, { "epoch": 0.014571948998178506, "grad_norm": 0.7122519612312317, "learning_rate": 0.00019997862870270488, "loss": 1.0282, "step": 72 }, { "epoch": 0.014774337178708763, "grad_norm": 0.5993272066116333, "learning_rate": 0.0001999779660903665, "loss": 1.0159, "step": 73 }, { "epoch": 0.01497672535923902, "grad_norm": 0.6778062582015991, "learning_rate": 0.00019997729336330663, "loss": 0.9764, "step": 74 }, { "epoch": 0.015179113539769277, "grad_norm": 0.5996508002281189, "learning_rate": 0.00019997661052159323, "loss": 0.9797, "step": 75 }, { "epoch": 0.015381501720299534, "grad_norm": 0.6299217343330383, "learning_rate": 0.00019997591756529541, "loss": 0.9712, "step": 76 }, { "epoch": 0.015583889900829791, "grad_norm": 0.7457549571990967, "learning_rate": 0.00019997521449448331, "loss": 1.0071, "step": 77 }, { "epoch": 0.015786278081360048, "grad_norm": 0.6136026382446289, "learning_rate": 0.00019997450130922802, "loss": 0.9881, "step": 78 }, { "epoch": 0.015988666261890307, "grad_norm": 0.6008905172348022, "learning_rate": 0.00019997377800960172, "loss": 0.9617, "step": 79 }, { "epoch": 0.016191054442420562, "grad_norm": 0.5873702764511108, "learning_rate": 0.0001999730445956776, "loss": 0.9714, "step": 80 }, { "epoch": 0.01639344262295082, "grad_norm": 0.5964879989624023, "learning_rate": 0.0001999723010675298, "loss": 1.0452, "step": 81 }, { "epoch": 0.016595830803481076, "grad_norm": 0.6384466886520386, "learning_rate": 0.00019997154742523358, "loss": 0.9848, "step": 82 }, { "epoch": 0.016798218984011334, "grad_norm": 0.5400208234786987, "learning_rate": 0.00019997078366886518, "loss": 0.9715, "step": 83 }, { "epoch": 0.01700060716454159, "grad_norm": 0.5620110630989075, "learning_rate": 0.00019997000979850188, "loss": 0.9445, "step": 84 }, { "epoch": 0.01720299534507185, "grad_norm": 0.7637978792190552, "learning_rate": 0.00019996922581422196, "loss": 1.0477, "step": 85 }, { "epoch": 0.017405383525602103, 
"grad_norm": 0.5261138677597046, "learning_rate": 0.0001999684317161047, "loss": 0.9975, "step": 86 }, { "epoch": 0.017607771706132362, "grad_norm": 0.6341800689697266, "learning_rate": 0.00019996762750423052, "loss": 0.9253, "step": 87 }, { "epoch": 0.017810159886662617, "grad_norm": 0.5864059329032898, "learning_rate": 0.0001999668131786807, "loss": 0.9491, "step": 88 }, { "epoch": 0.018012548067192876, "grad_norm": 0.5787160992622375, "learning_rate": 0.0001999659887395377, "loss": 0.8889, "step": 89 }, { "epoch": 0.018214936247723135, "grad_norm": 0.6294256448745728, "learning_rate": 0.0001999651541868849, "loss": 0.9698, "step": 90 }, { "epoch": 0.01841732442825339, "grad_norm": 0.5774893760681152, "learning_rate": 0.0001999643095208067, "loss": 0.9502, "step": 91 }, { "epoch": 0.01861971260878365, "grad_norm": 0.6308877468109131, "learning_rate": 0.00019996345474138858, "loss": 0.9221, "step": 92 }, { "epoch": 0.018822100789313904, "grad_norm": 0.6650766730308533, "learning_rate": 0.000199962589848717, "loss": 1.0439, "step": 93 }, { "epoch": 0.019024488969844162, "grad_norm": 0.5616105198860168, "learning_rate": 0.0001999617148428795, "loss": 0.9611, "step": 94 }, { "epoch": 0.019226877150374418, "grad_norm": 0.6718441843986511, "learning_rate": 0.00019996082972396456, "loss": 0.832, "step": 95 }, { "epoch": 0.019429265330904676, "grad_norm": 0.6764196157455444, "learning_rate": 0.00019995993449206174, "loss": 0.9261, "step": 96 }, { "epoch": 0.01963165351143493, "grad_norm": 0.6313098073005676, "learning_rate": 0.0001999590291472616, "loss": 0.9441, "step": 97 }, { "epoch": 0.01983404169196519, "grad_norm": 0.5967774987220764, "learning_rate": 0.00019995811368965578, "loss": 0.9431, "step": 98 }, { "epoch": 0.020036429872495445, "grad_norm": 0.6203646063804626, "learning_rate": 0.00019995718811933685, "loss": 0.9719, "step": 99 }, { "epoch": 0.020238818053025704, "grad_norm": 0.5203957557678223, "learning_rate": 0.0001999562524363985, "loss": 0.9755, 
"step": 100 }, { "epoch": 0.020238818053025704, "eval_loss": 0.9987107515335083, "eval_runtime": 0.7376, "eval_samples_per_second": 6.779, "eval_steps_per_second": 1.356, "step": 100 }, { "epoch": 0.02044120623355596, "grad_norm": 0.5949529409408569, "learning_rate": 0.00019995530664093533, "loss": 0.9831, "step": 101 }, { "epoch": 0.020643594414086218, "grad_norm": 0.6670980453491211, "learning_rate": 0.00019995435073304305, "loss": 0.8731, "step": 102 }, { "epoch": 0.020845982594616473, "grad_norm": 0.6265388131141663, "learning_rate": 0.00019995338471281838, "loss": 0.9228, "step": 103 }, { "epoch": 0.021048370775146732, "grad_norm": 0.6850742697715759, "learning_rate": 0.000199952408580359, "loss": 1.0626, "step": 104 }, { "epoch": 0.021250758955676987, "grad_norm": 0.6624189019203186, "learning_rate": 0.00019995142233576377, "loss": 1.0332, "step": 105 }, { "epoch": 0.021453147136207246, "grad_norm": 0.6240122318267822, "learning_rate": 0.0001999504259791324, "loss": 0.9175, "step": 106 }, { "epoch": 0.0216555353167375, "grad_norm": 0.6845401525497437, "learning_rate": 0.00019994941951056568, "loss": 0.9271, "step": 107 }, { "epoch": 0.02185792349726776, "grad_norm": 0.5907098054885864, "learning_rate": 0.00019994840293016545, "loss": 0.8607, "step": 108 }, { "epoch": 0.022060311677798018, "grad_norm": 0.7503966093063354, "learning_rate": 0.00019994737623803456, "loss": 0.9726, "step": 109 }, { "epoch": 0.022262699858328273, "grad_norm": 0.677544891834259, "learning_rate": 0.00019994633943427688, "loss": 1.004, "step": 110 }, { "epoch": 0.022465088038858532, "grad_norm": 0.6157703399658203, "learning_rate": 0.0001999452925189973, "loss": 0.9558, "step": 111 }, { "epoch": 0.022667476219388787, "grad_norm": 0.5559118986129761, "learning_rate": 0.00019994423549230173, "loss": 0.981, "step": 112 }, { "epoch": 0.022869864399919046, "grad_norm": 0.5934160947799683, "learning_rate": 0.00019994316835429714, "loss": 0.8549, "step": 113 }, { "epoch": 0.0230722525804493, 
"grad_norm": 0.5715833902359009, "learning_rate": 0.00019994209110509145, "loss": 0.9944, "step": 114 }, { "epoch": 0.02327464076097956, "grad_norm": 0.5714934468269348, "learning_rate": 0.00019994100374479365, "loss": 0.91, "step": 115 }, { "epoch": 0.023477028941509815, "grad_norm": 0.613496720790863, "learning_rate": 0.0001999399062735138, "loss": 0.8761, "step": 116 }, { "epoch": 0.023679417122040074, "grad_norm": 0.6440281271934509, "learning_rate": 0.00019993879869136284, "loss": 0.9539, "step": 117 }, { "epoch": 0.02388180530257033, "grad_norm": 0.6116454601287842, "learning_rate": 0.0001999376809984529, "loss": 0.9385, "step": 118 }, { "epoch": 0.024084193483100588, "grad_norm": 0.6269987225532532, "learning_rate": 0.00019993655319489704, "loss": 0.9607, "step": 119 }, { "epoch": 0.024286581663630843, "grad_norm": 0.6190333366394043, "learning_rate": 0.00019993541528080932, "loss": 0.9325, "step": 120 }, { "epoch": 0.0244889698441611, "grad_norm": 0.5847398042678833, "learning_rate": 0.00019993426725630492, "loss": 0.8744, "step": 121 }, { "epoch": 0.024691358024691357, "grad_norm": 0.614730715751648, "learning_rate": 0.00019993310912149996, "loss": 0.965, "step": 122 }, { "epoch": 0.024893746205221615, "grad_norm": 0.6264632940292358, "learning_rate": 0.00019993194087651158, "loss": 0.8992, "step": 123 }, { "epoch": 0.02509613438575187, "grad_norm": 0.6432466506958008, "learning_rate": 0.00019993076252145802, "loss": 0.9558, "step": 124 }, { "epoch": 0.02529852256628213, "grad_norm": 0.6524417996406555, "learning_rate": 0.00019992957405645846, "loss": 0.9221, "step": 125 }, { "epoch": 0.025500910746812388, "grad_norm": 0.6003560423851013, "learning_rate": 0.00019992837548163316, "loss": 0.9247, "step": 126 }, { "epoch": 0.025703298927342643, "grad_norm": 0.5921458601951599, "learning_rate": 0.00019992716679710334, "loss": 0.963, "step": 127 }, { "epoch": 0.025905687107872902, "grad_norm": 0.5762323141098022, "learning_rate": 0.00019992594800299131, "loss": 
0.9888, "step": 128 }, { "epoch": 0.026108075288403157, "grad_norm": 0.5996343493461609, "learning_rate": 0.00019992471909942042, "loss": 0.8842, "step": 129 }, { "epoch": 0.026310463468933416, "grad_norm": 0.6580969095230103, "learning_rate": 0.00019992348008651488, "loss": 0.9383, "step": 130 }, { "epoch": 0.02651285164946367, "grad_norm": 0.646142303943634, "learning_rate": 0.00019992223096440014, "loss": 0.9079, "step": 131 }, { "epoch": 0.02671523982999393, "grad_norm": 0.6356844305992126, "learning_rate": 0.00019992097173320255, "loss": 0.9245, "step": 132 }, { "epoch": 0.026917628010524185, "grad_norm": 0.5754806995391846, "learning_rate": 0.0001999197023930495, "loss": 0.9361, "step": 133 }, { "epoch": 0.027120016191054443, "grad_norm": 0.6589515805244446, "learning_rate": 0.0001999184229440694, "loss": 0.8845, "step": 134 }, { "epoch": 0.0273224043715847, "grad_norm": 0.555176854133606, "learning_rate": 0.0001999171333863917, "loss": 0.9382, "step": 135 }, { "epoch": 0.027524792552114957, "grad_norm": 0.6286085844039917, "learning_rate": 0.00019991583372014687, "loss": 0.9328, "step": 136 }, { "epoch": 0.027727180732645212, "grad_norm": 0.6221725344657898, "learning_rate": 0.00019991452394546637, "loss": 0.9229, "step": 137 }, { "epoch": 0.02792956891317547, "grad_norm": 0.5152042508125305, "learning_rate": 0.00019991320406248275, "loss": 0.9372, "step": 138 }, { "epoch": 0.028131957093705726, "grad_norm": 0.602803647518158, "learning_rate": 0.0001999118740713295, "loss": 0.8926, "step": 139 }, { "epoch": 0.028334345274235985, "grad_norm": 0.5117329955101013, "learning_rate": 0.00019991053397214122, "loss": 1.0066, "step": 140 }, { "epoch": 0.02853673345476624, "grad_norm": 0.5424915552139282, "learning_rate": 0.00019990918376505343, "loss": 0.9543, "step": 141 }, { "epoch": 0.0287391216352965, "grad_norm": 0.5950594544410706, "learning_rate": 0.00019990782345020275, "loss": 0.8684, "step": 142 }, { "epoch": 0.028941509815826754, "grad_norm": 
0.7003294825553894, "learning_rate": 0.00019990645302772687, "loss": 0.8938, "step": 143 }, { "epoch": 0.029143897996357013, "grad_norm": 0.6386146545410156, "learning_rate": 0.00019990507249776433, "loss": 0.9564, "step": 144 }, { "epoch": 0.02934628617688727, "grad_norm": 0.597125768661499, "learning_rate": 0.0001999036818604549, "loss": 0.9712, "step": 145 }, { "epoch": 0.029548674357417527, "grad_norm": 0.6054093837738037, "learning_rate": 0.00019990228111593919, "loss": 0.9828, "step": 146 }, { "epoch": 0.029751062537947785, "grad_norm": 0.6329893469810486, "learning_rate": 0.0001999008702643589, "loss": 0.94, "step": 147 }, { "epoch": 0.02995345071847804, "grad_norm": 0.7184765934944153, "learning_rate": 0.00019989944930585683, "loss": 0.9074, "step": 148 }, { "epoch": 0.0301558388990083, "grad_norm": 0.6599225401878357, "learning_rate": 0.00019989801824057675, "loss": 0.8902, "step": 149 }, { "epoch": 0.030358227079538554, "grad_norm": 0.7230977416038513, "learning_rate": 0.0001998965770686634, "loss": 0.8653, "step": 150 }, { "epoch": 0.030358227079538554, "eval_loss": 0.9306267499923706, "eval_runtime": 0.7367, "eval_samples_per_second": 6.787, "eval_steps_per_second": 1.357, "step": 150 }, { "epoch": 0.030560615260068813, "grad_norm": 0.7149731516838074, "learning_rate": 0.00019989512579026252, "loss": 0.9479, "step": 151 }, { "epoch": 0.030763003440599068, "grad_norm": 0.5758787989616394, "learning_rate": 0.00019989366440552103, "loss": 0.9579, "step": 152 }, { "epoch": 0.030965391621129327, "grad_norm": 0.6900405287742615, "learning_rate": 0.00019989219291458677, "loss": 0.8796, "step": 153 }, { "epoch": 0.031167779801659582, "grad_norm": 0.6407442688941956, "learning_rate": 0.0001998907113176086, "loss": 0.8888, "step": 154 }, { "epoch": 0.03137016798218984, "grad_norm": 0.5915456414222717, "learning_rate": 0.00019988921961473633, "loss": 0.9336, "step": 155 }, { "epoch": 0.031572556162720096, "grad_norm": 0.6366065740585327, "learning_rate": 
0.000199887717806121, "loss": 0.9005, "step": 156 }, { "epoch": 0.03177494434325035, "grad_norm": 0.7416878342628479, "learning_rate": 0.0001998862058919145, "loss": 0.9671, "step": 157 }, { "epoch": 0.03197733252378061, "grad_norm": 0.6920793652534485, "learning_rate": 0.00019988468387226974, "loss": 0.9283, "step": 158 }, { "epoch": 0.03217972070431087, "grad_norm": 0.577690064907074, "learning_rate": 0.00019988315174734078, "loss": 0.932, "step": 159 }, { "epoch": 0.032382108884841124, "grad_norm": 0.5838318467140198, "learning_rate": 0.0001998816095172826, "loss": 0.8758, "step": 160 }, { "epoch": 0.03258449706537138, "grad_norm": 0.6302077174186707, "learning_rate": 0.00019988005718225117, "loss": 0.8934, "step": 161 }, { "epoch": 0.03278688524590164, "grad_norm": 0.5870795845985413, "learning_rate": 0.0001998784947424036, "loss": 0.8459, "step": 162 }, { "epoch": 0.032989273426431896, "grad_norm": 0.6889308094978333, "learning_rate": 0.00019987692219789794, "loss": 0.8667, "step": 163 }, { "epoch": 0.03319166160696215, "grad_norm": 0.6996739506721497, "learning_rate": 0.0001998753395488933, "loss": 0.8623, "step": 164 }, { "epoch": 0.033394049787492414, "grad_norm": 0.8726261854171753, "learning_rate": 0.00019987374679554979, "loss": 0.905, "step": 165 }, { "epoch": 0.03359643796802267, "grad_norm": 0.597016453742981, "learning_rate": 0.00019987214393802854, "loss": 0.8773, "step": 166 }, { "epoch": 0.033798826148552924, "grad_norm": 0.8725893497467041, "learning_rate": 0.00019987053097649172, "loss": 0.8896, "step": 167 }, { "epoch": 0.03400121432908318, "grad_norm": 0.6847428679466248, "learning_rate": 0.0001998689079111025, "loss": 0.8226, "step": 168 }, { "epoch": 0.03420360250961344, "grad_norm": 0.613000750541687, "learning_rate": 0.00019986727474202506, "loss": 0.8768, "step": 169 }, { "epoch": 0.0344059906901437, "grad_norm": 0.7318368554115295, "learning_rate": 0.00019986563146942468, "loss": 0.9174, "step": 170 }, { "epoch": 0.03460837887067395, 
"grad_norm": 0.6846932172775269, "learning_rate": 0.0001998639780934676, "loss": 0.8588, "step": 171 }, { "epoch": 0.03481076705120421, "grad_norm": 0.6385796666145325, "learning_rate": 0.00019986231461432106, "loss": 0.9034, "step": 172 }, { "epoch": 0.03501315523173447, "grad_norm": 0.6725485324859619, "learning_rate": 0.00019986064103215339, "loss": 0.8191, "step": 173 }, { "epoch": 0.035215543412264724, "grad_norm": 0.6478608846664429, "learning_rate": 0.00019985895734713386, "loss": 0.8056, "step": 174 }, { "epoch": 0.03541793159279498, "grad_norm": 0.6572886109352112, "learning_rate": 0.00019985726355943283, "loss": 0.8324, "step": 175 }, { "epoch": 0.035620319773325235, "grad_norm": 0.7601284980773926, "learning_rate": 0.00019985555966922167, "loss": 0.8534, "step": 176 }, { "epoch": 0.0358227079538555, "grad_norm": 0.7233675122261047, "learning_rate": 0.00019985384567667279, "loss": 0.8744, "step": 177 }, { "epoch": 0.03602509613438575, "grad_norm": 0.731354296207428, "learning_rate": 0.00019985212158195952, "loss": 0.7982, "step": 178 }, { "epoch": 0.03622748431491601, "grad_norm": 0.7109301090240479, "learning_rate": 0.00019985038738525634, "loss": 0.7729, "step": 179 }, { "epoch": 0.03642987249544627, "grad_norm": 0.7613882422447205, "learning_rate": 0.00019984864308673867, "loss": 0.8453, "step": 180 }, { "epoch": 0.036632260675976525, "grad_norm": 0.8680360317230225, "learning_rate": 0.000199846888686583, "loss": 0.788, "step": 181 }, { "epoch": 0.03683464885650678, "grad_norm": 1.017104983329773, "learning_rate": 0.00019984512418496682, "loss": 0.8349, "step": 182 }, { "epoch": 0.037037037037037035, "grad_norm": 0.6914694905281067, "learning_rate": 0.00019984334958206862, "loss": 0.8457, "step": 183 }, { "epoch": 0.0372394252175673, "grad_norm": 0.8141409158706665, "learning_rate": 0.00019984156487806799, "loss": 0.7979, "step": 184 }, { "epoch": 0.03744181339809755, "grad_norm": 0.840029239654541, "learning_rate": 0.00019983977007314544, "loss": 
0.8428, "step": 185 }, { "epoch": 0.03764420157862781, "grad_norm": 0.8123815655708313, "learning_rate": 0.00019983796516748252, "loss": 0.7008, "step": 186 }, { "epoch": 0.03784658975915806, "grad_norm": 0.8500047326087952, "learning_rate": 0.00019983615016126193, "loss": 0.7789, "step": 187 }, { "epoch": 0.038048977939688325, "grad_norm": 0.809622585773468, "learning_rate": 0.00019983432505466718, "loss": 0.7469, "step": 188 }, { "epoch": 0.03825136612021858, "grad_norm": 0.7599986791610718, "learning_rate": 0.00019983248984788303, "loss": 0.8498, "step": 189 }, { "epoch": 0.038453754300748835, "grad_norm": 1.0785133838653564, "learning_rate": 0.00019983064454109505, "loss": 0.7924, "step": 190 }, { "epoch": 0.03865614248127909, "grad_norm": 1.275370717048645, "learning_rate": 0.00019982878913448997, "loss": 0.8975, "step": 191 }, { "epoch": 0.03885853066180935, "grad_norm": 1.2063267230987549, "learning_rate": 0.0001998269236282555, "loss": 0.7749, "step": 192 }, { "epoch": 0.03906091884233961, "grad_norm": 1.0798983573913574, "learning_rate": 0.00019982504802258037, "loss": 0.8307, "step": 193 }, { "epoch": 0.03926330702286986, "grad_norm": 1.2071725130081177, "learning_rate": 0.00019982316231765431, "loss": 0.8576, "step": 194 }, { "epoch": 0.03946569520340012, "grad_norm": 0.8345642685890198, "learning_rate": 0.00019982126651366816, "loss": 0.8485, "step": 195 }, { "epoch": 0.03966808338393038, "grad_norm": 0.9822136759757996, "learning_rate": 0.00019981936061081365, "loss": 0.8924, "step": 196 }, { "epoch": 0.039870471564460636, "grad_norm": 1.0564842224121094, "learning_rate": 0.0001998174446092836, "loss": 0.9139, "step": 197 }, { "epoch": 0.04007285974499089, "grad_norm": 0.8812574148178101, "learning_rate": 0.00019981551850927195, "loss": 0.7048, "step": 198 }, { "epoch": 0.04027524792552115, "grad_norm": 1.013770580291748, "learning_rate": 0.00019981358231097344, "loss": 0.8904, "step": 199 }, { "epoch": 0.04047763610605141, "grad_norm": 
1.1347284317016602, "learning_rate": 0.00019981163601458403, "loss": 0.7091, "step": 200 }, { "epoch": 0.04047763610605141, "eval_loss": 0.7794874906539917, "eval_runtime": 0.7382, "eval_samples_per_second": 6.773, "eval_steps_per_second": 1.355, "step": 200 }, { "epoch": 0.04068002428658166, "grad_norm": 0.9756168127059937, "learning_rate": 0.00019980967962030056, "loss": 0.7894, "step": 201 }, { "epoch": 0.04088241246711192, "grad_norm": 1.3444828987121582, "learning_rate": 0.00019980771312832105, "loss": 0.8338, "step": 202 }, { "epoch": 0.04108480064764218, "grad_norm": 1.014088749885559, "learning_rate": 0.00019980573653884435, "loss": 0.7022, "step": 203 }, { "epoch": 0.041287188828172436, "grad_norm": 1.1845611333847046, "learning_rate": 0.0001998037498520705, "loss": 0.7177, "step": 204 }, { "epoch": 0.04148957700870269, "grad_norm": 1.521763801574707, "learning_rate": 0.00019980175306820046, "loss": 0.8397, "step": 205 }, { "epoch": 0.041691965189232946, "grad_norm": 1.1498165130615234, "learning_rate": 0.00019979974618743626, "loss": 0.8348, "step": 206 }, { "epoch": 0.04189435336976321, "grad_norm": 1.7773405313491821, "learning_rate": 0.00019979772920998093, "loss": 0.7595, "step": 207 }, { "epoch": 0.042096741550293464, "grad_norm": 1.1249701976776123, "learning_rate": 0.0001997957021360385, "loss": 0.747, "step": 208 }, { "epoch": 0.04229912973082372, "grad_norm": 1.0314749479293823, "learning_rate": 0.00019979366496581408, "loss": 0.7159, "step": 209 }, { "epoch": 0.042501517911353974, "grad_norm": 1.208742618560791, "learning_rate": 0.00019979161769951377, "loss": 0.5837, "step": 210 }, { "epoch": 0.042703906091884236, "grad_norm": 1.2238479852676392, "learning_rate": 0.00019978956033734471, "loss": 0.6674, "step": 211 }, { "epoch": 0.04290629427241449, "grad_norm": 1.3177680969238281, "learning_rate": 0.00019978749287951497, "loss": 0.6537, "step": 212 }, { "epoch": 0.04310868245294475, "grad_norm": 1.5105770826339722, "learning_rate": 
0.00019978541532623379, "loss": 0.7395, "step": 213 }, { "epoch": 0.043311070633475, "grad_norm": 1.1323720216751099, "learning_rate": 0.0001997833276777113, "loss": 0.6217, "step": 214 }, { "epoch": 0.043513458814005264, "grad_norm": 1.1617493629455566, "learning_rate": 0.00019978122993415874, "loss": 0.6077, "step": 215 }, { "epoch": 0.04371584699453552, "grad_norm": 1.0218966007232666, "learning_rate": 0.00019977912209578834, "loss": 0.5779, "step": 216 }, { "epoch": 0.043918235175065774, "grad_norm": 1.3140915632247925, "learning_rate": 0.00019977700416281332, "loss": 0.5813, "step": 217 }, { "epoch": 0.044120623355596036, "grad_norm": 1.1791021823883057, "learning_rate": 0.00019977487613544797, "loss": 0.5821, "step": 218 }, { "epoch": 0.04432301153612629, "grad_norm": 1.0278735160827637, "learning_rate": 0.00019977273801390758, "loss": 0.5312, "step": 219 }, { "epoch": 0.04452539971665655, "grad_norm": 1.026606559753418, "learning_rate": 0.00019977058979840848, "loss": 0.4905, "step": 220 }, { "epoch": 0.0447277878971868, "grad_norm": 1.115782380104065, "learning_rate": 0.00019976843148916795, "loss": 0.567, "step": 221 }, { "epoch": 0.044930176077717064, "grad_norm": 0.9572044610977173, "learning_rate": 0.0001997662630864044, "loss": 0.4892, "step": 222 }, { "epoch": 0.04513256425824732, "grad_norm": 1.084818959236145, "learning_rate": 0.0001997640845903372, "loss": 0.5356, "step": 223 }, { "epoch": 0.045334952438777575, "grad_norm": 0.996979832649231, "learning_rate": 0.0001997618960011867, "loss": 0.5158, "step": 224 }, { "epoch": 0.04553734061930783, "grad_norm": 0.9665465354919434, "learning_rate": 0.0001997596973191744, "loss": 0.4367, "step": 225 }, { "epoch": 0.04573972879983809, "grad_norm": 1.0468083620071411, "learning_rate": 0.00019975748854452263, "loss": 0.4481, "step": 226 }, { "epoch": 0.04594211698036835, "grad_norm": 0.9373990297317505, "learning_rate": 0.00019975526967745496, "loss": 0.4282, "step": 227 }, { "epoch": 0.0461445051608986, 
"grad_norm": 0.7844080328941345, "learning_rate": 0.0001997530407181958, "loss": 0.4338, "step": 228 }, { "epoch": 0.04634689334142886, "grad_norm": 0.8212091326713562, "learning_rate": 0.00019975080166697068, "loss": 0.4436, "step": 229 }, { "epoch": 0.04654928152195912, "grad_norm": 0.8424368500709534, "learning_rate": 0.00019974855252400615, "loss": 0.419, "step": 230 }, { "epoch": 0.046751669702489375, "grad_norm": 0.8820065855979919, "learning_rate": 0.00019974629328952967, "loss": 0.3786, "step": 231 }, { "epoch": 0.04695405788301963, "grad_norm": 0.8870169520378113, "learning_rate": 0.00019974402396376992, "loss": 0.3783, "step": 232 }, { "epoch": 0.047156446063549885, "grad_norm": 0.9616879820823669, "learning_rate": 0.0001997417445469564, "loss": 0.5076, "step": 233 }, { "epoch": 0.04735883424408015, "grad_norm": 1.1007381677627563, "learning_rate": 0.00019973945503931972, "loss": 0.4013, "step": 234 }, { "epoch": 0.0475612224246104, "grad_norm": 0.9436114430427551, "learning_rate": 0.00019973715544109157, "loss": 0.3849, "step": 235 }, { "epoch": 0.04776361060514066, "grad_norm": 0.9205026626586914, "learning_rate": 0.00019973484575250457, "loss": 0.4035, "step": 236 }, { "epoch": 0.04796599878567092, "grad_norm": 0.8011429905891418, "learning_rate": 0.00019973252597379234, "loss": 0.4059, "step": 237 }, { "epoch": 0.048168386966201175, "grad_norm": 0.8591095805168152, "learning_rate": 0.00019973019610518966, "loss": 0.3347, "step": 238 }, { "epoch": 0.04837077514673143, "grad_norm": 1.0715107917785645, "learning_rate": 0.00019972785614693215, "loss": 0.3903, "step": 239 }, { "epoch": 0.048573163327261686, "grad_norm": 0.7088459134101868, "learning_rate": 0.00019972550609925662, "loss": 0.3701, "step": 240 }, { "epoch": 0.04877555150779195, "grad_norm": 1.190577507019043, "learning_rate": 0.00019972314596240076, "loss": 0.4032, "step": 241 }, { "epoch": 0.0489779396883222, "grad_norm": 0.8017274141311646, "learning_rate": 0.00019972077573660342, "loss": 
0.3818, "step": 242 }, { "epoch": 0.04918032786885246, "grad_norm": 0.844630241394043, "learning_rate": 0.00019971839542210434, "loss": 0.3827, "step": 243 }, { "epoch": 0.04938271604938271, "grad_norm": 1.1047987937927246, "learning_rate": 0.00019971600501914432, "loss": 0.3752, "step": 244 }, { "epoch": 0.049585104229912975, "grad_norm": 0.7282317876815796, "learning_rate": 0.00019971360452796522, "loss": 0.3373, "step": 245 }, { "epoch": 0.04978749241044323, "grad_norm": 0.8181321620941162, "learning_rate": 0.00019971119394880988, "loss": 0.3282, "step": 246 }, { "epoch": 0.049989880590973486, "grad_norm": 0.9017817378044128, "learning_rate": 0.00019970877328192224, "loss": 0.3949, "step": 247 }, { "epoch": 0.05019226877150374, "grad_norm": 0.5751867890357971, "learning_rate": 0.0001997063425275471, "loss": 0.3443, "step": 248 }, { "epoch": 0.050394656952034, "grad_norm": 0.5587199926376343, "learning_rate": 0.0001997039016859305, "loss": 0.3545, "step": 249 }, { "epoch": 0.05059704513256426, "grad_norm": 0.6850367188453674, "learning_rate": 0.00019970145075731926, "loss": 0.3835, "step": 250 }, { "epoch": 0.05059704513256426, "eval_loss": 0.39252138137817383, "eval_runtime": 0.7379, "eval_samples_per_second": 6.776, "eval_steps_per_second": 1.355, "step": 250 }, { "epoch": 0.050799433313094514, "grad_norm": 0.706892728805542, "learning_rate": 0.0001996989897419614, "loss": 0.3413, "step": 251 }, { "epoch": 0.051001821493624776, "grad_norm": 0.6600444912910461, "learning_rate": 0.00019969651864010587, "loss": 0.3494, "step": 252 }, { "epoch": 0.05120420967415503, "grad_norm": 0.6261760592460632, "learning_rate": 0.0001996940374520027, "loss": 0.3519, "step": 253 }, { "epoch": 0.051406597854685286, "grad_norm": 0.932579517364502, "learning_rate": 0.00019969154617790292, "loss": 0.4178, "step": 254 }, { "epoch": 0.05160898603521554, "grad_norm": 0.5545374751091003, "learning_rate": 0.00019968904481805852, "loss": 0.3525, "step": 255 }, { "epoch": 
0.051811374215745803, "grad_norm": 0.6660155057907104, "learning_rate": 0.00019968653337272261, "loss": 0.3532, "step": 256 }, { "epoch": 0.05201376239627606, "grad_norm": 0.576330304145813, "learning_rate": 0.00019968401184214924, "loss": 0.3391, "step": 257 }, { "epoch": 0.052216150576806314, "grad_norm": 0.7081141471862793, "learning_rate": 0.00019968148022659352, "loss": 0.3385, "step": 258 }, { "epoch": 0.05241853875733657, "grad_norm": 0.687074601650238, "learning_rate": 0.00019967893852631158, "loss": 0.3058, "step": 259 }, { "epoch": 0.05262092693786683, "grad_norm": 0.5989205241203308, "learning_rate": 0.00019967638674156057, "loss": 0.3554, "step": 260 }, { "epoch": 0.052823315118397086, "grad_norm": 0.6394159197807312, "learning_rate": 0.00019967382487259865, "loss": 0.3622, "step": 261 }, { "epoch": 0.05302570329892734, "grad_norm": 0.6000388264656067, "learning_rate": 0.00019967125291968496, "loss": 0.3167, "step": 262 }, { "epoch": 0.0532280914794576, "grad_norm": 0.4961284101009369, "learning_rate": 0.00019966867088307976, "loss": 0.3073, "step": 263 }, { "epoch": 0.05343047965998786, "grad_norm": 0.7699441909790039, "learning_rate": 0.00019966607876304427, "loss": 0.3673, "step": 264 }, { "epoch": 0.053632867840518114, "grad_norm": 0.4595373272895813, "learning_rate": 0.00019966347655984068, "loss": 0.3183, "step": 265 }, { "epoch": 0.05383525602104837, "grad_norm": 0.5160897970199585, "learning_rate": 0.00019966086427373233, "loss": 0.3031, "step": 266 }, { "epoch": 0.054037644201578625, "grad_norm": 0.7531689405441284, "learning_rate": 0.00019965824190498342, "loss": 0.3117, "step": 267 }, { "epoch": 0.05424003238210889, "grad_norm": 0.6607016324996948, "learning_rate": 0.0001996556094538593, "loss": 0.2863, "step": 268 }, { "epoch": 0.05444242056263914, "grad_norm": 0.7021990418434143, "learning_rate": 0.0001996529669206263, "loss": 0.3234, "step": 269 }, { "epoch": 0.0546448087431694, "grad_norm": 0.9091718792915344, "learning_rate": 
0.00019965031430555177, "loss": 0.2699, "step": 270 }, { "epoch": 0.05484719692369966, "grad_norm": 0.5182921886444092, "learning_rate": 0.00019964765160890405, "loss": 0.2926, "step": 271 }, { "epoch": 0.055049585104229914, "grad_norm": 0.5352545380592346, "learning_rate": 0.0001996449788309525, "loss": 0.3292, "step": 272 }, { "epoch": 0.05525197328476017, "grad_norm": 0.5082312226295471, "learning_rate": 0.00019964229597196757, "loss": 0.3404, "step": 273 }, { "epoch": 0.055454361465290425, "grad_norm": 0.6266235113143921, "learning_rate": 0.0001996396030322207, "loss": 0.3178, "step": 274 }, { "epoch": 0.05565674964582069, "grad_norm": 0.5914604663848877, "learning_rate": 0.00019963690001198426, "loss": 0.3164, "step": 275 }, { "epoch": 0.05585913782635094, "grad_norm": 0.6798471212387085, "learning_rate": 0.00019963418691153176, "loss": 0.3606, "step": 276 }, { "epoch": 0.0560615260068812, "grad_norm": 0.68598872423172, "learning_rate": 0.0001996314637311377, "loss": 0.3397, "step": 277 }, { "epoch": 0.05626391418741145, "grad_norm": 0.5811251401901245, "learning_rate": 0.00019962873047107757, "loss": 0.3073, "step": 278 }, { "epoch": 0.056466302367941715, "grad_norm": 0.5858151912689209, "learning_rate": 0.00019962598713162786, "loss": 0.3172, "step": 279 }, { "epoch": 0.05666869054847197, "grad_norm": 0.5117138028144836, "learning_rate": 0.00019962323371306616, "loss": 0.3322, "step": 280 }, { "epoch": 0.056871078729002225, "grad_norm": 0.6146894097328186, "learning_rate": 0.000199620470215671, "loss": 0.3634, "step": 281 }, { "epoch": 0.05707346690953248, "grad_norm": 0.5277398228645325, "learning_rate": 0.00019961769663972195, "loss": 0.3092, "step": 282 }, { "epoch": 0.05727585509006274, "grad_norm": 0.6759743094444275, "learning_rate": 0.00019961491298549962, "loss": 0.2589, "step": 283 }, { "epoch": 0.057478243270593, "grad_norm": 0.45867061614990234, "learning_rate": 0.00019961211925328566, "loss": 0.3521, "step": 284 }, { "epoch": 0.05768063145112325, 
"grad_norm": 0.7512937188148499, "learning_rate": 0.0001996093154433627, "loss": 0.3041, "step": 285 }, { "epoch": 0.05788301963165351, "grad_norm": 0.5749024748802185, "learning_rate": 0.00019960650155601437, "loss": 0.328, "step": 286 }, { "epoch": 0.05808540781218377, "grad_norm": 0.6303148865699768, "learning_rate": 0.00019960367759152542, "loss": 0.3481, "step": 287 }, { "epoch": 0.058287795992714025, "grad_norm": 0.6051326990127563, "learning_rate": 0.00019960084355018145, "loss": 0.3371, "step": 288 }, { "epoch": 0.05849018417324428, "grad_norm": 0.5055463910102844, "learning_rate": 0.00019959799943226924, "loss": 0.3593, "step": 289 }, { "epoch": 0.05869257235377454, "grad_norm": 0.5214646458625793, "learning_rate": 0.00019959514523807653, "loss": 0.2492, "step": 290 }, { "epoch": 0.0588949605343048, "grad_norm": 0.5222678780555725, "learning_rate": 0.00019959228096789206, "loss": 0.2845, "step": 291 }, { "epoch": 0.05909734871483505, "grad_norm": 0.4554538130760193, "learning_rate": 0.00019958940662200561, "loss": 0.2753, "step": 292 }, { "epoch": 0.05929973689536531, "grad_norm": 0.6560251116752625, "learning_rate": 0.00019958652220070796, "loss": 0.2887, "step": 293 }, { "epoch": 0.05950212507589557, "grad_norm": 0.5200150012969971, "learning_rate": 0.00019958362770429097, "loss": 0.3094, "step": 294 }, { "epoch": 0.059704513256425826, "grad_norm": 0.5180234909057617, "learning_rate": 0.00019958072313304746, "loss": 0.3173, "step": 295 }, { "epoch": 0.05990690143695608, "grad_norm": 0.5091387033462524, "learning_rate": 0.00019957780848727123, "loss": 0.3008, "step": 296 }, { "epoch": 0.060109289617486336, "grad_norm": 0.4643876254558563, "learning_rate": 0.0001995748837672572, "loss": 0.284, "step": 297 }, { "epoch": 0.0603116777980166, "grad_norm": 0.6453856825828552, "learning_rate": 0.00019957194897330128, "loss": 0.3132, "step": 298 }, { "epoch": 0.060514065978546853, "grad_norm": 0.5977994203567505, "learning_rate": 0.00019956900410570037, "loss": 
0.3169, "step": 299 }, { "epoch": 0.06071645415907711, "grad_norm": 0.5075947642326355, "learning_rate": 0.00019956604916475235, "loss": 0.2862, "step": 300 }, { "epoch": 0.06071645415907711, "eval_loss": 0.3347070813179016, "eval_runtime": 0.742, "eval_samples_per_second": 6.739, "eval_steps_per_second": 1.348, "step": 300 }, { "epoch": 0.060918842339607364, "grad_norm": 0.7045861482620239, "learning_rate": 0.00019956308415075626, "loss": 0.2855, "step": 301 }, { "epoch": 0.061121230520137626, "grad_norm": 0.5203765034675598, "learning_rate": 0.000199560109064012, "loss": 0.3551, "step": 302 }, { "epoch": 0.06132361870066788, "grad_norm": 0.39355793595314026, "learning_rate": 0.00019955712390482057, "loss": 0.3208, "step": 303 }, { "epoch": 0.061526006881198136, "grad_norm": 0.7209200859069824, "learning_rate": 0.00019955412867348398, "loss": 0.2472, "step": 304 }, { "epoch": 0.06172839506172839, "grad_norm": 0.489533007144928, "learning_rate": 0.00019955112337030525, "loss": 0.2821, "step": 305 }, { "epoch": 0.061930783242258654, "grad_norm": 0.4404822587966919, "learning_rate": 0.00019954810799558846, "loss": 0.3182, "step": 306 }, { "epoch": 0.06213317142278891, "grad_norm": 0.5157645344734192, "learning_rate": 0.00019954508254963865, "loss": 0.3032, "step": 307 }, { "epoch": 0.062335559603319164, "grad_norm": 0.5006906390190125, "learning_rate": 0.0001995420470327619, "loss": 0.2992, "step": 308 }, { "epoch": 0.06253794778384943, "grad_norm": 0.4116860330104828, "learning_rate": 0.00019953900144526528, "loss": 0.3001, "step": 309 }, { "epoch": 0.06274033596437968, "grad_norm": 0.49453118443489075, "learning_rate": 0.00019953594578745698, "loss": 0.2938, "step": 310 }, { "epoch": 0.06294272414490994, "grad_norm": 0.5103276371955872, "learning_rate": 0.0001995328800596461, "loss": 0.3285, "step": 311 }, { "epoch": 0.06314511232544019, "grad_norm": 0.47672298550605774, "learning_rate": 0.00019952980426214277, "loss": 0.3173, "step": 312 }, { "epoch": 
0.06334750050597045, "grad_norm": 0.45488908886909485, "learning_rate": 0.00019952671839525824, "loss": 0.355, "step": 313 }, { "epoch": 0.0635498886865007, "grad_norm": 0.47179242968559265, "learning_rate": 0.00019952362245930467, "loss": 0.2831, "step": 314 }, { "epoch": 0.06375227686703097, "grad_norm": 0.5481370687484741, "learning_rate": 0.00019952051645459525, "loss": 0.3201, "step": 315 }, { "epoch": 0.06395466504756123, "grad_norm": 0.44173574447631836, "learning_rate": 0.00019951740038144422, "loss": 0.3181, "step": 316 }, { "epoch": 0.06415705322809148, "grad_norm": 0.5051449537277222, "learning_rate": 0.00019951427424016687, "loss": 0.3416, "step": 317 }, { "epoch": 0.06435944140862174, "grad_norm": 0.49225544929504395, "learning_rate": 0.0001995111380310794, "loss": 0.2848, "step": 318 }, { "epoch": 0.06456182958915199, "grad_norm": 0.472187340259552, "learning_rate": 0.00019950799175449922, "loss": 0.2474, "step": 319 }, { "epoch": 0.06476421776968225, "grad_norm": 0.5424083471298218, "learning_rate": 0.0001995048354107445, "loss": 0.3382, "step": 320 }, { "epoch": 0.0649666059502125, "grad_norm": 0.5043047070503235, "learning_rate": 0.00019950166900013463, "loss": 0.2834, "step": 321 }, { "epoch": 0.06516899413074276, "grad_norm": 0.3971298038959503, "learning_rate": 0.00019949849252298994, "loss": 0.2451, "step": 322 }, { "epoch": 0.06537138231127303, "grad_norm": 0.51171875, "learning_rate": 0.00019949530597963185, "loss": 0.3206, "step": 323 }, { "epoch": 0.06557377049180328, "grad_norm": 0.4314689040184021, "learning_rate": 0.00019949210937038266, "loss": 0.2662, "step": 324 }, { "epoch": 0.06577615867233354, "grad_norm": 0.5465297102928162, "learning_rate": 0.00019948890269556578, "loss": 0.335, "step": 325 }, { "epoch": 0.06597854685286379, "grad_norm": 0.731951892375946, "learning_rate": 0.0001994856859555057, "loss": 0.2778, "step": 326 }, { "epoch": 0.06618093503339405, "grad_norm": 0.6282134652137756, "learning_rate": 0.00019948245915052778, 
"loss": 0.3051, "step": 327 }, { "epoch": 0.0663833232139243, "grad_norm": 0.5031090378761292, "learning_rate": 0.0001994792222809585, "loss": 0.2717, "step": 328 }, { "epoch": 0.06658571139445456, "grad_norm": 0.5202815532684326, "learning_rate": 0.00019947597534712531, "loss": 0.2698, "step": 329 }, { "epoch": 0.06678809957498483, "grad_norm": 0.42810511589050293, "learning_rate": 0.00019947271834935677, "loss": 0.2872, "step": 330 }, { "epoch": 0.06699048775551508, "grad_norm": 0.5623170137405396, "learning_rate": 0.00019946945128798232, "loss": 0.2862, "step": 331 }, { "epoch": 0.06719287593604534, "grad_norm": 0.4195396602153778, "learning_rate": 0.00019946617416333252, "loss": 0.2956, "step": 332 }, { "epoch": 0.06739526411657559, "grad_norm": 0.5211588144302368, "learning_rate": 0.0001994628869757389, "loss": 0.2792, "step": 333 }, { "epoch": 0.06759765229710585, "grad_norm": 0.39513328671455383, "learning_rate": 0.00019945958972553403, "loss": 0.2431, "step": 334 }, { "epoch": 0.0678000404776361, "grad_norm": 0.6008070707321167, "learning_rate": 0.00019945628241305148, "loss": 0.271, "step": 335 }, { "epoch": 0.06800242865816636, "grad_norm": 0.46714478731155396, "learning_rate": 0.00019945296503862586, "loss": 0.2815, "step": 336 }, { "epoch": 0.06820481683869661, "grad_norm": 0.3947727680206299, "learning_rate": 0.0001994496376025928, "loss": 0.2492, "step": 337 }, { "epoch": 0.06840720501922688, "grad_norm": 0.38070622086524963, "learning_rate": 0.00019944630010528891, "loss": 0.2768, "step": 338 }, { "epoch": 0.06860959319975714, "grad_norm": 0.602689802646637, "learning_rate": 0.00019944295254705185, "loss": 0.2819, "step": 339 }, { "epoch": 0.0688119813802874, "grad_norm": 0.7967174649238586, "learning_rate": 0.0001994395949282203, "loss": 0.2612, "step": 340 }, { "epoch": 0.06901436956081765, "grad_norm": 0.4428558647632599, "learning_rate": 0.00019943622724913395, "loss": 0.3029, "step": 341 }, { "epoch": 0.0692167577413479, "grad_norm": 
0.5136562585830688, "learning_rate": 0.00019943284951013347, "loss": 0.2933, "step": 342 }, { "epoch": 0.06941914592187816, "grad_norm": 0.5629222989082336, "learning_rate": 0.00019942946171156063, "loss": 0.3204, "step": 343 }, { "epoch": 0.06962153410240841, "grad_norm": 0.4126991927623749, "learning_rate": 0.00019942606385375816, "loss": 0.3101, "step": 344 }, { "epoch": 0.06982392228293868, "grad_norm": 0.5552066564559937, "learning_rate": 0.0001994226559370698, "loss": 0.3645, "step": 345 }, { "epoch": 0.07002631046346894, "grad_norm": 0.42373156547546387, "learning_rate": 0.0001994192379618404, "loss": 0.2591, "step": 346 }, { "epoch": 0.0702286986439992, "grad_norm": 0.3356430232524872, "learning_rate": 0.0001994158099284156, "loss": 0.2292, "step": 347 }, { "epoch": 0.07043108682452945, "grad_norm": 0.5170024037361145, "learning_rate": 0.0001994123718371424, "loss": 0.2949, "step": 348 }, { "epoch": 0.0706334750050597, "grad_norm": 0.5372616052627563, "learning_rate": 0.0001994089236883685, "loss": 0.2594, "step": 349 }, { "epoch": 0.07083586318558996, "grad_norm": 0.48990383744239807, "learning_rate": 0.00019940546548244278, "loss": 0.2907, "step": 350 }, { "epoch": 0.07083586318558996, "eval_loss": 0.31532177329063416, "eval_runtime": 0.7386, "eval_samples_per_second": 6.77, "eval_steps_per_second": 1.354, "step": 350 }, { "epoch": 0.07103825136612021, "grad_norm": 0.6602955460548401, "learning_rate": 0.00019940199721971515, "loss": 0.301, "step": 351 }, { "epoch": 0.07124063954665047, "grad_norm": 0.5398226380348206, "learning_rate": 0.00019939851890053643, "loss": 0.2974, "step": 352 }, { "epoch": 0.07144302772718074, "grad_norm": 0.44971731305122375, "learning_rate": 0.00019939503052525853, "loss": 0.2919, "step": 353 }, { "epoch": 0.071645415907711, "grad_norm": 0.49475017189979553, "learning_rate": 0.00019939153209423438, "loss": 0.2624, "step": 354 }, { "epoch": 0.07184780408824125, "grad_norm": 0.5040010809898376, "learning_rate": 
0.00019938802360781795, "loss": 0.3241, "step": 355 }, { "epoch": 0.0720501922687715, "grad_norm": 0.35557180643081665, "learning_rate": 0.00019938450506636413, "loss": 0.2849, "step": 356 }, { "epoch": 0.07225258044930176, "grad_norm": 0.4988034665584564, "learning_rate": 0.00019938097647022893, "loss": 0.2742, "step": 357 }, { "epoch": 0.07245496862983201, "grad_norm": 0.5231152772903442, "learning_rate": 0.0001993774378197693, "loss": 0.2937, "step": 358 }, { "epoch": 0.07265735681036227, "grad_norm": 0.5258811712265015, "learning_rate": 0.00019937388911534328, "loss": 0.3157, "step": 359 }, { "epoch": 0.07285974499089254, "grad_norm": 0.42325958609580994, "learning_rate": 0.0001993703303573099, "loss": 0.2782, "step": 360 }, { "epoch": 0.0730621331714228, "grad_norm": 0.5115485191345215, "learning_rate": 0.00019936676154602915, "loss": 0.3047, "step": 361 }, { "epoch": 0.07326452135195305, "grad_norm": 0.4342189133167267, "learning_rate": 0.00019936318268186213, "loss": 0.3054, "step": 362 }, { "epoch": 0.0734669095324833, "grad_norm": 0.41122326254844666, "learning_rate": 0.00019935959376517087, "loss": 0.3087, "step": 363 }, { "epoch": 0.07366929771301356, "grad_norm": 0.35741302371025085, "learning_rate": 0.0001993559947963185, "loss": 0.2805, "step": 364 }, { "epoch": 0.07387168589354381, "grad_norm": 0.3425546884536743, "learning_rate": 0.0001993523857756691, "loss": 0.2627, "step": 365 }, { "epoch": 0.07407407407407407, "grad_norm": 0.668941080570221, "learning_rate": 0.0001993487667035878, "loss": 0.275, "step": 366 }, { "epoch": 0.07427646225460433, "grad_norm": 0.4595528542995453, "learning_rate": 0.00019934513758044074, "loss": 0.2705, "step": 367 }, { "epoch": 0.0744788504351346, "grad_norm": 0.5246752500534058, "learning_rate": 0.00019934149840659506, "loss": 0.3243, "step": 368 }, { "epoch": 0.07468123861566485, "grad_norm": 0.4581606388092041, "learning_rate": 0.00019933784918241897, "loss": 0.2529, "step": 369 }, { "epoch": 0.0748836267961951, 
"grad_norm": 0.40944111347198486, "learning_rate": 0.00019933418990828163, "loss": 0.2986, "step": 370 }, { "epoch": 0.07508601497672536, "grad_norm": 0.3873656392097473, "learning_rate": 0.00019933052058455325, "loss": 0.2519, "step": 371 }, { "epoch": 0.07528840315725562, "grad_norm": 0.4861612915992737, "learning_rate": 0.0001993268412116051, "loss": 0.2279, "step": 372 }, { "epoch": 0.07549079133778587, "grad_norm": 0.5027487277984619, "learning_rate": 0.00019932315178980935, "loss": 0.26, "step": 373 }, { "epoch": 0.07569317951831613, "grad_norm": 0.42377370595932007, "learning_rate": 0.00019931945231953927, "loss": 0.2662, "step": 374 }, { "epoch": 0.07589556769884638, "grad_norm": 0.4166119396686554, "learning_rate": 0.0001993157428011692, "loss": 0.2813, "step": 375 }, { "epoch": 0.07609795587937665, "grad_norm": 0.4710562825202942, "learning_rate": 0.00019931202323507434, "loss": 0.2973, "step": 376 }, { "epoch": 0.0763003440599069, "grad_norm": 0.41344717144966125, "learning_rate": 0.00019930829362163106, "loss": 0.2842, "step": 377 }, { "epoch": 0.07650273224043716, "grad_norm": 0.567787766456604, "learning_rate": 0.00019930455396121666, "loss": 0.2918, "step": 378 }, { "epoch": 0.07670512042096742, "grad_norm": 0.39079976081848145, "learning_rate": 0.00019930080425420947, "loss": 0.2845, "step": 379 }, { "epoch": 0.07690750860149767, "grad_norm": 0.32150888442993164, "learning_rate": 0.00019929704450098889, "loss": 0.2704, "step": 380 }, { "epoch": 0.07710989678202793, "grad_norm": 0.4112628996372223, "learning_rate": 0.0001992932747019352, "loss": 0.2013, "step": 381 }, { "epoch": 0.07731228496255818, "grad_norm": 0.4239389896392822, "learning_rate": 0.0001992894948574299, "loss": 0.2387, "step": 382 }, { "epoch": 0.07751467314308845, "grad_norm": 0.4737512767314911, "learning_rate": 0.00019928570496785533, "loss": 0.334, "step": 383 }, { "epoch": 0.0777170613236187, "grad_norm": 0.36126965284347534, "learning_rate": 0.0001992819050335949, "loss": 
0.2539, "step": 384 }, { "epoch": 0.07791944950414896, "grad_norm": 0.4831307828426361, "learning_rate": 0.00019927809505503307, "loss": 0.2965, "step": 385 }, { "epoch": 0.07812183768467922, "grad_norm": 0.5115605592727661, "learning_rate": 0.00019927427503255534, "loss": 0.2953, "step": 386 }, { "epoch": 0.07832422586520947, "grad_norm": 0.38006240129470825, "learning_rate": 0.0001992704449665481, "loss": 0.2978, "step": 387 }, { "epoch": 0.07852661404573973, "grad_norm": 0.4983793795108795, "learning_rate": 0.00019926660485739887, "loss": 0.2788, "step": 388 }, { "epoch": 0.07872900222626998, "grad_norm": 0.41345569491386414, "learning_rate": 0.00019926275470549617, "loss": 0.2738, "step": 389 }, { "epoch": 0.07893139040680024, "grad_norm": 0.4223555326461792, "learning_rate": 0.00019925889451122953, "loss": 0.2921, "step": 390 }, { "epoch": 0.0791337785873305, "grad_norm": 0.5941314697265625, "learning_rate": 0.00019925502427498942, "loss": 0.2642, "step": 391 }, { "epoch": 0.07933616676786076, "grad_norm": 0.4179031252861023, "learning_rate": 0.0001992511439971674, "loss": 0.2429, "step": 392 }, { "epoch": 0.07953855494839102, "grad_norm": 0.4050491750240326, "learning_rate": 0.0001992472536781561, "loss": 0.2923, "step": 393 }, { "epoch": 0.07974094312892127, "grad_norm": 0.38059109449386597, "learning_rate": 0.00019924335331834904, "loss": 0.2807, "step": 394 }, { "epoch": 0.07994333130945153, "grad_norm": 0.42187055945396423, "learning_rate": 0.00019923944291814084, "loss": 0.2692, "step": 395 }, { "epoch": 0.08014571948998178, "grad_norm": 0.42098861932754517, "learning_rate": 0.0001992355224779271, "loss": 0.3065, "step": 396 }, { "epoch": 0.08034810767051204, "grad_norm": 0.38886579871177673, "learning_rate": 0.00019923159199810453, "loss": 0.2607, "step": 397 }, { "epoch": 0.0805504958510423, "grad_norm": 0.4102109968662262, "learning_rate": 0.00019922765147907065, "loss": 0.292, "step": 398 }, { "epoch": 0.08075288403157256, "grad_norm": 
0.5397422909736633, "learning_rate": 0.0001992237009212242, "loss": 0.3344, "step": 399 }, { "epoch": 0.08095527221210282, "grad_norm": 0.45712950825691223, "learning_rate": 0.0001992197403249648, "loss": 0.2833, "step": 400 }, { "epoch": 0.08095527221210282, "eval_loss": 0.3104330003261566, "eval_runtime": 0.7392, "eval_samples_per_second": 6.764, "eval_steps_per_second": 1.353, "step": 400 }, { "epoch": 0.08115766039263307, "grad_norm": 0.3736235499382019, "learning_rate": 0.0001992157696906932, "loss": 0.2656, "step": 401 }, { "epoch": 0.08136004857316333, "grad_norm": 0.3134549558162689, "learning_rate": 0.0001992117890188111, "loss": 0.2562, "step": 402 }, { "epoch": 0.08156243675369358, "grad_norm": 0.3223460614681244, "learning_rate": 0.00019920779830972116, "loss": 0.2533, "step": 403 }, { "epoch": 0.08176482493422384, "grad_norm": 0.39283445477485657, "learning_rate": 0.0001992037975638272, "loss": 0.2602, "step": 404 }, { "epoch": 0.08196721311475409, "grad_norm": 0.44545242190361023, "learning_rate": 0.00019919978678153391, "loss": 0.2675, "step": 405 }, { "epoch": 0.08216960129528436, "grad_norm": 0.373585045337677, "learning_rate": 0.00019919576596324709, "loss": 0.2244, "step": 406 }, { "epoch": 0.08237198947581462, "grad_norm": 0.38392719626426697, "learning_rate": 0.00019919173510937356, "loss": 0.2645, "step": 407 }, { "epoch": 0.08257437765634487, "grad_norm": 0.43317118287086487, "learning_rate": 0.00019918769422032102, "loss": 0.2755, "step": 408 }, { "epoch": 0.08277676583687513, "grad_norm": 0.4535873234272003, "learning_rate": 0.00019918364329649837, "loss": 0.2632, "step": 409 }, { "epoch": 0.08297915401740538, "grad_norm": 0.32678788900375366, "learning_rate": 0.00019917958233831538, "loss": 0.2825, "step": 410 }, { "epoch": 0.08318154219793564, "grad_norm": 0.4875165522098541, "learning_rate": 0.00019917551134618298, "loss": 0.2603, "step": 411 }, { "epoch": 0.08338393037846589, "grad_norm": 0.4667682647705078, "learning_rate": 
0.00019917143032051297, "loss": 0.262, "step": 412 }, { "epoch": 0.08358631855899615, "grad_norm": 0.32662510871887207, "learning_rate": 0.00019916733926171823, "loss": 0.222, "step": 413 }, { "epoch": 0.08378870673952642, "grad_norm": 0.3929062485694885, "learning_rate": 0.00019916323817021264, "loss": 0.2742, "step": 414 }, { "epoch": 0.08399109492005667, "grad_norm": 0.3036748170852661, "learning_rate": 0.00019915912704641112, "loss": 0.2677, "step": 415 }, { "epoch": 0.08419348310058693, "grad_norm": 0.28849026560783386, "learning_rate": 0.00019915500589072962, "loss": 0.2248, "step": 416 }, { "epoch": 0.08439587128111718, "grad_norm": 0.4197934567928314, "learning_rate": 0.00019915087470358502, "loss": 0.3324, "step": 417 }, { "epoch": 0.08459825946164744, "grad_norm": 0.3673010766506195, "learning_rate": 0.00019914673348539529, "loss": 0.2377, "step": 418 }, { "epoch": 0.08480064764217769, "grad_norm": 0.4578173756599426, "learning_rate": 0.00019914258223657942, "loss": 0.2526, "step": 419 }, { "epoch": 0.08500303582270795, "grad_norm": 0.3941150903701782, "learning_rate": 0.00019913842095755735, "loss": 0.2522, "step": 420 }, { "epoch": 0.08520542400323822, "grad_norm": 0.42108628153800964, "learning_rate": 0.00019913424964875009, "loss": 0.2516, "step": 421 }, { "epoch": 0.08540781218376847, "grad_norm": 0.3706568777561188, "learning_rate": 0.00019913006831057969, "loss": 0.2809, "step": 422 }, { "epoch": 0.08561020036429873, "grad_norm": 0.37976858019828796, "learning_rate": 0.00019912587694346912, "loss": 0.2495, "step": 423 }, { "epoch": 0.08581258854482898, "grad_norm": 0.3249634802341461, "learning_rate": 0.00019912167554784246, "loss": 0.2631, "step": 424 }, { "epoch": 0.08601497672535924, "grad_norm": 0.35132887959480286, "learning_rate": 0.0001991174641241247, "loss": 0.28, "step": 425 }, { "epoch": 0.0862173649058895, "grad_norm": 0.3174639046192169, "learning_rate": 0.00019911324267274196, "loss": 0.272, "step": 426 }, { "epoch": 
0.08641975308641975, "grad_norm": 0.3432652950286865, "learning_rate": 0.0001991090111941213, "loss": 0.2545, "step": 427 }, { "epoch": 0.08662214126695, "grad_norm": 0.4003385305404663, "learning_rate": 0.00019910476968869082, "loss": 0.2637, "step": 428 }, { "epoch": 0.08682452944748027, "grad_norm": 0.2984203100204468, "learning_rate": 0.0001991005181568797, "loss": 0.2719, "step": 429 }, { "epoch": 0.08702691762801053, "grad_norm": 0.5444194078445435, "learning_rate": 0.00019909625659911794, "loss": 0.2601, "step": 430 }, { "epoch": 0.08722930580854078, "grad_norm": 0.43773773312568665, "learning_rate": 0.00019909198501583678, "loss": 0.2497, "step": 431 }, { "epoch": 0.08743169398907104, "grad_norm": 0.3558596074581146, "learning_rate": 0.00019908770340746829, "loss": 0.2708, "step": 432 }, { "epoch": 0.0876340821696013, "grad_norm": 0.37624043226242065, "learning_rate": 0.00019908341177444575, "loss": 0.2568, "step": 433 }, { "epoch": 0.08783647035013155, "grad_norm": 0.37379008531570435, "learning_rate": 0.0001990791101172032, "loss": 0.2937, "step": 434 }, { "epoch": 0.0880388585306618, "grad_norm": 0.44534832239151, "learning_rate": 0.00019907479843617597, "loss": 0.2665, "step": 435 }, { "epoch": 0.08824124671119207, "grad_norm": 0.356768935918808, "learning_rate": 0.00019907047673180023, "loss": 0.2367, "step": 436 }, { "epoch": 0.08844363489172233, "grad_norm": 0.3663882315158844, "learning_rate": 0.00019906614500451314, "loss": 0.2213, "step": 437 }, { "epoch": 0.08864602307225258, "grad_norm": 0.6157559156417847, "learning_rate": 0.000199061803254753, "loss": 0.2592, "step": 438 }, { "epoch": 0.08884841125278284, "grad_norm": 0.3663040101528168, "learning_rate": 0.00019905745148295905, "loss": 0.2704, "step": 439 }, { "epoch": 0.0890507994333131, "grad_norm": 0.41940397024154663, "learning_rate": 0.00019905308968957156, "loss": 0.2783, "step": 440 }, { "epoch": 0.08925318761384335, "grad_norm": 0.28495171666145325, "learning_rate": 
0.0001990487178750318, "loss": 0.2551, "step": 441 }, { "epoch": 0.0894555757943736, "grad_norm": 0.33689308166503906, "learning_rate": 0.0001990443360397821, "loss": 0.2662, "step": 442 }, { "epoch": 0.08965796397490386, "grad_norm": 0.34871089458465576, "learning_rate": 0.00019903994418426571, "loss": 0.2518, "step": 443 }, { "epoch": 0.08986035215543413, "grad_norm": 0.3919788897037506, "learning_rate": 0.000199035542308927, "loss": 0.2686, "step": 444 }, { "epoch": 0.09006274033596438, "grad_norm": 0.33044981956481934, "learning_rate": 0.00019903113041421126, "loss": 0.2436, "step": 445 }, { "epoch": 0.09026512851649464, "grad_norm": 0.4084889888763428, "learning_rate": 0.0001990267085005649, "loss": 0.2694, "step": 446 }, { "epoch": 0.0904675166970249, "grad_norm": 0.43821752071380615, "learning_rate": 0.00019902227656843523, "loss": 0.286, "step": 447 }, { "epoch": 0.09066990487755515, "grad_norm": 0.41732826828956604, "learning_rate": 0.00019901783461827066, "loss": 0.2943, "step": 448 }, { "epoch": 0.0908722930580854, "grad_norm": 0.6563799381256104, "learning_rate": 0.00019901338265052056, "loss": 0.249, "step": 449 }, { "epoch": 0.09107468123861566, "grad_norm": 0.3646370768547058, "learning_rate": 0.0001990089206656353, "loss": 0.2762, "step": 450 }, { "epoch": 0.09107468123861566, "eval_loss": 0.3083243668079376, "eval_runtime": 0.7375, "eval_samples_per_second": 6.78, "eval_steps_per_second": 1.356, "step": 450 }, { "epoch": 0.09127706941914593, "grad_norm": 0.3965453803539276, "learning_rate": 0.0001990044486640664, "loss": 0.2632, "step": 451 }, { "epoch": 0.09147945759967618, "grad_norm": 0.4205472469329834, "learning_rate": 0.0001989999666462662, "loss": 0.2831, "step": 452 }, { "epoch": 0.09168184578020644, "grad_norm": 0.4374144971370697, "learning_rate": 0.00019899547461268817, "loss": 0.2215, "step": 453 }, { "epoch": 0.0918842339607367, "grad_norm": 0.6188966631889343, "learning_rate": 0.00019899097256378677, "loss": 0.2578, "step": 454 }, { 
"epoch": 0.09208662214126695, "grad_norm": 0.34634727239608765, "learning_rate": 0.00019898646050001747, "loss": 0.2623, "step": 455 }, { "epoch": 0.0922890103217972, "grad_norm": 0.326874315738678, "learning_rate": 0.00019898193842183672, "loss": 0.2283, "step": 456 }, { "epoch": 0.09249139850232746, "grad_norm": 0.3612661361694336, "learning_rate": 0.00019897740632970207, "loss": 0.2627, "step": 457 }, { "epoch": 0.09269378668285772, "grad_norm": 0.3630461096763611, "learning_rate": 0.00019897286422407203, "loss": 0.2503, "step": 458 }, { "epoch": 0.09289617486338798, "grad_norm": 0.4167366027832031, "learning_rate": 0.00019896831210540605, "loss": 0.2897, "step": 459 }, { "epoch": 0.09309856304391824, "grad_norm": 0.41102683544158936, "learning_rate": 0.00019896374997416475, "loss": 0.2091, "step": 460 }, { "epoch": 0.0933009512244485, "grad_norm": 0.4077226221561432, "learning_rate": 0.0001989591778308096, "loss": 0.3367, "step": 461 }, { "epoch": 0.09350333940497875, "grad_norm": 0.5037345886230469, "learning_rate": 0.00019895459567580325, "loss": 0.2544, "step": 462 }, { "epoch": 0.093705727585509, "grad_norm": 0.40945005416870117, "learning_rate": 0.00019895000350960923, "loss": 0.2801, "step": 463 }, { "epoch": 0.09390811576603926, "grad_norm": 0.561182975769043, "learning_rate": 0.00019894540133269208, "loss": 0.2737, "step": 464 }, { "epoch": 0.09411050394656952, "grad_norm": 0.33685287833213806, "learning_rate": 0.00019894078914551748, "loss": 0.2281, "step": 465 }, { "epoch": 0.09431289212709977, "grad_norm": 0.3741171360015869, "learning_rate": 0.000198936166948552, "loss": 0.3131, "step": 466 }, { "epoch": 0.09451528030763004, "grad_norm": 0.4491271674633026, "learning_rate": 0.00019893153474226328, "loss": 0.2873, "step": 467 }, { "epoch": 0.0947176684881603, "grad_norm": 0.36082473397254944, "learning_rate": 0.00019892689252711993, "loss": 0.2676, "step": 468 }, { "epoch": 0.09492005666869055, "grad_norm": 0.39217400550842285, "learning_rate": 
0.00019892224030359165, "loss": 0.2614, "step": 469 }, { "epoch": 0.0951224448492208, "grad_norm": 0.45073944330215454, "learning_rate": 0.00019891757807214905, "loss": 0.2643, "step": 470 }, { "epoch": 0.09532483302975106, "grad_norm": 0.32423001527786255, "learning_rate": 0.00019891290583326385, "loss": 0.2128, "step": 471 }, { "epoch": 0.09552722121028132, "grad_norm": 0.47926604747772217, "learning_rate": 0.0001989082235874087, "loss": 0.2651, "step": 472 }, { "epoch": 0.09572960939081157, "grad_norm": 0.8147411346435547, "learning_rate": 0.00019890353133505734, "loss": 0.2591, "step": 473 }, { "epoch": 0.09593199757134184, "grad_norm": 0.422296941280365, "learning_rate": 0.00019889882907668445, "loss": 0.2747, "step": 474 }, { "epoch": 0.0961343857518721, "grad_norm": 0.4397691786289215, "learning_rate": 0.00019889411681276578, "loss": 0.2674, "step": 475 }, { "epoch": 0.09633677393240235, "grad_norm": 0.6375408172607422, "learning_rate": 0.00019888939454377805, "loss": 0.3053, "step": 476 }, { "epoch": 0.0965391621129326, "grad_norm": 0.40631526708602905, "learning_rate": 0.00019888466227019902, "loss": 0.302, "step": 477 }, { "epoch": 0.09674155029346286, "grad_norm": 0.4207494556903839, "learning_rate": 0.00019887991999250742, "loss": 0.3136, "step": 478 }, { "epoch": 0.09694393847399312, "grad_norm": 0.41882240772247314, "learning_rate": 0.00019887516771118307, "loss": 0.3117, "step": 479 }, { "epoch": 0.09714632665452337, "grad_norm": 0.3637535572052002, "learning_rate": 0.00019887040542670672, "loss": 0.2458, "step": 480 }, { "epoch": 0.09734871483505363, "grad_norm": 0.5529453158378601, "learning_rate": 0.0001988656331395602, "loss": 0.3039, "step": 481 }, { "epoch": 0.0975511030155839, "grad_norm": 0.36648619174957275, "learning_rate": 0.00019886085085022632, "loss": 0.323, "step": 482 }, { "epoch": 0.09775349119611415, "grad_norm": 0.3846886157989502, "learning_rate": 0.00019885605855918885, "loss": 0.2829, "step": 483 }, { "epoch": 
0.0979558793766444, "grad_norm": 0.4345422089099884, "learning_rate": 0.0001988512562669327, "loss": 0.2507, "step": 484 }, { "epoch": 0.09815826755717466, "grad_norm": 0.39755547046661377, "learning_rate": 0.00019884644397394366, "loss": 0.245, "step": 485 }, { "epoch": 0.09836065573770492, "grad_norm": 0.36251211166381836, "learning_rate": 0.0001988416216807086, "loss": 0.2837, "step": 486 }, { "epoch": 0.09856304391823517, "grad_norm": 0.3217640221118927, "learning_rate": 0.00019883678938771538, "loss": 0.2498, "step": 487 }, { "epoch": 0.09876543209876543, "grad_norm": 0.3786596953868866, "learning_rate": 0.0001988319470954529, "loss": 0.2828, "step": 488 }, { "epoch": 0.0989678202792957, "grad_norm": 0.2914827764034271, "learning_rate": 0.00019882709480441104, "loss": 0.2648, "step": 489 }, { "epoch": 0.09917020845982595, "grad_norm": 0.5327249765396118, "learning_rate": 0.00019882223251508073, "loss": 0.3445, "step": 490 }, { "epoch": 0.0993725966403562, "grad_norm": 0.30911511182785034, "learning_rate": 0.00019881736022795383, "loss": 0.2566, "step": 491 }, { "epoch": 0.09957498482088646, "grad_norm": 0.8316447734832764, "learning_rate": 0.00019881247794352333, "loss": 0.266, "step": 492 }, { "epoch": 0.09977737300141672, "grad_norm": 0.32144424319267273, "learning_rate": 0.00019880758566228314, "loss": 0.2761, "step": 493 }, { "epoch": 0.09997976118194697, "grad_norm": 0.4167412221431732, "learning_rate": 0.00019880268338472819, "loss": 0.2849, "step": 494 }, { "epoch": 0.10018214936247723, "grad_norm": 0.39232102036476135, "learning_rate": 0.00019879777111135444, "loss": 0.253, "step": 495 }, { "epoch": 0.10038453754300748, "grad_norm": 0.34585121273994446, "learning_rate": 0.0001987928488426589, "loss": 0.2682, "step": 496 }, { "epoch": 0.10058692572353775, "grad_norm": 0.41705626249313354, "learning_rate": 0.00019878791657913957, "loss": 0.2431, "step": 497 }, { "epoch": 0.100789313904068, "grad_norm": 0.38251325488090515, "learning_rate": 
0.0001987829743212954, "loss": 0.3012, "step": 498 }, { "epoch": 0.10099170208459826, "grad_norm": 0.43135866522789, "learning_rate": 0.00019877802206962639, "loss": 0.2738, "step": 499 }, { "epoch": 0.10119409026512852, "grad_norm": 0.3900761902332306, "learning_rate": 0.00019877305982463357, "loss": 0.2732, "step": 500 }, { "epoch": 0.10119409026512852, "eval_loss": 0.30053189396858215, "eval_runtime": 0.7369, "eval_samples_per_second": 6.785, "eval_steps_per_second": 1.357, "step": 500 }, { "epoch": 0.10139647844565877, "grad_norm": 0.3199058473110199, "learning_rate": 0.00019876808758681897, "loss": 0.2594, "step": 501 }, { "epoch": 0.10159886662618903, "grad_norm": 0.3172077238559723, "learning_rate": 0.00019876310535668564, "loss": 0.2061, "step": 502 }, { "epoch": 0.10180125480671928, "grad_norm": 0.3133200705051422, "learning_rate": 0.00019875811313473763, "loss": 0.2348, "step": 503 }, { "epoch": 0.10200364298724955, "grad_norm": 0.331843763589859, "learning_rate": 0.00019875311092147998, "loss": 0.2242, "step": 504 }, { "epoch": 0.1022060311677798, "grad_norm": 0.4921395778656006, "learning_rate": 0.00019874809871741876, "loss": 0.2463, "step": 505 }, { "epoch": 0.10240841934831006, "grad_norm": 0.4361092746257782, "learning_rate": 0.00019874307652306106, "loss": 0.2927, "step": 506 }, { "epoch": 0.10261080752884032, "grad_norm": 0.3385670781135559, "learning_rate": 0.00019873804433891498, "loss": 0.2534, "step": 507 }, { "epoch": 0.10281319570937057, "grad_norm": 0.36069896817207336, "learning_rate": 0.0001987330021654896, "loss": 0.2575, "step": 508 }, { "epoch": 0.10301558388990083, "grad_norm": 0.3283306062221527, "learning_rate": 0.0001987279500032951, "loss": 0.2374, "step": 509 }, { "epoch": 0.10321797207043108, "grad_norm": 0.4866870939731598, "learning_rate": 0.00019872288785284257, "loss": 0.3021, "step": 510 }, { "epoch": 0.10342036025096134, "grad_norm": 0.3630296587944031, "learning_rate": 0.0001987178157146441, "loss": 0.2506, "step": 511 }, 
{ "epoch": 0.10362274843149161, "grad_norm": 0.5312589406967163, "learning_rate": 0.00019871273358921284, "loss": 0.322, "step": 512 }, { "epoch": 0.10382513661202186, "grad_norm": 0.3144959807395935, "learning_rate": 0.00019870764147706304, "loss": 0.2577, "step": 513 }, { "epoch": 0.10402752479255212, "grad_norm": 0.32503482699394226, "learning_rate": 0.00019870253937870978, "loss": 0.2335, "step": 514 }, { "epoch": 0.10422991297308237, "grad_norm": 0.34515443444252014, "learning_rate": 0.00019869742729466925, "loss": 0.2417, "step": 515 }, { "epoch": 0.10443230115361263, "grad_norm": 0.43555840849876404, "learning_rate": 0.00019869230522545866, "loss": 0.239, "step": 516 }, { "epoch": 0.10463468933414288, "grad_norm": 0.3596293032169342, "learning_rate": 0.00019868717317159617, "loss": 0.2569, "step": 517 }, { "epoch": 0.10483707751467314, "grad_norm": 0.7044296264648438, "learning_rate": 0.00019868203113360103, "loss": 0.2603, "step": 518 }, { "epoch": 0.1050394656952034, "grad_norm": 0.32274308800697327, "learning_rate": 0.0001986768791119934, "loss": 0.2647, "step": 519 }, { "epoch": 0.10524185387573366, "grad_norm": 0.4142962396144867, "learning_rate": 0.00019867171710729462, "loss": 0.2608, "step": 520 }, { "epoch": 0.10544424205626392, "grad_norm": 0.5066673159599304, "learning_rate": 0.00019866654512002682, "loss": 0.2298, "step": 521 }, { "epoch": 0.10564663023679417, "grad_norm": 0.36532062292099, "learning_rate": 0.00019866136315071326, "loss": 0.2578, "step": 522 }, { "epoch": 0.10584901841732443, "grad_norm": 0.30255767703056335, "learning_rate": 0.00019865617119987824, "loss": 0.2639, "step": 523 }, { "epoch": 0.10605140659785468, "grad_norm": 0.5272563099861145, "learning_rate": 0.000198650969268047, "loss": 0.2291, "step": 524 }, { "epoch": 0.10625379477838494, "grad_norm": 0.2946690022945404, "learning_rate": 0.00019864575735574583, "loss": 0.2495, "step": 525 }, { "epoch": 0.1064561829589152, "grad_norm": 0.3070518374443054, "learning_rate": 
0.000198640535463502, "loss": 0.2251, "step": 526 }, { "epoch": 0.10665857113944546, "grad_norm": 0.37028443813323975, "learning_rate": 0.00019863530359184381, "loss": 0.2619, "step": 527 }, { "epoch": 0.10686095931997572, "grad_norm": 0.3705368936061859, "learning_rate": 0.00019863006174130056, "loss": 0.2826, "step": 528 }, { "epoch": 0.10706334750050597, "grad_norm": 0.3553207814693451, "learning_rate": 0.0001986248099124026, "loss": 0.2731, "step": 529 }, { "epoch": 0.10726573568103623, "grad_norm": 0.306058406829834, "learning_rate": 0.00019861954810568123, "loss": 0.2201, "step": 530 }, { "epoch": 0.10746812386156648, "grad_norm": 0.30540916323661804, "learning_rate": 0.00019861427632166879, "loss": 0.2476, "step": 531 }, { "epoch": 0.10767051204209674, "grad_norm": 0.28308695554733276, "learning_rate": 0.0001986089945608986, "loss": 0.2123, "step": 532 }, { "epoch": 0.107872900222627, "grad_norm": 0.3797146677970886, "learning_rate": 0.00019860370282390505, "loss": 0.2896, "step": 533 }, { "epoch": 0.10807528840315725, "grad_norm": 0.40093159675598145, "learning_rate": 0.0001985984011112235, "loss": 0.2256, "step": 534 }, { "epoch": 0.10827767658368752, "grad_norm": 0.5490695238113403, "learning_rate": 0.00019859308942339027, "loss": 0.2718, "step": 535 }, { "epoch": 0.10848006476421777, "grad_norm": 0.3820257782936096, "learning_rate": 0.00019858776776094278, "loss": 0.2431, "step": 536 }, { "epoch": 0.10868245294474803, "grad_norm": 0.3355390727519989, "learning_rate": 0.00019858243612441945, "loss": 0.2208, "step": 537 }, { "epoch": 0.10888484112527828, "grad_norm": 0.28901511430740356, "learning_rate": 0.00019857709451435963, "loss": 0.2429, "step": 538 }, { "epoch": 0.10908722930580854, "grad_norm": 0.3239997625350952, "learning_rate": 0.00019857174293130375, "loss": 0.2538, "step": 539 }, { "epoch": 0.1092896174863388, "grad_norm": 0.37941160798072815, "learning_rate": 0.0001985663813757932, "loss": 0.2529, "step": 540 }, { "epoch": 
0.10949200566686905, "grad_norm": 0.39999866485595703, "learning_rate": 0.00019856100984837042, "loss": 0.2871, "step": 541 }, { "epoch": 0.10969439384739932, "grad_norm": 0.3257283866405487, "learning_rate": 0.0001985556283495789, "loss": 0.2384, "step": 542 }, { "epoch": 0.10989678202792957, "grad_norm": 0.7400651574134827, "learning_rate": 0.000198550236879963, "loss": 0.2667, "step": 543 }, { "epoch": 0.11009917020845983, "grad_norm": 0.40268924832344055, "learning_rate": 0.00019854483544006821, "loss": 0.2802, "step": 544 }, { "epoch": 0.11030155838899008, "grad_norm": 0.44967809319496155, "learning_rate": 0.000198539424030441, "loss": 0.2922, "step": 545 }, { "epoch": 0.11050394656952034, "grad_norm": 0.3962949812412262, "learning_rate": 0.00019853400265162883, "loss": 0.2769, "step": 546 }, { "epoch": 0.1107063347500506, "grad_norm": 0.388681560754776, "learning_rate": 0.00019852857130418019, "loss": 0.2319, "step": 547 }, { "epoch": 0.11090872293058085, "grad_norm": 0.5428779125213623, "learning_rate": 0.00019852312998864453, "loss": 0.2121, "step": 548 }, { "epoch": 0.1111111111111111, "grad_norm": 0.3685462176799774, "learning_rate": 0.00019851767870557237, "loss": 0.2442, "step": 549 }, { "epoch": 0.11131349929164137, "grad_norm": 0.4083040952682495, "learning_rate": 0.00019851221745551524, "loss": 0.2762, "step": 550 }, { "epoch": 0.11131349929164137, "eval_loss": 0.30158162117004395, "eval_runtime": 0.7395, "eval_samples_per_second": 6.761, "eval_steps_per_second": 1.352, "step": 550 }, { "epoch": 0.11151588747217163, "grad_norm": 0.43971139192581177, "learning_rate": 0.00019850674623902558, "loss": 0.2168, "step": 551 }, { "epoch": 0.11171827565270188, "grad_norm": 0.4038715958595276, "learning_rate": 0.00019850126505665698, "loss": 0.2518, "step": 552 }, { "epoch": 0.11192066383323214, "grad_norm": 0.5503129959106445, "learning_rate": 0.00019849577390896396, "loss": 0.2501, "step": 553 }, { "epoch": 0.1121230520137624, "grad_norm": 0.779593825340271, 
"learning_rate": 0.00019849027279650203, "loss": 0.294, "step": 554 }, { "epoch": 0.11232544019429265, "grad_norm": 0.29945191740989685, "learning_rate": 0.00019848476171982772, "loss": 0.2523, "step": 555 }, { "epoch": 0.1125278283748229, "grad_norm": 0.33953744173049927, "learning_rate": 0.00019847924067949862, "loss": 0.2657, "step": 556 }, { "epoch": 0.11273021655535317, "grad_norm": 0.4823399484157562, "learning_rate": 0.00019847370967607331, "loss": 0.2603, "step": 557 }, { "epoch": 0.11293260473588343, "grad_norm": 0.330797016620636, "learning_rate": 0.00019846816871011132, "loss": 0.2241, "step": 558 }, { "epoch": 0.11313499291641368, "grad_norm": 0.31722167134284973, "learning_rate": 0.0001984626177821732, "loss": 0.2284, "step": 559 }, { "epoch": 0.11333738109694394, "grad_norm": 0.3729718327522278, "learning_rate": 0.0001984570568928206, "loss": 0.3019, "step": 560 }, { "epoch": 0.1135397692774742, "grad_norm": 0.29263293743133545, "learning_rate": 0.00019845148604261605, "loss": 0.2791, "step": 561 }, { "epoch": 0.11374215745800445, "grad_norm": 0.6067195534706116, "learning_rate": 0.0001984459052321232, "loss": 0.2651, "step": 562 }, { "epoch": 0.1139445456385347, "grad_norm": 0.5350626111030579, "learning_rate": 0.00019844031446190666, "loss": 0.294, "step": 563 }, { "epoch": 0.11414693381906496, "grad_norm": 0.346068412065506, "learning_rate": 0.00019843471373253202, "loss": 0.2733, "step": 564 }, { "epoch": 0.11434932199959523, "grad_norm": 0.3487130105495453, "learning_rate": 0.00019842910304456587, "loss": 0.2378, "step": 565 }, { "epoch": 0.11455171018012549, "grad_norm": 0.24237936735153198, "learning_rate": 0.0001984234823985759, "loss": 0.2055, "step": 566 }, { "epoch": 0.11475409836065574, "grad_norm": 0.3237369656562805, "learning_rate": 0.00019841785179513073, "loss": 0.2398, "step": 567 }, { "epoch": 0.114956486541186, "grad_norm": 0.3716401755809784, "learning_rate": 0.0001984122112348, "loss": 0.2309, "step": 568 }, { "epoch": 
0.11515887472171625, "grad_norm": 0.5387085676193237, "learning_rate": 0.00019840656071815437, "loss": 0.2603, "step": 569 }, { "epoch": 0.1153612629022465, "grad_norm": 0.5174623131752014, "learning_rate": 0.0001984009002457655, "loss": 0.2898, "step": 570 }, { "epoch": 0.11556365108277676, "grad_norm": 0.3925536274909973, "learning_rate": 0.00019839522981820605, "loss": 0.2316, "step": 571 }, { "epoch": 0.11576603926330702, "grad_norm": 0.42717403173446655, "learning_rate": 0.0001983895494360497, "loss": 0.2339, "step": 572 }, { "epoch": 0.11596842744383729, "grad_norm": 0.3932379484176636, "learning_rate": 0.0001983838590998711, "loss": 0.2916, "step": 573 }, { "epoch": 0.11617081562436754, "grad_norm": 0.3760148584842682, "learning_rate": 0.000198378158810246, "loss": 0.2469, "step": 574 }, { "epoch": 0.1163732038048978, "grad_norm": 0.48632335662841797, "learning_rate": 0.00019837244856775102, "loss": 0.27, "step": 575 }, { "epoch": 0.11657559198542805, "grad_norm": 0.44857627153396606, "learning_rate": 0.00019836672837296394, "loss": 0.253, "step": 576 }, { "epoch": 0.1167779801659583, "grad_norm": 0.37322884798049927, "learning_rate": 0.00019836099822646342, "loss": 0.2575, "step": 577 }, { "epoch": 0.11698036834648856, "grad_norm": 0.3037974238395691, "learning_rate": 0.00019835525812882923, "loss": 0.2609, "step": 578 }, { "epoch": 0.11718275652701882, "grad_norm": 0.5397683382034302, "learning_rate": 0.000198349508080642, "loss": 0.2463, "step": 579 }, { "epoch": 0.11738514470754909, "grad_norm": 0.44430914521217346, "learning_rate": 0.00019834374808248351, "loss": 0.2565, "step": 580 }, { "epoch": 0.11758753288807934, "grad_norm": 0.7803641557693481, "learning_rate": 0.00019833797813493655, "loss": 0.2483, "step": 581 }, { "epoch": 0.1177899210686096, "grad_norm": 0.3826069235801697, "learning_rate": 0.00019833219823858477, "loss": 0.2909, "step": 582 }, { "epoch": 0.11799230924913985, "grad_norm": 0.3330939710140228, "learning_rate": 
0.00019832640839401297, "loss": 0.2606, "step": 583 }, { "epoch": 0.1181946974296701, "grad_norm": 0.35879385471343994, "learning_rate": 0.00019832060860180688, "loss": 0.2447, "step": 584 }, { "epoch": 0.11839708561020036, "grad_norm": 0.7940172553062439, "learning_rate": 0.0001983147988625533, "loss": 0.2826, "step": 585 }, { "epoch": 0.11859947379073062, "grad_norm": 0.4618555009365082, "learning_rate": 0.00019830897917683997, "loss": 0.2495, "step": 586 }, { "epoch": 0.11880186197126087, "grad_norm": 0.2834542691707611, "learning_rate": 0.0001983031495452557, "loss": 0.2505, "step": 587 }, { "epoch": 0.11900425015179114, "grad_norm": 0.40233033895492554, "learning_rate": 0.0001982973099683902, "loss": 0.2571, "step": 588 }, { "epoch": 0.1192066383323214, "grad_norm": 0.355094850063324, "learning_rate": 0.00019829146044683432, "loss": 0.2356, "step": 589 }, { "epoch": 0.11940902651285165, "grad_norm": 0.7586387991905212, "learning_rate": 0.0001982856009811798, "loss": 0.3068, "step": 590 }, { "epoch": 0.1196114146933819, "grad_norm": 0.54310542345047, "learning_rate": 0.0001982797315720195, "loss": 0.2868, "step": 591 }, { "epoch": 0.11981380287391216, "grad_norm": 0.42550796270370483, "learning_rate": 0.0001982738522199472, "loss": 0.3018, "step": 592 }, { "epoch": 0.12001619105444242, "grad_norm": 0.39309918880462646, "learning_rate": 0.0001982679629255577, "loss": 0.2296, "step": 593 }, { "epoch": 0.12021857923497267, "grad_norm": 0.3419604003429413, "learning_rate": 0.00019826206368944683, "loss": 0.2438, "step": 594 }, { "epoch": 0.12042096741550294, "grad_norm": 0.6050029397010803, "learning_rate": 0.0001982561545122114, "loss": 0.2923, "step": 595 }, { "epoch": 0.1206233555960332, "grad_norm": 0.3203630745410919, "learning_rate": 0.00019825023539444926, "loss": 0.2406, "step": 596 }, { "epoch": 0.12082574377656345, "grad_norm": 0.5872333645820618, "learning_rate": 0.00019824430633675922, "loss": 0.2742, "step": 597 }, { "epoch": 0.12102813195709371, 
"grad_norm": 0.3505767285823822, "learning_rate": 0.00019823836733974114, "loss": 0.2484, "step": 598 }, { "epoch": 0.12123052013762396, "grad_norm": 0.395964115858078, "learning_rate": 0.0001982324184039958, "loss": 0.2487, "step": 599 }, { "epoch": 0.12143290831815422, "grad_norm": 0.4352482259273529, "learning_rate": 0.00019822645953012518, "loss": 0.2336, "step": 600 }, { "epoch": 0.12143290831815422, "eval_loss": 0.3115096390247345, "eval_runtime": 0.7395, "eval_samples_per_second": 6.761, "eval_steps_per_second": 1.352, "step": 600 }, { "epoch": 0.12163529649868447, "grad_norm": 0.47118180990219116, "learning_rate": 0.00019822049071873204, "loss": 0.3171, "step": 601 }, { "epoch": 0.12183768467921473, "grad_norm": 0.3833599090576172, "learning_rate": 0.00019821451197042026, "loss": 0.2849, "step": 602 }, { "epoch": 0.122040072859745, "grad_norm": 0.3530329763889313, "learning_rate": 0.00019820852328579472, "loss": 0.2554, "step": 603 }, { "epoch": 0.12224246104027525, "grad_norm": 0.42997848987579346, "learning_rate": 0.0001982025246654613, "loss": 0.2597, "step": 604 }, { "epoch": 0.12244484922080551, "grad_norm": 0.3684461712837219, "learning_rate": 0.00019819651611002685, "loss": 0.2819, "step": 605 }, { "epoch": 0.12264723740133576, "grad_norm": 0.4009121060371399, "learning_rate": 0.00019819049762009926, "loss": 0.2501, "step": 606 }, { "epoch": 0.12284962558186602, "grad_norm": 0.4022650420665741, "learning_rate": 0.00019818446919628738, "loss": 0.3116, "step": 607 }, { "epoch": 0.12305201376239627, "grad_norm": 0.37116739153862, "learning_rate": 0.0001981784308392012, "loss": 0.2623, "step": 608 }, { "epoch": 0.12325440194292653, "grad_norm": 0.4383566975593567, "learning_rate": 0.00019817238254945157, "loss": 0.2417, "step": 609 }, { "epoch": 0.12345679012345678, "grad_norm": 0.4073963165283203, "learning_rate": 0.00019816632432765035, "loss": 0.2518, "step": 610 }, { "epoch": 0.12365917830398705, "grad_norm": 0.29820218682289124, "learning_rate": 
0.00019816025617441047, "loss": 0.2828, "step": 611 }, { "epoch": 0.12386156648451731, "grad_norm": 0.3478280305862427, "learning_rate": 0.00019815417809034588, "loss": 0.264, "step": 612 }, { "epoch": 0.12406395466504756, "grad_norm": 0.4512695074081421, "learning_rate": 0.00019814809007607148, "loss": 0.2657, "step": 613 }, { "epoch": 0.12426634284557782, "grad_norm": 0.3232296407222748, "learning_rate": 0.00019814199213220317, "loss": 0.241, "step": 614 }, { "epoch": 0.12446873102610807, "grad_norm": 0.4013686776161194, "learning_rate": 0.00019813588425935787, "loss": 0.2306, "step": 615 }, { "epoch": 0.12467111920663833, "grad_norm": 0.4323452115058899, "learning_rate": 0.00019812976645815358, "loss": 0.2774, "step": 616 }, { "epoch": 0.12487350738716858, "grad_norm": 0.38302597403526306, "learning_rate": 0.00019812363872920915, "loss": 0.2361, "step": 617 }, { "epoch": 0.12507589556769885, "grad_norm": 0.31080150604248047, "learning_rate": 0.00019811750107314455, "loss": 0.2229, "step": 618 }, { "epoch": 0.1252782837482291, "grad_norm": 0.39039042592048645, "learning_rate": 0.00019811135349058072, "loss": 0.2586, "step": 619 }, { "epoch": 0.12548067192875936, "grad_norm": 0.43987491726875305, "learning_rate": 0.0001981051959821396, "loss": 0.2318, "step": 620 }, { "epoch": 0.12568306010928962, "grad_norm": 0.6152598857879639, "learning_rate": 0.0001980990285484442, "loss": 0.2319, "step": 621 }, { "epoch": 0.12588544828981987, "grad_norm": 0.40612903237342834, "learning_rate": 0.00019809285119011842, "loss": 0.3145, "step": 622 }, { "epoch": 0.12608783647035013, "grad_norm": 0.3982747793197632, "learning_rate": 0.00019808666390778724, "loss": 0.2536, "step": 623 }, { "epoch": 0.12629022465088038, "grad_norm": 0.28268828988075256, "learning_rate": 0.0001980804667020766, "loss": 0.2054, "step": 624 }, { "epoch": 0.12649261283141064, "grad_norm": 0.3771643042564392, "learning_rate": 0.0001980742595736135, "loss": 0.2429, "step": 625 }, { "epoch": 
0.1266950010119409, "grad_norm": 0.45596668124198914, "learning_rate": 0.00019806804252302587, "loss": 0.2702, "step": 626 }, { "epoch": 0.12689738919247115, "grad_norm": 0.34389880299568176, "learning_rate": 0.00019806181555094275, "loss": 0.2403, "step": 627 }, { "epoch": 0.1270997773730014, "grad_norm": 0.4212355613708496, "learning_rate": 0.00019805557865799407, "loss": 0.248, "step": 628 }, { "epoch": 0.1273021655535317, "grad_norm": 0.36532604694366455, "learning_rate": 0.0001980493318448108, "loss": 0.2886, "step": 629 }, { "epoch": 0.12750455373406194, "grad_norm": 0.3870829939842224, "learning_rate": 0.00019804307511202499, "loss": 0.2781, "step": 630 }, { "epoch": 0.1277069419145922, "grad_norm": 0.2700032889842987, "learning_rate": 0.00019803680846026958, "loss": 0.2415, "step": 631 }, { "epoch": 0.12790933009512245, "grad_norm": 0.3396844267845154, "learning_rate": 0.00019803053189017858, "loss": 0.2563, "step": 632 }, { "epoch": 0.1281117182756527, "grad_norm": 0.3511733114719391, "learning_rate": 0.00019802424540238698, "loss": 0.2378, "step": 633 }, { "epoch": 0.12831410645618296, "grad_norm": 0.3880968689918518, "learning_rate": 0.0001980179489975308, "loss": 0.2632, "step": 634 }, { "epoch": 0.12851649463671322, "grad_norm": 0.5152776837348938, "learning_rate": 0.00019801164267624702, "loss": 0.2662, "step": 635 }, { "epoch": 0.12871888281724347, "grad_norm": 0.5050997138023376, "learning_rate": 0.00019800532643917364, "loss": 0.2408, "step": 636 }, { "epoch": 0.12892127099777373, "grad_norm": 0.38669291138648987, "learning_rate": 0.00019799900028694974, "loss": 0.2369, "step": 637 }, { "epoch": 0.12912365917830398, "grad_norm": 0.32681190967559814, "learning_rate": 0.00019799266422021523, "loss": 0.2198, "step": 638 }, { "epoch": 0.12932604735883424, "grad_norm": 0.35121649503707886, "learning_rate": 0.0001979863182396112, "loss": 0.2482, "step": 639 }, { "epoch": 0.1295284355393645, "grad_norm": 0.3443315923213959, "learning_rate": 
0.00019797996234577968, "loss": 0.2369, "step": 640 }, { "epoch": 0.12973082371989475, "grad_norm": 0.5270746946334839, "learning_rate": 0.00019797359653936364, "loss": 0.2847, "step": 641 }, { "epoch": 0.129933211900425, "grad_norm": 0.3135738968849182, "learning_rate": 0.0001979672208210071, "loss": 0.2449, "step": 642 }, { "epoch": 0.13013560008095526, "grad_norm": 0.4503747522830963, "learning_rate": 0.00019796083519135516, "loss": 0.2765, "step": 643 }, { "epoch": 0.13033798826148552, "grad_norm": 0.4427570700645447, "learning_rate": 0.00019795443965105376, "loss": 0.2366, "step": 644 }, { "epoch": 0.1305403764420158, "grad_norm": 0.42543524503707886, "learning_rate": 0.00019794803420075002, "loss": 0.2786, "step": 645 }, { "epoch": 0.13074276462254605, "grad_norm": 0.3327653408050537, "learning_rate": 0.0001979416188410919, "loss": 0.229, "step": 646 }, { "epoch": 0.1309451528030763, "grad_norm": 0.396659791469574, "learning_rate": 0.0001979351935727285, "loss": 0.2764, "step": 647 }, { "epoch": 0.13114754098360656, "grad_norm": 0.4171930253505707, "learning_rate": 0.00019792875839630986, "loss": 0.2593, "step": 648 }, { "epoch": 0.13134992916413682, "grad_norm": 0.3459969162940979, "learning_rate": 0.00019792231331248696, "loss": 0.2561, "step": 649 }, { "epoch": 0.13155231734466707, "grad_norm": 0.3193213641643524, "learning_rate": 0.0001979158583219119, "loss": 0.2353, "step": 650 }, { "epoch": 0.13155231734466707, "eval_loss": 0.2984254062175751, "eval_runtime": 0.7375, "eval_samples_per_second": 6.779, "eval_steps_per_second": 1.356, "step": 650 }, { "epoch": 0.13175470552519733, "grad_norm": 0.35675105452537537, "learning_rate": 0.00019790939342523772, "loss": 0.2582, "step": 651 }, { "epoch": 0.13195709370572759, "grad_norm": 0.4250843822956085, "learning_rate": 0.00019790291862311845, "loss": 0.2315, "step": 652 }, { "epoch": 0.13215948188625784, "grad_norm": 0.35945165157318115, "learning_rate": 0.00019789643391620917, "loss": 0.2649, "step": 653 }, 
{ "epoch": 0.1323618700667881, "grad_norm": 0.47871312499046326, "learning_rate": 0.0001978899393051659, "loss": 0.2292, "step": 654 }, { "epoch": 0.13256425824731835, "grad_norm": 0.3194306492805481, "learning_rate": 0.00019788343479064575, "loss": 0.2355, "step": 655 }, { "epoch": 0.1327666464278486, "grad_norm": 0.4799971878528595, "learning_rate": 0.00019787692037330674, "loss": 0.2406, "step": 656 }, { "epoch": 0.13296903460837886, "grad_norm": 0.3283880352973938, "learning_rate": 0.00019787039605380791, "loss": 0.2594, "step": 657 }, { "epoch": 0.13317142278890912, "grad_norm": 0.8230167031288147, "learning_rate": 0.00019786386183280938, "loss": 0.2599, "step": 658 }, { "epoch": 0.13337381096943937, "grad_norm": 0.4739612936973572, "learning_rate": 0.0001978573177109722, "loss": 0.2229, "step": 659 }, { "epoch": 0.13357619914996965, "grad_norm": 0.3426133692264557, "learning_rate": 0.00019785076368895838, "loss": 0.2493, "step": 660 }, { "epoch": 0.1337785873304999, "grad_norm": 0.3582127094268799, "learning_rate": 0.00019784419976743106, "loss": 0.2569, "step": 661 }, { "epoch": 0.13398097551103016, "grad_norm": 0.4506543278694153, "learning_rate": 0.00019783762594705425, "loss": 0.2642, "step": 662 }, { "epoch": 0.13418336369156042, "grad_norm": 0.3988342881202698, "learning_rate": 0.00019783104222849304, "loss": 0.2934, "step": 663 }, { "epoch": 0.13438575187209068, "grad_norm": 0.5457602739334106, "learning_rate": 0.0001978244486124135, "loss": 0.2505, "step": 664 }, { "epoch": 0.13458814005262093, "grad_norm": 0.35253384709358215, "learning_rate": 0.00019781784509948275, "loss": 0.2468, "step": 665 }, { "epoch": 0.13479052823315119, "grad_norm": 0.41637489199638367, "learning_rate": 0.00019781123169036882, "loss": 0.246, "step": 666 }, { "epoch": 0.13499291641368144, "grad_norm": 0.3531467616558075, "learning_rate": 0.00019780460838574076, "loss": 0.2765, "step": 667 }, { "epoch": 0.1351953045942117, "grad_norm": 0.3421364724636078, "learning_rate": 
0.00019779797518626867, "loss": 0.2719, "step": 668 }, { "epoch": 0.13539769277474195, "grad_norm": 0.42127346992492676, "learning_rate": 0.00019779133209262367, "loss": 0.2805, "step": 669 }, { "epoch": 0.1356000809552722, "grad_norm": 0.3892177641391754, "learning_rate": 0.00019778467910547777, "loss": 0.2316, "step": 670 }, { "epoch": 0.13580246913580246, "grad_norm": 0.383327841758728, "learning_rate": 0.00019777801622550408, "loss": 0.2751, "step": 671 }, { "epoch": 0.13600485731633272, "grad_norm": 0.3991505205631256, "learning_rate": 0.0001977713434533767, "loss": 0.27, "step": 672 }, { "epoch": 0.13620724549686297, "grad_norm": 0.44599419832229614, "learning_rate": 0.0001977646607897707, "loss": 0.2381, "step": 673 }, { "epoch": 0.13640963367739323, "grad_norm": 0.3119847774505615, "learning_rate": 0.00019775796823536212, "loss": 0.2381, "step": 674 }, { "epoch": 0.1366120218579235, "grad_norm": 0.4660552144050598, "learning_rate": 0.0001977512657908281, "loss": 0.26, "step": 675 }, { "epoch": 0.13681441003845377, "grad_norm": 0.3962903916835785, "learning_rate": 0.0001977445534568467, "loss": 0.2909, "step": 676 }, { "epoch": 0.13701679821898402, "grad_norm": 0.34353816509246826, "learning_rate": 0.00019773783123409698, "loss": 0.2648, "step": 677 }, { "epoch": 0.13721918639951428, "grad_norm": 0.31514930725097656, "learning_rate": 0.0001977310991232591, "loss": 0.2701, "step": 678 }, { "epoch": 0.13742157458004453, "grad_norm": 0.4266648292541504, "learning_rate": 0.00019772435712501406, "loss": 0.3107, "step": 679 }, { "epoch": 0.1376239627605748, "grad_norm": 0.5088281631469727, "learning_rate": 0.00019771760524004396, "loss": 0.3094, "step": 680 }, { "epoch": 0.13782635094110504, "grad_norm": 0.358623743057251, "learning_rate": 0.00019771084346903193, "loss": 0.2786, "step": 681 }, { "epoch": 0.1380287391216353, "grad_norm": 0.5978558659553528, "learning_rate": 0.000197704071812662, "loss": 0.2459, "step": 682 }, { "epoch": 0.13823112730216555, 
"grad_norm": 0.3651149272918701, "learning_rate": 0.00019769729027161928, "loss": 0.2472, "step": 683 }, { "epoch": 0.1384335154826958, "grad_norm": 0.3216381371021271, "learning_rate": 0.0001976904988465899, "loss": 0.2599, "step": 684 }, { "epoch": 0.13863590366322606, "grad_norm": 0.35924655199050903, "learning_rate": 0.00019768369753826086, "loss": 0.2816, "step": 685 }, { "epoch": 0.13883829184375632, "grad_norm": 0.3398737609386444, "learning_rate": 0.00019767688634732026, "loss": 0.2536, "step": 686 }, { "epoch": 0.13904068002428657, "grad_norm": 0.3374193012714386, "learning_rate": 0.00019767006527445728, "loss": 0.2369, "step": 687 }, { "epoch": 0.13924306820481683, "grad_norm": 0.3065876364707947, "learning_rate": 0.00019766323432036188, "loss": 0.2748, "step": 688 }, { "epoch": 0.13944545638534708, "grad_norm": 0.48267465829849243, "learning_rate": 0.00019765639348572525, "loss": 0.2428, "step": 689 }, { "epoch": 0.13964784456587737, "grad_norm": 0.33604946732521057, "learning_rate": 0.0001976495427712394, "loss": 0.3006, "step": 690 }, { "epoch": 0.13985023274640762, "grad_norm": 0.31897780299186707, "learning_rate": 0.00019764268217759744, "loss": 0.2808, "step": 691 }, { "epoch": 0.14005262092693788, "grad_norm": 0.3280028700828552, "learning_rate": 0.00019763581170549342, "loss": 0.2503, "step": 692 }, { "epoch": 0.14025500910746813, "grad_norm": 0.2858153283596039, "learning_rate": 0.0001976289313556225, "loss": 0.2548, "step": 693 }, { "epoch": 0.1404573972879984, "grad_norm": 0.2821749746799469, "learning_rate": 0.00019762204112868067, "loss": 0.2294, "step": 694 }, { "epoch": 0.14065978546852864, "grad_norm": 0.33796799182891846, "learning_rate": 0.0001976151410253651, "loss": 0.2496, "step": 695 }, { "epoch": 0.1408621736490589, "grad_norm": 0.44861510396003723, "learning_rate": 0.0001976082310463738, "loss": 0.255, "step": 696 }, { "epoch": 0.14106456182958915, "grad_norm": 0.3637007176876068, "learning_rate": 0.00019760131119240585, "loss": 
0.2256, "step": 697 }, { "epoch": 0.1412669500101194, "grad_norm": 0.5837531089782715, "learning_rate": 0.00019759438146416138, "loss": 0.2494, "step": 698 }, { "epoch": 0.14146933819064966, "grad_norm": 0.36459869146347046, "learning_rate": 0.00019758744186234147, "loss": 0.3114, "step": 699 }, { "epoch": 0.14167172637117992, "grad_norm": 0.2742053270339966, "learning_rate": 0.00019758049238764814, "loss": 0.2047, "step": 700 }, { "epoch": 0.14167172637117992, "eval_loss": 0.2933715283870697, "eval_runtime": 0.7397, "eval_samples_per_second": 6.759, "eval_steps_per_second": 1.352, "step": 700 }, { "epoch": 0.14187411455171017, "grad_norm": 0.42038586735725403, "learning_rate": 0.00019757353304078446, "loss": 0.2474, "step": 701 }, { "epoch": 0.14207650273224043, "grad_norm": 0.5313589572906494, "learning_rate": 0.00019756656382245456, "loss": 0.2622, "step": 702 }, { "epoch": 0.14227889091277068, "grad_norm": 0.3278883695602417, "learning_rate": 0.0001975595847333635, "loss": 0.2398, "step": 703 }, { "epoch": 0.14248127909330094, "grad_norm": 0.39546725153923035, "learning_rate": 0.00019755259577421732, "loss": 0.2423, "step": 704 }, { "epoch": 0.14268366727383122, "grad_norm": 0.33064937591552734, "learning_rate": 0.0001975455969457231, "loss": 0.2816, "step": 705 }, { "epoch": 0.14288605545436148, "grad_norm": 0.2638942003250122, "learning_rate": 0.00019753858824858895, "loss": 0.2267, "step": 706 }, { "epoch": 0.14308844363489173, "grad_norm": 0.408956378698349, "learning_rate": 0.00019753156968352385, "loss": 0.2375, "step": 707 }, { "epoch": 0.143290831815422, "grad_norm": 0.4136710464954376, "learning_rate": 0.00019752454125123795, "loss": 0.276, "step": 708 }, { "epoch": 0.14349321999595224, "grad_norm": 0.27548322081565857, "learning_rate": 0.00019751750295244228, "loss": 0.2297, "step": 709 }, { "epoch": 0.1436956081764825, "grad_norm": 0.31546905636787415, "learning_rate": 0.0001975104547878489, "loss": 0.2758, "step": 710 }, { "epoch": 
0.14389799635701275, "grad_norm": 0.2947577238082886, "learning_rate": 0.00019750339675817086, "loss": 0.2268, "step": 711 }, { "epoch": 0.144100384537543, "grad_norm": 0.4339549243450165, "learning_rate": 0.00019749632886412223, "loss": 0.2353, "step": 712 }, { "epoch": 0.14430277271807326, "grad_norm": 0.2924511432647705, "learning_rate": 0.00019748925110641807, "loss": 0.2333, "step": 713 }, { "epoch": 0.14450516089860352, "grad_norm": 0.31660690903663635, "learning_rate": 0.00019748216348577442, "loss": 0.2287, "step": 714 }, { "epoch": 0.14470754907913377, "grad_norm": 0.3289544880390167, "learning_rate": 0.00019747506600290834, "loss": 0.2926, "step": 715 }, { "epoch": 0.14490993725966403, "grad_norm": 0.3553188443183899, "learning_rate": 0.0001974679586585379, "loss": 0.2954, "step": 716 }, { "epoch": 0.14511232544019428, "grad_norm": 0.47059759497642517, "learning_rate": 0.0001974608414533821, "loss": 0.2394, "step": 717 }, { "epoch": 0.14531471362072454, "grad_norm": 0.3676668703556061, "learning_rate": 0.000197453714388161, "loss": 0.234, "step": 718 }, { "epoch": 0.1455171018012548, "grad_norm": 0.31340935826301575, "learning_rate": 0.00019744657746359562, "loss": 0.2305, "step": 719 }, { "epoch": 0.14571948998178508, "grad_norm": 0.3455619215965271, "learning_rate": 0.00019743943068040808, "loss": 0.2689, "step": 720 }, { "epoch": 0.14592187816231533, "grad_norm": 0.4046059250831604, "learning_rate": 0.00019743227403932134, "loss": 0.2544, "step": 721 }, { "epoch": 0.1461242663428456, "grad_norm": 0.46637916564941406, "learning_rate": 0.00019742510754105946, "loss": 0.2285, "step": 722 }, { "epoch": 0.14632665452337584, "grad_norm": 0.3575298488140106, "learning_rate": 0.00019741793118634748, "loss": 0.2497, "step": 723 }, { "epoch": 0.1465290427039061, "grad_norm": 0.38678669929504395, "learning_rate": 0.0001974107449759114, "loss": 0.2577, "step": 724 }, { "epoch": 0.14673143088443635, "grad_norm": 0.28437402844429016, "learning_rate": 
0.00019740354891047826, "loss": 0.2304, "step": 725 }, { "epoch": 0.1469338190649666, "grad_norm": 0.31594318151474, "learning_rate": 0.00019739634299077613, "loss": 0.2755, "step": 726 }, { "epoch": 0.14713620724549686, "grad_norm": 0.2776789963245392, "learning_rate": 0.00019738912721753394, "loss": 0.2377, "step": 727 }, { "epoch": 0.14733859542602712, "grad_norm": 0.30711445212364197, "learning_rate": 0.00019738190159148178, "loss": 0.2254, "step": 728 }, { "epoch": 0.14754098360655737, "grad_norm": 0.3012600839138031, "learning_rate": 0.0001973746661133506, "loss": 0.2529, "step": 729 }, { "epoch": 0.14774337178708763, "grad_norm": 0.33163461089134216, "learning_rate": 0.00019736742078387245, "loss": 0.258, "step": 730 }, { "epoch": 0.14794575996761788, "grad_norm": 0.26741090416908264, "learning_rate": 0.00019736016560378036, "loss": 0.2405, "step": 731 }, { "epoch": 0.14814814814814814, "grad_norm": 0.4081975519657135, "learning_rate": 0.00019735290057380827, "loss": 0.2999, "step": 732 }, { "epoch": 0.1483505363286784, "grad_norm": 0.3062702715396881, "learning_rate": 0.00019734562569469124, "loss": 0.2618, "step": 733 }, { "epoch": 0.14855292450920865, "grad_norm": 0.28259265422821045, "learning_rate": 0.0001973383409671652, "loss": 0.2309, "step": 734 }, { "epoch": 0.1487553126897389, "grad_norm": 0.25090116262435913, "learning_rate": 0.00019733104639196722, "loss": 0.248, "step": 735 }, { "epoch": 0.1489577008702692, "grad_norm": 0.3727152347564697, "learning_rate": 0.00019732374196983522, "loss": 0.2266, "step": 736 }, { "epoch": 0.14916008905079944, "grad_norm": 0.4587722718715668, "learning_rate": 0.00019731642770150822, "loss": 0.2854, "step": 737 }, { "epoch": 0.1493624772313297, "grad_norm": 0.5172100067138672, "learning_rate": 0.0001973091035877262, "loss": 0.2573, "step": 738 }, { "epoch": 0.14956486541185995, "grad_norm": 0.3253283202648163, "learning_rate": 0.00019730176962923015, "loss": 0.2639, "step": 739 }, { "epoch": 0.1497672535923902, 
"grad_norm": 0.4492495357990265, "learning_rate": 0.00019729442582676202, "loss": 0.2732, "step": 740 }, { "epoch": 0.14996964177292046, "grad_norm": 0.34648340940475464, "learning_rate": 0.00019728707218106477, "loss": 0.2595, "step": 741 }, { "epoch": 0.15017202995345072, "grad_norm": 0.3878565728664398, "learning_rate": 0.00019727970869288237, "loss": 0.3247, "step": 742 }, { "epoch": 0.15037441813398097, "grad_norm": 0.2799864113330841, "learning_rate": 0.0001972723353629598, "loss": 0.2582, "step": 743 }, { "epoch": 0.15057680631451123, "grad_norm": 0.3522235155105591, "learning_rate": 0.000197264952192043, "loss": 0.2187, "step": 744 }, { "epoch": 0.15077919449504149, "grad_norm": 0.3626643717288971, "learning_rate": 0.00019725755918087893, "loss": 0.2197, "step": 745 }, { "epoch": 0.15098158267557174, "grad_norm": 0.304533988237381, "learning_rate": 0.00019725015633021553, "loss": 0.2334, "step": 746 }, { "epoch": 0.151183970856102, "grad_norm": 0.4297725260257721, "learning_rate": 0.00019724274364080175, "loss": 0.2646, "step": 747 }, { "epoch": 0.15138635903663225, "grad_norm": 0.4226777255535126, "learning_rate": 0.00019723532111338754, "loss": 0.2325, "step": 748 }, { "epoch": 0.1515887472171625, "grad_norm": 0.3075390160083771, "learning_rate": 0.00019722788874872377, "loss": 0.2664, "step": 749 }, { "epoch": 0.15179113539769276, "grad_norm": 0.3131004273891449, "learning_rate": 0.00019722044654756248, "loss": 0.2607, "step": 750 }, { "epoch": 0.15179113539769276, "eval_loss": 0.28550732135772705, "eval_runtime": 0.7363, "eval_samples_per_second": 6.791, "eval_steps_per_second": 1.358, "step": 750 }, { "epoch": 0.15199352357822304, "grad_norm": 0.34832969307899475, "learning_rate": 0.0001972129945106565, "loss": 0.2407, "step": 751 }, { "epoch": 0.1521959117587533, "grad_norm": 0.33916226029396057, "learning_rate": 0.00019720553263875978, "loss": 0.2806, "step": 752 }, { "epoch": 0.15239829993928355, "grad_norm": 0.40339797735214233, "learning_rate": 
0.00019719806093262726, "loss": 0.3036, "step": 753 }, { "epoch": 0.1526006881198138, "grad_norm": 0.259329617023468, "learning_rate": 0.00019719057939301477, "loss": 0.25, "step": 754 }, { "epoch": 0.15280307630034407, "grad_norm": 0.351357638835907, "learning_rate": 0.0001971830880206793, "loss": 0.2487, "step": 755 }, { "epoch": 0.15300546448087432, "grad_norm": 0.41735726594924927, "learning_rate": 0.0001971755868163787, "loss": 0.2562, "step": 756 }, { "epoch": 0.15320785266140458, "grad_norm": 0.3808186948299408, "learning_rate": 0.0001971680757808719, "loss": 0.3059, "step": 757 }, { "epoch": 0.15341024084193483, "grad_norm": 0.2662958800792694, "learning_rate": 0.00019716055491491875, "loss": 0.2254, "step": 758 }, { "epoch": 0.15361262902246509, "grad_norm": 0.30748385190963745, "learning_rate": 0.00019715302421928013, "loss": 0.2595, "step": 759 }, { "epoch": 0.15381501720299534, "grad_norm": 0.3423989415168762, "learning_rate": 0.00019714548369471796, "loss": 0.2642, "step": 760 }, { "epoch": 0.1540174053835256, "grad_norm": 0.35559195280075073, "learning_rate": 0.00019713793334199511, "loss": 0.2546, "step": 761 }, { "epoch": 0.15421979356405585, "grad_norm": 0.29996034502983093, "learning_rate": 0.00019713037316187537, "loss": 0.237, "step": 762 }, { "epoch": 0.1544221817445861, "grad_norm": 0.2678242325782776, "learning_rate": 0.0001971228031551237, "loss": 0.2591, "step": 763 }, { "epoch": 0.15462456992511636, "grad_norm": 0.3217318058013916, "learning_rate": 0.00019711522332250585, "loss": 0.2598, "step": 764 }, { "epoch": 0.15482695810564662, "grad_norm": 0.31618547439575195, "learning_rate": 0.00019710763366478874, "loss": 0.2455, "step": 765 }, { "epoch": 0.1550293462861769, "grad_norm": 0.28240564465522766, "learning_rate": 0.0001971000341827402, "loss": 0.2189, "step": 766 }, { "epoch": 0.15523173446670716, "grad_norm": 0.3480214774608612, "learning_rate": 0.0001970924248771291, "loss": 0.2573, "step": 767 }, { "epoch": 0.1554341226472374, 
"grad_norm": 0.3173188865184784, "learning_rate": 0.00019708480574872516, "loss": 0.259, "step": 768 }, { "epoch": 0.15563651082776767, "grad_norm": 0.4622735381126404, "learning_rate": 0.00019707717679829935, "loss": 0.2777, "step": 769 }, { "epoch": 0.15583889900829792, "grad_norm": 0.583462655544281, "learning_rate": 0.00019706953802662333, "loss": 0.2666, "step": 770 }, { "epoch": 0.15604128718882818, "grad_norm": 0.371926873922348, "learning_rate": 0.00019706188943447006, "loss": 0.2508, "step": 771 }, { "epoch": 0.15624367536935843, "grad_norm": 0.2995915710926056, "learning_rate": 0.00019705423102261326, "loss": 0.2464, "step": 772 }, { "epoch": 0.1564460635498887, "grad_norm": 0.35817331075668335, "learning_rate": 0.0001970465627918277, "loss": 0.2718, "step": 773 }, { "epoch": 0.15664845173041894, "grad_norm": 0.32463064789772034, "learning_rate": 0.00019703888474288924, "loss": 0.2746, "step": 774 }, { "epoch": 0.1568508399109492, "grad_norm": 0.3261243402957916, "learning_rate": 0.00019703119687657466, "loss": 0.2844, "step": 775 }, { "epoch": 0.15705322809147945, "grad_norm": 0.39310258626937866, "learning_rate": 0.0001970234991936617, "loss": 0.2369, "step": 776 }, { "epoch": 0.1572556162720097, "grad_norm": 0.22917023301124573, "learning_rate": 0.00019701579169492916, "loss": 0.2154, "step": 777 }, { "epoch": 0.15745800445253996, "grad_norm": 0.22985297441482544, "learning_rate": 0.0001970080743811568, "loss": 0.2075, "step": 778 }, { "epoch": 0.15766039263307022, "grad_norm": 0.3643643260002136, "learning_rate": 0.0001970003472531253, "loss": 0.2588, "step": 779 }, { "epoch": 0.15786278081360047, "grad_norm": 0.3869781494140625, "learning_rate": 0.00019699261031161657, "loss": 0.2537, "step": 780 }, { "epoch": 0.15806516899413076, "grad_norm": 0.4232870638370514, "learning_rate": 0.0001969848635574132, "loss": 0.2638, "step": 781 }, { "epoch": 0.158267557174661, "grad_norm": 0.3287450969219208, "learning_rate": 0.000196977106991299, "loss": 0.2503, 
"step": 782 }, { "epoch": 0.15846994535519127, "grad_norm": 0.34829604625701904, "learning_rate": 0.0001969693406140587, "loss": 0.2564, "step": 783 }, { "epoch": 0.15867233353572152, "grad_norm": 0.31483200192451477, "learning_rate": 0.00019696156442647797, "loss": 0.2465, "step": 784 }, { "epoch": 0.15887472171625178, "grad_norm": 0.41794779896736145, "learning_rate": 0.0001969537784293436, "loss": 0.2569, "step": 785 }, { "epoch": 0.15907710989678203, "grad_norm": 0.3754033148288727, "learning_rate": 0.00019694598262344322, "loss": 0.2626, "step": 786 }, { "epoch": 0.1592794980773123, "grad_norm": 0.39740636944770813, "learning_rate": 0.00019693817700956555, "loss": 0.2436, "step": 787 }, { "epoch": 0.15948188625784254, "grad_norm": 0.34521111845970154, "learning_rate": 0.00019693036158850033, "loss": 0.2442, "step": 788 }, { "epoch": 0.1596842744383728, "grad_norm": 0.5534895658493042, "learning_rate": 0.0001969225363610382, "loss": 0.2119, "step": 789 }, { "epoch": 0.15988666261890305, "grad_norm": 0.42546847462654114, "learning_rate": 0.00019691470132797081, "loss": 0.2589, "step": 790 }, { "epoch": 0.1600890507994333, "grad_norm": 0.31263595819473267, "learning_rate": 0.00019690685649009087, "loss": 0.237, "step": 791 }, { "epoch": 0.16029143897996356, "grad_norm": 0.43939104676246643, "learning_rate": 0.00019689900184819204, "loss": 0.2212, "step": 792 }, { "epoch": 0.16049382716049382, "grad_norm": 0.3826169967651367, "learning_rate": 0.0001968911374030689, "loss": 0.2643, "step": 793 }, { "epoch": 0.16069621534102407, "grad_norm": 0.32133644819259644, "learning_rate": 0.0001968832631555172, "loss": 0.2412, "step": 794 }, { "epoch": 0.16089860352155433, "grad_norm": 0.31465601921081543, "learning_rate": 0.00019687537910633349, "loss": 0.2176, "step": 795 }, { "epoch": 0.1611009917020846, "grad_norm": 0.26472586393356323, "learning_rate": 0.00019686748525631545, "loss": 0.2301, "step": 796 }, { "epoch": 0.16130337988261487, "grad_norm": 0.33633241057395935, 
"learning_rate": 0.00019685958160626163, "loss": 0.271, "step": 797 }, { "epoch": 0.16150576806314512, "grad_norm": 0.30255284905433655, "learning_rate": 0.0001968516681569717, "loss": 0.23, "step": 798 }, { "epoch": 0.16170815624367538, "grad_norm": 0.3203800618648529, "learning_rate": 0.00019684374490924625, "loss": 0.2417, "step": 799 }, { "epoch": 0.16191054442420563, "grad_norm": 0.32507458329200745, "learning_rate": 0.00019683581186388685, "loss": 0.2584, "step": 800 }, { "epoch": 0.16191054442420563, "eval_loss": 0.2888225317001343, "eval_runtime": 0.7379, "eval_samples_per_second": 6.776, "eval_steps_per_second": 1.355, "step": 800 }, { "epoch": 0.1621129326047359, "grad_norm": 0.3781580924987793, "learning_rate": 0.00019682786902169608, "loss": 0.2743, "step": 801 }, { "epoch": 0.16231532078526614, "grad_norm": 0.32408541440963745, "learning_rate": 0.00019681991638347755, "loss": 0.2528, "step": 802 }, { "epoch": 0.1625177089657964, "grad_norm": 0.30190667510032654, "learning_rate": 0.00019681195395003577, "loss": 0.2434, "step": 803 }, { "epoch": 0.16272009714632665, "grad_norm": 0.34094536304473877, "learning_rate": 0.00019680398172217635, "loss": 0.2479, "step": 804 }, { "epoch": 0.1629224853268569, "grad_norm": 0.45574498176574707, "learning_rate": 0.00019679599970070578, "loss": 0.2652, "step": 805 }, { "epoch": 0.16312487350738716, "grad_norm": 0.2918074429035187, "learning_rate": 0.00019678800788643167, "loss": 0.2581, "step": 806 }, { "epoch": 0.16332726168791742, "grad_norm": 0.42664429545402527, "learning_rate": 0.00019678000628016248, "loss": 0.2459, "step": 807 }, { "epoch": 0.16352964986844767, "grad_norm": 0.32714366912841797, "learning_rate": 0.00019677199488270778, "loss": 0.2543, "step": 808 }, { "epoch": 0.16373203804897793, "grad_norm": 0.4104800522327423, "learning_rate": 0.00019676397369487804, "loss": 0.2388, "step": 809 }, { "epoch": 0.16393442622950818, "grad_norm": 0.4657924473285675, "learning_rate": 0.0001967559427174848, "loss": 
0.2687, "step": 810 }, { "epoch": 0.16413681441003847, "grad_norm": 0.39114388823509216, "learning_rate": 0.00019674790195134048, "loss": 0.2364, "step": 811 }, { "epoch": 0.16433920259056872, "grad_norm": 0.40501561760902405, "learning_rate": 0.00019673985139725863, "loss": 0.2602, "step": 812 }, { "epoch": 0.16454159077109898, "grad_norm": 0.2732917368412018, "learning_rate": 0.00019673179105605368, "loss": 0.2295, "step": 813 }, { "epoch": 0.16474397895162923, "grad_norm": 0.5099993944168091, "learning_rate": 0.00019672372092854111, "loss": 0.282, "step": 814 }, { "epoch": 0.1649463671321595, "grad_norm": 0.31335383653640747, "learning_rate": 0.0001967156410155374, "loss": 0.2502, "step": 815 }, { "epoch": 0.16514875531268974, "grad_norm": 0.2994532585144043, "learning_rate": 0.00019670755131785992, "loss": 0.2277, "step": 816 }, { "epoch": 0.16535114349322, "grad_norm": 0.425386905670166, "learning_rate": 0.00019669945183632716, "loss": 0.257, "step": 817 }, { "epoch": 0.16555353167375025, "grad_norm": 0.27471521496772766, "learning_rate": 0.0001966913425717585, "loss": 0.249, "step": 818 }, { "epoch": 0.1657559198542805, "grad_norm": 0.39158228039741516, "learning_rate": 0.0001966832235249744, "loss": 0.2765, "step": 819 }, { "epoch": 0.16595830803481076, "grad_norm": 0.3115609884262085, "learning_rate": 0.0001966750946967962, "loss": 0.2527, "step": 820 }, { "epoch": 0.16616069621534102, "grad_norm": 0.3412054777145386, "learning_rate": 0.00019666695608804632, "loss": 0.2534, "step": 821 }, { "epoch": 0.16636308439587127, "grad_norm": 0.2714287042617798, "learning_rate": 0.00019665880769954814, "loss": 0.227, "step": 822 }, { "epoch": 0.16656547257640153, "grad_norm": 0.33095890283584595, "learning_rate": 0.00019665064953212604, "loss": 0.2628, "step": 823 }, { "epoch": 0.16676786075693179, "grad_norm": 0.4704902470111847, "learning_rate": 0.00019664248158660533, "loss": 0.2982, "step": 824 }, { "epoch": 0.16697024893746204, "grad_norm": 0.32132309675216675, 
"learning_rate": 0.00019663430386381242, "loss": 0.2643, "step": 825 }, { "epoch": 0.1671726371179923, "grad_norm": 0.3183353543281555, "learning_rate": 0.00019662611636457462, "loss": 0.2679, "step": 826 }, { "epoch": 0.16737502529852258, "grad_norm": 0.3988979756832123, "learning_rate": 0.00019661791908972024, "loss": 0.2103, "step": 827 }, { "epoch": 0.16757741347905283, "grad_norm": 0.3745168149471283, "learning_rate": 0.00019660971204007863, "loss": 0.253, "step": 828 }, { "epoch": 0.1677798016595831, "grad_norm": 0.33534538745880127, "learning_rate": 0.00019660149521648004, "loss": 0.253, "step": 829 }, { "epoch": 0.16798218984011334, "grad_norm": 0.3371214270591736, "learning_rate": 0.0001965932686197558, "loss": 0.2287, "step": 830 }, { "epoch": 0.1681845780206436, "grad_norm": 0.3766498863697052, "learning_rate": 0.00019658503225073817, "loss": 0.2868, "step": 831 }, { "epoch": 0.16838696620117385, "grad_norm": 0.3650453984737396, "learning_rate": 0.0001965767861102605, "loss": 0.2496, "step": 832 }, { "epoch": 0.1685893543817041, "grad_norm": 0.3437459170818329, "learning_rate": 0.0001965685301991569, "loss": 0.2374, "step": 833 }, { "epoch": 0.16879174256223436, "grad_norm": 0.5330494046211243, "learning_rate": 0.00019656026451826274, "loss": 0.3107, "step": 834 }, { "epoch": 0.16899413074276462, "grad_norm": 0.28489741683006287, "learning_rate": 0.0001965519890684142, "loss": 0.24, "step": 835 }, { "epoch": 0.16919651892329488, "grad_norm": 0.6033898591995239, "learning_rate": 0.00019654370385044852, "loss": 0.2826, "step": 836 }, { "epoch": 0.16939890710382513, "grad_norm": 0.42504727840423584, "learning_rate": 0.00019653540886520386, "loss": 0.2215, "step": 837 }, { "epoch": 0.16960129528435539, "grad_norm": 0.3004084825515747, "learning_rate": 0.00019652710411351953, "loss": 0.282, "step": 838 }, { "epoch": 0.16980368346488564, "grad_norm": 0.3893960416316986, "learning_rate": 0.0001965187895962356, "loss": 0.3009, "step": 839 }, { "epoch": 
0.1700060716454159, "grad_norm": 0.40491530299186707, "learning_rate": 0.00019651046531419332, "loss": 0.2341, "step": 840 }, { "epoch": 0.17020845982594615, "grad_norm": 0.37225958704948425, "learning_rate": 0.00019650213126823487, "loss": 0.2727, "step": 841 }, { "epoch": 0.17041084800647643, "grad_norm": 0.34467262029647827, "learning_rate": 0.00019649378745920332, "loss": 0.2944, "step": 842 }, { "epoch": 0.1706132361870067, "grad_norm": 0.3478735387325287, "learning_rate": 0.00019648543388794284, "loss": 0.2406, "step": 843 }, { "epoch": 0.17081562436753694, "grad_norm": 0.24952255189418793, "learning_rate": 0.0001964770705552986, "loss": 0.2476, "step": 844 }, { "epoch": 0.1710180125480672, "grad_norm": 0.34091097116470337, "learning_rate": 0.0001964686974621167, "loss": 0.2735, "step": 845 }, { "epoch": 0.17122040072859745, "grad_norm": 0.4286462068557739, "learning_rate": 0.0001964603146092442, "loss": 0.2825, "step": 846 }, { "epoch": 0.1714227889091277, "grad_norm": 0.32420212030410767, "learning_rate": 0.0001964519219975292, "loss": 0.2661, "step": 847 }, { "epoch": 0.17162517708965797, "grad_norm": 0.3544958233833313, "learning_rate": 0.0001964435196278208, "loss": 0.2623, "step": 848 }, { "epoch": 0.17182756527018822, "grad_norm": 0.3037719428539276, "learning_rate": 0.00019643510750096908, "loss": 0.2334, "step": 849 }, { "epoch": 0.17202995345071848, "grad_norm": 0.3185819685459137, "learning_rate": 0.00019642668561782505, "loss": 0.2727, "step": 850 }, { "epoch": 0.17202995345071848, "eval_loss": 0.28905409574508667, "eval_runtime": 0.7363, "eval_samples_per_second": 6.79, "eval_steps_per_second": 1.358, "step": 850 }, { "epoch": 0.17223234163124873, "grad_norm": 0.3105728328227997, "learning_rate": 0.00019641825397924076, "loss": 0.244, "step": 851 }, { "epoch": 0.172434729811779, "grad_norm": 0.31616318225860596, "learning_rate": 0.00019640981258606924, "loss": 0.2598, "step": 852 }, { "epoch": 0.17263711799230924, "grad_norm": 
0.27749040722846985, "learning_rate": 0.00019640136143916448, "loss": 0.2225, "step": 853 }, { "epoch": 0.1728395061728395, "grad_norm": 0.37248048186302185, "learning_rate": 0.00019639290053938152, "loss": 0.252, "step": 854 }, { "epoch": 0.17304189435336975, "grad_norm": 0.3649056553840637, "learning_rate": 0.00019638442988757632, "loss": 0.2886, "step": 855 }, { "epoch": 0.1732442825339, "grad_norm": 0.30918097496032715, "learning_rate": 0.00019637594948460584, "loss": 0.2494, "step": 856 }, { "epoch": 0.1734466707144303, "grad_norm": 0.30996596813201904, "learning_rate": 0.00019636745933132807, "loss": 0.2318, "step": 857 }, { "epoch": 0.17364905889496055, "grad_norm": 0.4232423007488251, "learning_rate": 0.00019635895942860193, "loss": 0.2736, "step": 858 }, { "epoch": 0.1738514470754908, "grad_norm": 0.43543338775634766, "learning_rate": 0.00019635044977728736, "loss": 0.266, "step": 859 }, { "epoch": 0.17405383525602106, "grad_norm": 0.35472288727760315, "learning_rate": 0.00019634193037824528, "loss": 0.2276, "step": 860 }, { "epoch": 0.1742562234365513, "grad_norm": 0.33700042963027954, "learning_rate": 0.0001963334012323376, "loss": 0.2907, "step": 861 }, { "epoch": 0.17445861161708157, "grad_norm": 0.2621822953224182, "learning_rate": 0.00019632486234042715, "loss": 0.2244, "step": 862 }, { "epoch": 0.17466099979761182, "grad_norm": 0.27254778146743774, "learning_rate": 0.00019631631370337787, "loss": 0.2414, "step": 863 }, { "epoch": 0.17486338797814208, "grad_norm": 0.3431929349899292, "learning_rate": 0.00019630775532205466, "loss": 0.2544, "step": 864 }, { "epoch": 0.17506577615867233, "grad_norm": 0.3418872654438019, "learning_rate": 0.00019629918719732325, "loss": 0.2592, "step": 865 }, { "epoch": 0.1752681643392026, "grad_norm": 0.3220979571342468, "learning_rate": 0.00019629060933005056, "loss": 0.2882, "step": 866 }, { "epoch": 0.17547055251973284, "grad_norm": 0.4210681617259979, "learning_rate": 0.0001962820217211044, "loss": 0.279, "step": 
867 }, { "epoch": 0.1756729407002631, "grad_norm": 0.2708446681499481, "learning_rate": 0.00019627342437135355, "loss": 0.244, "step": 868 }, { "epoch": 0.17587532888079335, "grad_norm": 0.35413023829460144, "learning_rate": 0.00019626481728166777, "loss": 0.2727, "step": 869 }, { "epoch": 0.1760777170613236, "grad_norm": 0.3829691708087921, "learning_rate": 0.0001962562004529179, "loss": 0.2697, "step": 870 }, { "epoch": 0.17628010524185386, "grad_norm": 0.39556726813316345, "learning_rate": 0.00019624757388597567, "loss": 0.2492, "step": 871 }, { "epoch": 0.17648249342238415, "grad_norm": 0.46628960967063904, "learning_rate": 0.00019623893758171385, "loss": 0.288, "step": 872 }, { "epoch": 0.1766848816029144, "grad_norm": 0.3176499009132385, "learning_rate": 0.0001962302915410061, "loss": 0.24, "step": 873 }, { "epoch": 0.17688726978344466, "grad_norm": 0.4139035642147064, "learning_rate": 0.00019622163576472724, "loss": 0.2517, "step": 874 }, { "epoch": 0.1770896579639749, "grad_norm": 0.29733502864837646, "learning_rate": 0.00019621297025375288, "loss": 0.2606, "step": 875 }, { "epoch": 0.17729204614450517, "grad_norm": 0.5197150111198425, "learning_rate": 0.00019620429500895976, "loss": 0.2508, "step": 876 }, { "epoch": 0.17749443432503542, "grad_norm": 0.28445857763290405, "learning_rate": 0.00019619561003122554, "loss": 0.2149, "step": 877 }, { "epoch": 0.17769682250556568, "grad_norm": 0.31572219729423523, "learning_rate": 0.00019618691532142884, "loss": 0.2569, "step": 878 }, { "epoch": 0.17789921068609593, "grad_norm": 0.26687559485435486, "learning_rate": 0.00019617821088044934, "loss": 0.2379, "step": 879 }, { "epoch": 0.1781015988666262, "grad_norm": 0.31003233790397644, "learning_rate": 0.0001961694967091676, "loss": 0.2384, "step": 880 }, { "epoch": 0.17830398704715644, "grad_norm": 0.30486080050468445, "learning_rate": 0.00019616077280846535, "loss": 0.2257, "step": 881 }, { "epoch": 0.1785063752276867, "grad_norm": 0.32738712430000305, 
"learning_rate": 0.00019615203917922508, "loss": 0.2493, "step": 882 }, { "epoch": 0.17870876340821695, "grad_norm": 0.3937731087207794, "learning_rate": 0.0001961432958223304, "loss": 0.2577, "step": 883 }, { "epoch": 0.1789111515887472, "grad_norm": 0.29360431432724, "learning_rate": 0.00019613454273866588, "loss": 0.2184, "step": 884 }, { "epoch": 0.17911353976927746, "grad_norm": 0.4143258333206177, "learning_rate": 0.00019612577992911704, "loss": 0.2304, "step": 885 }, { "epoch": 0.17931592794980772, "grad_norm": 0.34402403235435486, "learning_rate": 0.00019611700739457044, "loss": 0.2317, "step": 886 }, { "epoch": 0.179518316130338, "grad_norm": 0.3758588433265686, "learning_rate": 0.00019610822513591356, "loss": 0.2564, "step": 887 }, { "epoch": 0.17972070431086826, "grad_norm": 0.343376100063324, "learning_rate": 0.0001960994331540349, "loss": 0.258, "step": 888 }, { "epoch": 0.1799230924913985, "grad_norm": 0.3708763122558594, "learning_rate": 0.000196090631449824, "loss": 0.2497, "step": 889 }, { "epoch": 0.18012548067192877, "grad_norm": 0.3553926646709442, "learning_rate": 0.00019608182002417123, "loss": 0.2845, "step": 890 }, { "epoch": 0.18032786885245902, "grad_norm": 0.295229434967041, "learning_rate": 0.0001960729988779681, "loss": 0.2787, "step": 891 }, { "epoch": 0.18053025703298928, "grad_norm": 0.28084808588027954, "learning_rate": 0.00019606416801210702, "loss": 0.2342, "step": 892 }, { "epoch": 0.18073264521351953, "grad_norm": 0.3650226891040802, "learning_rate": 0.00019605532742748141, "loss": 0.298, "step": 893 }, { "epoch": 0.1809350333940498, "grad_norm": 0.34866154193878174, "learning_rate": 0.0001960464771249857, "loss": 0.237, "step": 894 }, { "epoch": 0.18113742157458004, "grad_norm": 0.5545859336853027, "learning_rate": 0.00019603761710551521, "loss": 0.2476, "step": 895 }, { "epoch": 0.1813398097551103, "grad_norm": 0.36406493186950684, "learning_rate": 0.00019602874736996632, "loss": 0.2419, "step": 896 }, { "epoch": 
0.18154219793564055, "grad_norm": 0.38556376099586487, "learning_rate": 0.00019601986791923642, "loss": 0.2541, "step": 897 }, { "epoch": 0.1817445861161708, "grad_norm": 0.3872116208076477, "learning_rate": 0.0001960109787542238, "loss": 0.2412, "step": 898 }, { "epoch": 0.18194697429670106, "grad_norm": 0.3459681570529938, "learning_rate": 0.00019600207987582777, "loss": 0.2707, "step": 899 }, { "epoch": 0.18214936247723132, "grad_norm": 0.35269078612327576, "learning_rate": 0.00019599317128494862, "loss": 0.2372, "step": 900 }, { "epoch": 0.18214936247723132, "eval_loss": 0.2881661057472229, "eval_runtime": 0.7383, "eval_samples_per_second": 6.772, "eval_steps_per_second": 1.354, "step": 900 }, { "epoch": 0.18235175065776157, "grad_norm": 0.3326655328273773, "learning_rate": 0.0001959842529824877, "loss": 0.2698, "step": 901 }, { "epoch": 0.18255413883829186, "grad_norm": 0.42814531922340393, "learning_rate": 0.00019597532496934717, "loss": 0.2723, "step": 902 }, { "epoch": 0.1827565270188221, "grad_norm": 0.4340554177761078, "learning_rate": 0.00019596638724643032, "loss": 0.275, "step": 903 }, { "epoch": 0.18295891519935237, "grad_norm": 0.2819032371044159, "learning_rate": 0.0001959574398146414, "loss": 0.2369, "step": 904 }, { "epoch": 0.18316130337988262, "grad_norm": 0.31827113032341003, "learning_rate": 0.00019594848267488556, "loss": 0.2772, "step": 905 }, { "epoch": 0.18336369156041288, "grad_norm": 0.31110498309135437, "learning_rate": 0.00019593951582806902, "loss": 0.226, "step": 906 }, { "epoch": 0.18356607974094313, "grad_norm": 0.34133848547935486, "learning_rate": 0.00019593053927509897, "loss": 0.1788, "step": 907 }, { "epoch": 0.1837684679214734, "grad_norm": 0.30660688877105713, "learning_rate": 0.00019592155301688356, "loss": 0.2477, "step": 908 }, { "epoch": 0.18397085610200364, "grad_norm": 0.34563958644866943, "learning_rate": 0.00019591255705433188, "loss": 0.2614, "step": 909 }, { "epoch": 0.1841732442825339, "grad_norm": 
0.37129271030426025, "learning_rate": 0.00019590355138835407, "loss": 0.2313, "step": 910 }, { "epoch": 0.18437563246306415, "grad_norm": 0.3038152754306793, "learning_rate": 0.00019589453601986123, "loss": 0.2475, "step": 911 }, { "epoch": 0.1845780206435944, "grad_norm": 0.31466180086135864, "learning_rate": 0.0001958855109497655, "loss": 0.2383, "step": 912 }, { "epoch": 0.18478040882412466, "grad_norm": 0.2817492187023163, "learning_rate": 0.0001958764761789798, "loss": 0.2495, "step": 913 }, { "epoch": 0.18498279700465492, "grad_norm": 0.5035717487335205, "learning_rate": 0.00019586743170841832, "loss": 0.2825, "step": 914 }, { "epoch": 0.18518518518518517, "grad_norm": 0.37324804067611694, "learning_rate": 0.00019585837753899603, "loss": 0.268, "step": 915 }, { "epoch": 0.18538757336571543, "grad_norm": 0.29959744215011597, "learning_rate": 0.0001958493136716289, "loss": 0.2341, "step": 916 }, { "epoch": 0.1855899615462457, "grad_norm": 0.30992889404296875, "learning_rate": 0.00019584024010723398, "loss": 0.2398, "step": 917 }, { "epoch": 0.18579234972677597, "grad_norm": 0.41944313049316406, "learning_rate": 0.00019583115684672917, "loss": 0.3288, "step": 918 }, { "epoch": 0.18599473790730622, "grad_norm": 0.417007714509964, "learning_rate": 0.00019582206389103348, "loss": 0.2268, "step": 919 }, { "epoch": 0.18619712608783648, "grad_norm": 0.42923831939697266, "learning_rate": 0.0001958129612410668, "loss": 0.2854, "step": 920 }, { "epoch": 0.18639951426836673, "grad_norm": 0.2931532859802246, "learning_rate": 0.00019580384889775006, "loss": 0.23, "step": 921 }, { "epoch": 0.186601902448897, "grad_norm": 0.40126919746398926, "learning_rate": 0.00019579472686200518, "loss": 0.2682, "step": 922 }, { "epoch": 0.18680429062942724, "grad_norm": 0.3147217333316803, "learning_rate": 0.00019578559513475495, "loss": 0.2346, "step": 923 }, { "epoch": 0.1870066788099575, "grad_norm": 0.32852163910865784, "learning_rate": 0.00019577645371692332, "loss": 0.2728, "step": 
924 }, { "epoch": 0.18720906699048775, "grad_norm": 0.41189318895339966, "learning_rate": 0.00019576730260943503, "loss": 0.3268, "step": 925 }, { "epoch": 0.187411455171018, "grad_norm": 0.3896682858467102, "learning_rate": 0.00019575814181321593, "loss": 0.2162, "step": 926 }, { "epoch": 0.18761384335154827, "grad_norm": 0.33515724539756775, "learning_rate": 0.00019574897132919284, "loss": 0.2487, "step": 927 }, { "epoch": 0.18781623153207852, "grad_norm": 0.310791015625, "learning_rate": 0.00019573979115829353, "loss": 0.2704, "step": 928 }, { "epoch": 0.18801861971260878, "grad_norm": 0.3307356536388397, "learning_rate": 0.00019573060130144673, "loss": 0.2184, "step": 929 }, { "epoch": 0.18822100789313903, "grad_norm": 0.35177716612815857, "learning_rate": 0.0001957214017595822, "loss": 0.2221, "step": 930 }, { "epoch": 0.18842339607366929, "grad_norm": 0.38704296946525574, "learning_rate": 0.00019571219253363057, "loss": 0.239, "step": 931 }, { "epoch": 0.18862578425419954, "grad_norm": 0.40003615617752075, "learning_rate": 0.00019570297362452363, "loss": 0.2707, "step": 932 }, { "epoch": 0.18882817243472982, "grad_norm": 0.3730137050151825, "learning_rate": 0.00019569374503319403, "loss": 0.2705, "step": 933 }, { "epoch": 0.18903056061526008, "grad_norm": 0.3282415270805359, "learning_rate": 0.0001956845067605754, "loss": 0.2668, "step": 934 }, { "epoch": 0.18923294879579033, "grad_norm": 0.3039020299911499, "learning_rate": 0.00019567525880760238, "loss": 0.2294, "step": 935 }, { "epoch": 0.1894353369763206, "grad_norm": 0.40377500653266907, "learning_rate": 0.00019566600117521058, "loss": 0.2432, "step": 936 }, { "epoch": 0.18963772515685084, "grad_norm": 0.2721993029117584, "learning_rate": 0.0001956567338643366, "loss": 0.2113, "step": 937 }, { "epoch": 0.1898401133373811, "grad_norm": 0.2860236167907715, "learning_rate": 0.00019564745687591803, "loss": 0.2504, "step": 938 }, { "epoch": 0.19004250151791136, "grad_norm": 0.29996049404144287, 
"learning_rate": 0.00019563817021089338, "loss": 0.26, "step": 939 }, { "epoch": 0.1902448896984416, "grad_norm": 0.2724343538284302, "learning_rate": 0.00019562887387020216, "loss": 0.2515, "step": 940 }, { "epoch": 0.19044727787897187, "grad_norm": 0.3486071228981018, "learning_rate": 0.0001956195678547849, "loss": 0.3149, "step": 941 }, { "epoch": 0.19064966605950212, "grad_norm": 0.33210688829421997, "learning_rate": 0.0001956102521655831, "loss": 0.2132, "step": 942 }, { "epoch": 0.19085205424003238, "grad_norm": 0.32016050815582275, "learning_rate": 0.0001956009268035392, "loss": 0.2585, "step": 943 }, { "epoch": 0.19105444242056263, "grad_norm": 0.3609424829483032, "learning_rate": 0.00019559159176959668, "loss": 0.2609, "step": 944 }, { "epoch": 0.1912568306010929, "grad_norm": 0.3491605818271637, "learning_rate": 0.00019558224706469994, "loss": 0.236, "step": 945 }, { "epoch": 0.19145921878162314, "grad_norm": 0.3373927175998688, "learning_rate": 0.00019557289268979435, "loss": 0.2615, "step": 946 }, { "epoch": 0.1916616069621534, "grad_norm": 0.40465179085731506, "learning_rate": 0.00019556352864582633, "loss": 0.2617, "step": 947 }, { "epoch": 0.19186399514268368, "grad_norm": 0.3118032217025757, "learning_rate": 0.00019555415493374324, "loss": 0.2469, "step": 948 }, { "epoch": 0.19206638332321394, "grad_norm": 0.3560132086277008, "learning_rate": 0.00019554477155449336, "loss": 0.2562, "step": 949 }, { "epoch": 0.1922687715037442, "grad_norm": 0.35652318596839905, "learning_rate": 0.000195535378509026, "loss": 0.2473, "step": 950 }, { "epoch": 0.1922687715037442, "eval_loss": 0.29233258962631226, "eval_runtime": 0.7383, "eval_samples_per_second": 6.772, "eval_steps_per_second": 1.354, "step": 950 }, { "epoch": 0.19247115968427445, "grad_norm": 0.38569778203964233, "learning_rate": 0.00019552597579829156, "loss": 0.2271, "step": 951 }, { "epoch": 0.1926735478648047, "grad_norm": 0.3487134277820587, "learning_rate": 0.00019551656342324118, "loss": 0.2568, 
"step": 952 }, { "epoch": 0.19287593604533496, "grad_norm": 0.29401594400405884, "learning_rate": 0.00019550714138482717, "loss": 0.2609, "step": 953 }, { "epoch": 0.1930783242258652, "grad_norm": 0.342074453830719, "learning_rate": 0.00019549770968400277, "loss": 0.2566, "step": 954 }, { "epoch": 0.19328071240639547, "grad_norm": 0.33882638812065125, "learning_rate": 0.0001954882683217221, "loss": 0.2631, "step": 955 }, { "epoch": 0.19348310058692572, "grad_norm": 0.37705790996551514, "learning_rate": 0.0001954788172989404, "loss": 0.2366, "step": 956 }, { "epoch": 0.19368548876745598, "grad_norm": 0.3006690442562103, "learning_rate": 0.00019546935661661382, "loss": 0.2495, "step": 957 }, { "epoch": 0.19388787694798623, "grad_norm": 0.28892847895622253, "learning_rate": 0.00019545988627569952, "loss": 0.2503, "step": 958 }, { "epoch": 0.1940902651285165, "grad_norm": 0.34635743498802185, "learning_rate": 0.0001954504062771555, "loss": 0.2554, "step": 959 }, { "epoch": 0.19429265330904674, "grad_norm": 0.3986789286136627, "learning_rate": 0.000195440916621941, "loss": 0.2566, "step": 960 }, { "epoch": 0.194495041489577, "grad_norm": 0.4037439227104187, "learning_rate": 0.00019543141731101596, "loss": 0.2677, "step": 961 }, { "epoch": 0.19469742967010725, "grad_norm": 0.5037823915481567, "learning_rate": 0.0001954219083453415, "loss": 0.2491, "step": 962 }, { "epoch": 0.19489981785063754, "grad_norm": 0.3077455759048462, "learning_rate": 0.00019541238972587958, "loss": 0.2614, "step": 963 }, { "epoch": 0.1951022060311678, "grad_norm": 0.3154994249343872, "learning_rate": 0.0001954028614535932, "loss": 0.2671, "step": 964 }, { "epoch": 0.19530459421169805, "grad_norm": 0.3500082790851593, "learning_rate": 0.0001953933235294464, "loss": 0.2431, "step": 965 }, { "epoch": 0.1955069823922283, "grad_norm": 0.5176851153373718, "learning_rate": 0.00019538377595440404, "loss": 0.2662, "step": 966 }, { "epoch": 0.19570937057275856, "grad_norm": 0.4201027452945709, 
"learning_rate": 0.0001953742187294321, "loss": 0.2842, "step": 967 }, { "epoch": 0.1959117587532888, "grad_norm": 0.3936191201210022, "learning_rate": 0.00019536465185549746, "loss": 0.2361, "step": 968 }, { "epoch": 0.19611414693381907, "grad_norm": 0.2671091556549072, "learning_rate": 0.00019535507533356797, "loss": 0.2429, "step": 969 }, { "epoch": 0.19631653511434932, "grad_norm": 0.3662154972553253, "learning_rate": 0.00019534548916461252, "loss": 0.3213, "step": 970 }, { "epoch": 0.19651892329487958, "grad_norm": 0.2781408131122589, "learning_rate": 0.00019533589334960093, "loss": 0.208, "step": 971 }, { "epoch": 0.19672131147540983, "grad_norm": 0.3062308728694916, "learning_rate": 0.000195326287889504, "loss": 0.2704, "step": 972 }, { "epoch": 0.1969236996559401, "grad_norm": 0.3737871050834656, "learning_rate": 0.0001953166727852935, "loss": 0.2686, "step": 973 }, { "epoch": 0.19712608783647034, "grad_norm": 0.2863426208496094, "learning_rate": 0.0001953070480379422, "loss": 0.2275, "step": 974 }, { "epoch": 0.1973284760170006, "grad_norm": 0.3075900673866272, "learning_rate": 0.0001952974136484238, "loss": 0.2473, "step": 975 }, { "epoch": 0.19753086419753085, "grad_norm": 0.30082938075065613, "learning_rate": 0.00019528776961771308, "loss": 0.2504, "step": 976 }, { "epoch": 0.1977332523780611, "grad_norm": 0.31082528829574585, "learning_rate": 0.00019527811594678563, "loss": 0.2632, "step": 977 }, { "epoch": 0.1979356405585914, "grad_norm": 0.2519990801811218, "learning_rate": 0.00019526845263661817, "loss": 0.21, "step": 978 }, { "epoch": 0.19813802873912165, "grad_norm": 0.304943710565567, "learning_rate": 0.0001952587796881883, "loss": 0.2578, "step": 979 }, { "epoch": 0.1983404169196519, "grad_norm": 0.4617615044116974, "learning_rate": 0.00019524909710247465, "loss": 0.2765, "step": 980 }, { "epoch": 0.19854280510018216, "grad_norm": 0.3307913541793823, "learning_rate": 0.0001952394048804568, "loss": 0.2358, "step": 981 }, { "epoch": 
0.1987451932807124, "grad_norm": 0.30370616912841797, "learning_rate": 0.0001952297030231153, "loss": 0.2148, "step": 982 }, { "epoch": 0.19894758146124267, "grad_norm": 0.5463431477546692, "learning_rate": 0.00019521999153143167, "loss": 0.2654, "step": 983 }, { "epoch": 0.19914996964177292, "grad_norm": 0.3252580463886261, "learning_rate": 0.00019521027040638844, "loss": 0.2239, "step": 984 }, { "epoch": 0.19935235782230318, "grad_norm": 0.5134966969490051, "learning_rate": 0.00019520053964896907, "loss": 0.2509, "step": 985 }, { "epoch": 0.19955474600283343, "grad_norm": 0.26822400093078613, "learning_rate": 0.00019519079926015804, "loss": 0.2219, "step": 986 }, { "epoch": 0.1997571341833637, "grad_norm": 0.29705125093460083, "learning_rate": 0.00019518104924094075, "loss": 0.223, "step": 987 }, { "epoch": 0.19995952236389394, "grad_norm": 0.2979499399662018, "learning_rate": 0.00019517128959230365, "loss": 0.282, "step": 988 }, { "epoch": 0.2001619105444242, "grad_norm": 0.3182627856731415, "learning_rate": 0.00019516152031523405, "loss": 0.2596, "step": 989 }, { "epoch": 0.20036429872495445, "grad_norm": 0.28033894300460815, "learning_rate": 0.00019515174141072038, "loss": 0.2134, "step": 990 }, { "epoch": 0.2005666869054847, "grad_norm": 0.2765475809574127, "learning_rate": 0.00019514195287975188, "loss": 0.246, "step": 991 }, { "epoch": 0.20076907508601496, "grad_norm": 0.34840041399002075, "learning_rate": 0.00019513215472331894, "loss": 0.2412, "step": 992 }, { "epoch": 0.20097146326654525, "grad_norm": 0.2901442050933838, "learning_rate": 0.00019512234694241278, "loss": 0.2201, "step": 993 }, { "epoch": 0.2011738514470755, "grad_norm": 0.3278302252292633, "learning_rate": 0.00019511252953802568, "loss": 0.258, "step": 994 }, { "epoch": 0.20137623962760576, "grad_norm": 0.38416075706481934, "learning_rate": 0.00019510270251115084, "loss": 0.2476, "step": 995 }, { "epoch": 0.201578627808136, "grad_norm": 0.31708383560180664, "learning_rate": 
0.00019509286586278247, "loss": 0.2711, "step": 996 }, { "epoch": 0.20178101598866627, "grad_norm": 0.33503633737564087, "learning_rate": 0.0001950830195939157, "loss": 0.2568, "step": 997 }, { "epoch": 0.20198340416919652, "grad_norm": 0.3001486659049988, "learning_rate": 0.00019507316370554674, "loss": 0.2575, "step": 998 }, { "epoch": 0.20218579234972678, "grad_norm": 0.3202657997608185, "learning_rate": 0.00019506329819867264, "loss": 0.2213, "step": 999 }, { "epoch": 0.20238818053025703, "grad_norm": 0.3761771023273468, "learning_rate": 0.00019505342307429152, "loss": 0.2377, "step": 1000 }, { "epoch": 0.20238818053025703, "eval_loss": 0.29387423396110535, "eval_runtime": 0.7386, "eval_samples_per_second": 6.77, "eval_steps_per_second": 1.354, "step": 1000 }, { "epoch": 0.2025905687107873, "grad_norm": 0.25448256731033325, "learning_rate": 0.00019504353833340243, "loss": 0.2335, "step": 1001 }, { "epoch": 0.20279295689131754, "grad_norm": 0.26972317695617676, "learning_rate": 0.00019503364397700543, "loss": 0.2325, "step": 1002 }, { "epoch": 0.2029953450718478, "grad_norm": 0.3720798194408417, "learning_rate": 0.00019502374000610151, "loss": 0.2357, "step": 1003 }, { "epoch": 0.20319773325237805, "grad_norm": 0.2979353070259094, "learning_rate": 0.00019501382642169265, "loss": 0.268, "step": 1004 }, { "epoch": 0.2034001214329083, "grad_norm": 0.33817097544670105, "learning_rate": 0.00019500390322478177, "loss": 0.3045, "step": 1005 }, { "epoch": 0.20360250961343856, "grad_norm": 0.29102396965026855, "learning_rate": 0.00019499397041637285, "loss": 0.2421, "step": 1006 }, { "epoch": 0.20380489779396882, "grad_norm": 0.35644692182540894, "learning_rate": 0.00019498402799747079, "loss": 0.2856, "step": 1007 }, { "epoch": 0.2040072859744991, "grad_norm": 0.2464309185743332, "learning_rate": 0.0001949740759690814, "loss": 0.2375, "step": 1008 }, { "epoch": 0.20420967415502936, "grad_norm": 0.3096578121185303, "learning_rate": 0.00019496411433221156, "loss": 0.2123, 
"step": 1009 }, { "epoch": 0.2044120623355596, "grad_norm": 0.43892258405685425, "learning_rate": 0.00019495414308786909, "loss": 0.2298, "step": 1010 }, { "epoch": 0.20461445051608987, "grad_norm": 0.37740859389305115, "learning_rate": 0.00019494416223706274, "loss": 0.2645, "step": 1011 }, { "epoch": 0.20481683869662012, "grad_norm": 0.34194040298461914, "learning_rate": 0.0001949341717808023, "loss": 0.2501, "step": 1012 }, { "epoch": 0.20501922687715038, "grad_norm": 0.3642720580101013, "learning_rate": 0.0001949241717200985, "loss": 0.2856, "step": 1013 }, { "epoch": 0.20522161505768063, "grad_norm": 0.34601861238479614, "learning_rate": 0.00019491416205596305, "loss": 0.2536, "step": 1014 }, { "epoch": 0.2054240032382109, "grad_norm": 0.2937442362308502, "learning_rate": 0.00019490414278940858, "loss": 0.2354, "step": 1015 }, { "epoch": 0.20562639141874114, "grad_norm": 0.2591923475265503, "learning_rate": 0.00019489411392144875, "loss": 0.2362, "step": 1016 }, { "epoch": 0.2058287795992714, "grad_norm": 0.32309219241142273, "learning_rate": 0.00019488407545309824, "loss": 0.288, "step": 1017 }, { "epoch": 0.20603116777980165, "grad_norm": 0.4047209918498993, "learning_rate": 0.00019487402738537255, "loss": 0.2606, "step": 1018 }, { "epoch": 0.2062335559603319, "grad_norm": 0.35363447666168213, "learning_rate": 0.00019486396971928827, "loss": 0.2832, "step": 1019 }, { "epoch": 0.20643594414086217, "grad_norm": 0.6062155961990356, "learning_rate": 0.00019485390245586293, "loss": 0.2238, "step": 1020 }, { "epoch": 0.20663833232139242, "grad_norm": 0.3819604516029358, "learning_rate": 0.00019484382559611504, "loss": 0.2462, "step": 1021 }, { "epoch": 0.20684072050192268, "grad_norm": 0.3663135766983032, "learning_rate": 0.00019483373914106405, "loss": 0.2344, "step": 1022 }, { "epoch": 0.20704310868245293, "grad_norm": 0.9232494831085205, "learning_rate": 0.0001948236430917304, "loss": 0.2962, "step": 1023 }, { "epoch": 0.20724549686298321, "grad_norm": 
0.37580475211143494, "learning_rate": 0.00019481353744913552, "loss": 0.2449, "step": 1024 }, { "epoch": 0.20744788504351347, "grad_norm": 0.2748352587223053, "learning_rate": 0.0001948034222143018, "loss": 0.2154, "step": 1025 }, { "epoch": 0.20765027322404372, "grad_norm": 0.2947147786617279, "learning_rate": 0.0001947932973882526, "loss": 0.2678, "step": 1026 }, { "epoch": 0.20785266140457398, "grad_norm": 0.579681932926178, "learning_rate": 0.00019478316297201218, "loss": 0.2584, "step": 1027 }, { "epoch": 0.20805504958510423, "grad_norm": 0.2945061922073364, "learning_rate": 0.0001947730189666059, "loss": 0.2124, "step": 1028 }, { "epoch": 0.2082574377656345, "grad_norm": 0.3777732849121094, "learning_rate": 0.00019476286537306004, "loss": 0.2362, "step": 1029 }, { "epoch": 0.20845982594616475, "grad_norm": 0.3627498745918274, "learning_rate": 0.00019475270219240174, "loss": 0.2156, "step": 1030 }, { "epoch": 0.208662214126695, "grad_norm": 0.38102152943611145, "learning_rate": 0.0001947425294256593, "loss": 0.2703, "step": 1031 }, { "epoch": 0.20886460230722526, "grad_norm": 0.380249559879303, "learning_rate": 0.00019473234707386184, "loss": 0.2587, "step": 1032 }, { "epoch": 0.2090669904877555, "grad_norm": 0.38338467478752136, "learning_rate": 0.0001947221551380395, "loss": 0.2455, "step": 1033 }, { "epoch": 0.20926937866828577, "grad_norm": 0.5233322381973267, "learning_rate": 0.00019471195361922346, "loss": 0.2585, "step": 1034 }, { "epoch": 0.20947176684881602, "grad_norm": 0.3479728698730469, "learning_rate": 0.0001947017425184457, "loss": 0.237, "step": 1035 }, { "epoch": 0.20967415502934628, "grad_norm": 0.3039289116859436, "learning_rate": 0.00019469152183673936, "loss": 0.2524, "step": 1036 }, { "epoch": 0.20987654320987653, "grad_norm": 0.5088594555854797, "learning_rate": 0.00019468129157513842, "loss": 0.2509, "step": 1037 }, { "epoch": 0.2100789313904068, "grad_norm": 0.3189416825771332, "learning_rate": 0.00019467105173467787, "loss": 0.2619, 
"step": 1038 }, { "epoch": 0.21028131957093707, "grad_norm": 0.5125216245651245, "learning_rate": 0.00019466080231639367, "loss": 0.2573, "step": 1039 }, { "epoch": 0.21048370775146732, "grad_norm": 0.3192436099052429, "learning_rate": 0.0001946505433213228, "loss": 0.2115, "step": 1040 }, { "epoch": 0.21068609593199758, "grad_norm": 0.43862384557724, "learning_rate": 0.00019464027475050305, "loss": 0.2914, "step": 1041 }, { "epoch": 0.21088848411252784, "grad_norm": 0.4017934799194336, "learning_rate": 0.0001946299966049734, "loss": 0.2352, "step": 1042 }, { "epoch": 0.2110908722930581, "grad_norm": 0.45070984959602356, "learning_rate": 0.0001946197088857736, "loss": 0.2795, "step": 1043 }, { "epoch": 0.21129326047358835, "grad_norm": 0.45495152473449707, "learning_rate": 0.0001946094115939445, "loss": 0.2179, "step": 1044 }, { "epoch": 0.2114956486541186, "grad_norm": 0.3081578314304352, "learning_rate": 0.00019459910473052788, "loss": 0.2281, "step": 1045 }, { "epoch": 0.21169803683464886, "grad_norm": 0.36594170331954956, "learning_rate": 0.00019458878829656644, "loss": 0.2252, "step": 1046 }, { "epoch": 0.2119004250151791, "grad_norm": 0.3685033321380615, "learning_rate": 0.0001945784622931039, "loss": 0.2563, "step": 1047 }, { "epoch": 0.21210281319570937, "grad_norm": 0.37463536858558655, "learning_rate": 0.00019456812672118498, "loss": 0.2342, "step": 1048 }, { "epoch": 0.21230520137623962, "grad_norm": 0.4732096493244171, "learning_rate": 0.00019455778158185524, "loss": 0.2471, "step": 1049 }, { "epoch": 0.21250758955676988, "grad_norm": 0.3210441470146179, "learning_rate": 0.0001945474268761614, "loss": 0.2513, "step": 1050 }, { "epoch": 0.21250758955676988, "eval_loss": 0.29186126589775085, "eval_runtime": 0.7396, "eval_samples_per_second": 6.761, "eval_steps_per_second": 1.352, "step": 1050 }, { "epoch": 0.21270997773730013, "grad_norm": 0.3155227303504944, "learning_rate": 0.00019453706260515093, "loss": 0.2633, "step": 1051 }, { "epoch": 
0.2129123659178304, "grad_norm": 0.33783820271492004, "learning_rate": 0.00019452668876987248, "loss": 0.2694, "step": 1052 }, { "epoch": 0.21311475409836064, "grad_norm": 0.2770099639892578, "learning_rate": 0.00019451630537137548, "loss": 0.2462, "step": 1053 }, { "epoch": 0.21331714227889093, "grad_norm": 0.32644253969192505, "learning_rate": 0.00019450591241071047, "loss": 0.2395, "step": 1054 }, { "epoch": 0.21351953045942118, "grad_norm": 0.38256141543388367, "learning_rate": 0.00019449550988892892, "loss": 0.2407, "step": 1055 }, { "epoch": 0.21372191863995144, "grad_norm": 0.30983036756515503, "learning_rate": 0.00019448509780708312, "loss": 0.2464, "step": 1056 }, { "epoch": 0.2139243068204817, "grad_norm": 0.3679855465888977, "learning_rate": 0.00019447467616622662, "loss": 0.2608, "step": 1057 }, { "epoch": 0.21412669500101195, "grad_norm": 0.3574540615081787, "learning_rate": 0.00019446424496741366, "loss": 0.2257, "step": 1058 }, { "epoch": 0.2143290831815422, "grad_norm": 0.38781675696372986, "learning_rate": 0.0001944538042116996, "loss": 0.2339, "step": 1059 }, { "epoch": 0.21453147136207246, "grad_norm": 0.39075520634651184, "learning_rate": 0.00019444335390014073, "loss": 0.269, "step": 1060 }, { "epoch": 0.2147338595426027, "grad_norm": 0.4448465406894684, "learning_rate": 0.0001944328940337943, "loss": 0.2571, "step": 1061 }, { "epoch": 0.21493624772313297, "grad_norm": 0.3397972583770752, "learning_rate": 0.0001944224246137185, "loss": 0.2667, "step": 1062 }, { "epoch": 0.21513863590366322, "grad_norm": 0.3251417875289917, "learning_rate": 0.00019441194564097258, "loss": 0.2203, "step": 1063 }, { "epoch": 0.21534102408419348, "grad_norm": 0.3613958954811096, "learning_rate": 0.00019440145711661664, "loss": 0.2751, "step": 1064 }, { "epoch": 0.21554341226472373, "grad_norm": 0.38893675804138184, "learning_rate": 0.0001943909590417118, "loss": 0.236, "step": 1065 }, { "epoch": 0.215745800445254, "grad_norm": 0.3045949935913086, "learning_rate": 
0.00019438045141732016, "loss": 0.2284, "step": 1066 }, { "epoch": 0.21594818862578424, "grad_norm": 0.5375556349754333, "learning_rate": 0.00019436993424450476, "loss": 0.2692, "step": 1067 }, { "epoch": 0.2161505768063145, "grad_norm": 0.3492136299610138, "learning_rate": 0.00019435940752432967, "loss": 0.2843, "step": 1068 }, { "epoch": 0.21635296498684478, "grad_norm": 0.23821096122264862, "learning_rate": 0.00019434887125785975, "loss": 0.1821, "step": 1069 }, { "epoch": 0.21655535316737504, "grad_norm": 0.38852131366729736, "learning_rate": 0.00019433832544616108, "loss": 0.2703, "step": 1070 }, { "epoch": 0.2167577413479053, "grad_norm": 0.30407455563545227, "learning_rate": 0.00019432777009030053, "loss": 0.253, "step": 1071 }, { "epoch": 0.21696012952843555, "grad_norm": 0.47758540511131287, "learning_rate": 0.00019431720519134596, "loss": 0.2616, "step": 1072 }, { "epoch": 0.2171625177089658, "grad_norm": 0.3612705171108246, "learning_rate": 0.0001943066307503662, "loss": 0.276, "step": 1073 }, { "epoch": 0.21736490588949606, "grad_norm": 0.3635235130786896, "learning_rate": 0.00019429604676843114, "loss": 0.2374, "step": 1074 }, { "epoch": 0.2175672940700263, "grad_norm": 0.3618505895137787, "learning_rate": 0.00019428545324661148, "loss": 0.2657, "step": 1075 }, { "epoch": 0.21776968225055657, "grad_norm": 0.42967814207077026, "learning_rate": 0.00019427485018597897, "loss": 0.269, "step": 1076 }, { "epoch": 0.21797207043108682, "grad_norm": 0.3846936523914337, "learning_rate": 0.00019426423758760634, "loss": 0.2277, "step": 1077 }, { "epoch": 0.21817445861161708, "grad_norm": 0.3141876459121704, "learning_rate": 0.00019425361545256727, "loss": 0.2587, "step": 1078 }, { "epoch": 0.21837684679214733, "grad_norm": 0.4163576364517212, "learning_rate": 0.00019424298378193638, "loss": 0.24, "step": 1079 }, { "epoch": 0.2185792349726776, "grad_norm": 0.30708739161491394, "learning_rate": 0.00019423234257678925, "loss": 0.2517, "step": 1080 }, { "epoch": 
0.21878162315320784, "grad_norm": 0.32779768109321594, "learning_rate": 0.00019422169183820249, "loss": 0.2395, "step": 1081 }, { "epoch": 0.2189840113337381, "grad_norm": 0.35462939739227295, "learning_rate": 0.00019421103156725363, "loss": 0.2727, "step": 1082 }, { "epoch": 0.21918639951426835, "grad_norm": 0.3319645822048187, "learning_rate": 0.00019420036176502107, "loss": 0.2345, "step": 1083 }, { "epoch": 0.21938878769479864, "grad_norm": 0.39992696046829224, "learning_rate": 0.0001941896824325844, "loss": 0.2932, "step": 1084 }, { "epoch": 0.2195911758753289, "grad_norm": 0.4323594570159912, "learning_rate": 0.00019417899357102397, "loss": 0.2737, "step": 1085 }, { "epoch": 0.21979356405585915, "grad_norm": 0.31720826029777527, "learning_rate": 0.00019416829518142118, "loss": 0.2378, "step": 1086 }, { "epoch": 0.2199959522363894, "grad_norm": 0.35780414938926697, "learning_rate": 0.00019415758726485836, "loss": 0.2661, "step": 1087 }, { "epoch": 0.22019834041691966, "grad_norm": 0.3422088325023651, "learning_rate": 0.00019414686982241884, "loss": 0.2429, "step": 1088 }, { "epoch": 0.2204007285974499, "grad_norm": 0.33238205313682556, "learning_rate": 0.00019413614285518693, "loss": 0.2382, "step": 1089 }, { "epoch": 0.22060311677798017, "grad_norm": 0.45054638385772705, "learning_rate": 0.00019412540636424782, "loss": 0.2712, "step": 1090 }, { "epoch": 0.22080550495851042, "grad_norm": 0.299956738948822, "learning_rate": 0.00019411466035068776, "loss": 0.241, "step": 1091 }, { "epoch": 0.22100789313904068, "grad_norm": 0.31100958585739136, "learning_rate": 0.0001941039048155939, "loss": 0.2306, "step": 1092 }, { "epoch": 0.22121028131957093, "grad_norm": 0.3154681921005249, "learning_rate": 0.00019409313976005436, "loss": 0.247, "step": 1093 }, { "epoch": 0.2214126695001012, "grad_norm": 0.25610601902008057, "learning_rate": 0.00019408236518515825, "loss": 0.2112, "step": 1094 }, { "epoch": 0.22161505768063144, "grad_norm": 0.3014226257801056, 
"learning_rate": 0.00019407158109199565, "loss": 0.2436, "step": 1095 }, { "epoch": 0.2218174458611617, "grad_norm": 0.3093024492263794, "learning_rate": 0.00019406078748165751, "loss": 0.2345, "step": 1096 }, { "epoch": 0.22201983404169195, "grad_norm": 0.3434992730617523, "learning_rate": 0.00019404998435523592, "loss": 0.2294, "step": 1097 }, { "epoch": 0.2222222222222222, "grad_norm": 0.32315975427627563, "learning_rate": 0.00019403917171382373, "loss": 0.2591, "step": 1098 }, { "epoch": 0.2224246104027525, "grad_norm": 0.3836386799812317, "learning_rate": 0.0001940283495585149, "loss": 0.2431, "step": 1099 }, { "epoch": 0.22262699858328275, "grad_norm": 0.27538156509399414, "learning_rate": 0.00019401751789040428, "loss": 0.2458, "step": 1100 }, { "epoch": 0.22262699858328275, "eval_loss": 0.2982866168022156, "eval_runtime": 0.7369, "eval_samples_per_second": 6.785, "eval_steps_per_second": 1.357, "step": 1100 }, { "epoch": 0.222829386763813, "grad_norm": 0.3932587206363678, "learning_rate": 0.0001940066767105877, "loss": 0.2303, "step": 1101 }, { "epoch": 0.22303177494434326, "grad_norm": 0.3903684616088867, "learning_rate": 0.000193995826020162, "loss": 0.2362, "step": 1102 }, { "epoch": 0.2232341631248735, "grad_norm": 0.35070565342903137, "learning_rate": 0.0001939849658202249, "loss": 0.2352, "step": 1103 }, { "epoch": 0.22343655130540377, "grad_norm": 0.29039356112480164, "learning_rate": 0.00019397409611187513, "loss": 0.2248, "step": 1104 }, { "epoch": 0.22363893948593402, "grad_norm": 0.29755696654319763, "learning_rate": 0.00019396321689621238, "loss": 0.2605, "step": 1105 }, { "epoch": 0.22384132766646428, "grad_norm": 0.713554322719574, "learning_rate": 0.0001939523281743373, "loss": 0.2707, "step": 1106 }, { "epoch": 0.22404371584699453, "grad_norm": 0.3808262348175049, "learning_rate": 0.00019394142994735145, "loss": 0.2433, "step": 1107 }, { "epoch": 0.2242461040275248, "grad_norm": 0.31507617235183716, "learning_rate": 0.00019393052221635746, 
"loss": 0.2235, "step": 1108 }, { "epoch": 0.22444849220805504, "grad_norm": 0.31799739599227905, "learning_rate": 0.00019391960498245882, "loss": 0.2398, "step": 1109 }, { "epoch": 0.2246508803885853, "grad_norm": 0.36539918184280396, "learning_rate": 0.00019390867824676004, "loss": 0.2539, "step": 1110 }, { "epoch": 0.22485326856911556, "grad_norm": 0.313516229391098, "learning_rate": 0.00019389774201036657, "loss": 0.205, "step": 1111 }, { "epoch": 0.2250556567496458, "grad_norm": 0.48267585039138794, "learning_rate": 0.00019388679627438483, "loss": 0.2819, "step": 1112 }, { "epoch": 0.22525804493017607, "grad_norm": 0.25217655301094055, "learning_rate": 0.00019387584103992218, "loss": 0.2286, "step": 1113 }, { "epoch": 0.22546043311070635, "grad_norm": 0.3520773649215698, "learning_rate": 0.00019386487630808697, "loss": 0.2477, "step": 1114 }, { "epoch": 0.2256628212912366, "grad_norm": 0.3652741611003876, "learning_rate": 0.00019385390207998847, "loss": 0.2646, "step": 1115 }, { "epoch": 0.22586520947176686, "grad_norm": 0.45731592178344727, "learning_rate": 0.00019384291835673696, "loss": 0.251, "step": 1116 }, { "epoch": 0.22606759765229711, "grad_norm": 0.3881213963031769, "learning_rate": 0.00019383192513944367, "loss": 0.229, "step": 1117 }, { "epoch": 0.22626998583282737, "grad_norm": 0.32269710302352905, "learning_rate": 0.00019382092242922075, "loss": 0.2453, "step": 1118 }, { "epoch": 0.22647237401335762, "grad_norm": 0.3093521296977997, "learning_rate": 0.00019380991022718133, "loss": 0.2428, "step": 1119 }, { "epoch": 0.22667476219388788, "grad_norm": 0.3101629912853241, "learning_rate": 0.00019379888853443954, "loss": 0.2236, "step": 1120 }, { "epoch": 0.22687715037441814, "grad_norm": 0.32502663135528564, "learning_rate": 0.00019378785735211046, "loss": 0.2685, "step": 1121 }, { "epoch": 0.2270795385549484, "grad_norm": 0.36589106917381287, "learning_rate": 0.00019377681668131006, "loss": 0.247, "step": 1122 }, { "epoch": 0.22728192673547865, 
"grad_norm": 0.32985422015190125, "learning_rate": 0.00019376576652315532, "loss": 0.2331, "step": 1123 }, { "epoch": 0.2274843149160089, "grad_norm": 0.3607713580131531, "learning_rate": 0.00019375470687876424, "loss": 0.266, "step": 1124 }, { "epoch": 0.22768670309653916, "grad_norm": 0.2738536298274994, "learning_rate": 0.0001937436377492556, "loss": 0.2187, "step": 1125 }, { "epoch": 0.2278890912770694, "grad_norm": 0.45239001512527466, "learning_rate": 0.0001937325591357494, "loss": 0.2882, "step": 1126 }, { "epoch": 0.22809147945759967, "grad_norm": 0.34548941254615784, "learning_rate": 0.00019372147103936636, "loss": 0.2365, "step": 1127 }, { "epoch": 0.22829386763812992, "grad_norm": 0.503173828125, "learning_rate": 0.00019371037346122832, "loss": 0.2808, "step": 1128 }, { "epoch": 0.22849625581866018, "grad_norm": 0.3561367094516754, "learning_rate": 0.00019369926640245796, "loss": 0.2423, "step": 1129 }, { "epoch": 0.22869864399919046, "grad_norm": 0.2863787114620209, "learning_rate": 0.00019368814986417897, "loss": 0.2339, "step": 1130 }, { "epoch": 0.22890103217972071, "grad_norm": 0.3625780940055847, "learning_rate": 0.00019367702384751607, "loss": 0.2037, "step": 1131 }, { "epoch": 0.22910342036025097, "grad_norm": 0.28424787521362305, "learning_rate": 0.00019366588835359485, "loss": 0.2404, "step": 1132 }, { "epoch": 0.22930580854078123, "grad_norm": 0.30158454179763794, "learning_rate": 0.00019365474338354184, "loss": 0.2466, "step": 1133 }, { "epoch": 0.22950819672131148, "grad_norm": 0.3348842263221741, "learning_rate": 0.0001936435889384846, "loss": 0.2854, "step": 1134 }, { "epoch": 0.22971058490184174, "grad_norm": 0.4381720721721649, "learning_rate": 0.00019363242501955168, "loss": 0.2493, "step": 1135 }, { "epoch": 0.229912973082372, "grad_norm": 0.40765929222106934, "learning_rate": 0.00019362125162787242, "loss": 0.2845, "step": 1136 }, { "epoch": 0.23011536126290225, "grad_norm": 0.329726904630661, "learning_rate": 0.0001936100687645773, 
"loss": 0.2795, "step": 1137 }, { "epoch": 0.2303177494434325, "grad_norm": 0.2915996015071869, "learning_rate": 0.00019359887643079766, "loss": 0.2134, "step": 1138 }, { "epoch": 0.23052013762396276, "grad_norm": 0.3605322241783142, "learning_rate": 0.0001935876746276658, "loss": 0.2622, "step": 1139 }, { "epoch": 0.230722525804493, "grad_norm": 0.4158976674079895, "learning_rate": 0.00019357646335631503, "loss": 0.2564, "step": 1140 }, { "epoch": 0.23092491398502327, "grad_norm": 0.5059180855751038, "learning_rate": 0.0001935652426178796, "loss": 0.2614, "step": 1141 }, { "epoch": 0.23112730216555352, "grad_norm": 0.2895376980304718, "learning_rate": 0.00019355401241349468, "loss": 0.2556, "step": 1142 }, { "epoch": 0.23132969034608378, "grad_norm": 0.7368476390838623, "learning_rate": 0.00019354277274429645, "loss": 0.2987, "step": 1143 }, { "epoch": 0.23153207852661403, "grad_norm": 0.3806043565273285, "learning_rate": 0.000193531523611422, "loss": 0.2894, "step": 1144 }, { "epoch": 0.23173446670714432, "grad_norm": 0.3642440140247345, "learning_rate": 0.0001935202650160094, "loss": 0.2633, "step": 1145 }, { "epoch": 0.23193685488767457, "grad_norm": 0.38270601630210876, "learning_rate": 0.0001935089969591977, "loss": 0.2258, "step": 1146 }, { "epoch": 0.23213924306820483, "grad_norm": 0.3156524896621704, "learning_rate": 0.00019349771944212685, "loss": 0.2552, "step": 1147 }, { "epoch": 0.23234163124873508, "grad_norm": 0.3413656949996948, "learning_rate": 0.0001934864324659378, "loss": 0.2604, "step": 1148 }, { "epoch": 0.23254401942926534, "grad_norm": 0.2893592119216919, "learning_rate": 0.00019347513603177246, "loss": 0.2305, "step": 1149 }, { "epoch": 0.2327464076097956, "grad_norm": 0.4488675892353058, "learning_rate": 0.00019346383014077372, "loss": 0.2278, "step": 1150 }, { "epoch": 0.2327464076097956, "eval_loss": 0.284952312707901, "eval_runtime": 0.7418, "eval_samples_per_second": 6.741, "eval_steps_per_second": 1.348, "step": 1150 }, { "epoch": 
0.23294879579032585, "grad_norm": 0.38737061619758606, "learning_rate": 0.00019345251479408528, "loss": 0.2428, "step": 1151 }, { "epoch": 0.2331511839708561, "grad_norm": 0.5436646342277527, "learning_rate": 0.00019344118999285202, "loss": 0.2633, "step": 1152 }, { "epoch": 0.23335357215138636, "grad_norm": 0.3771059215068817, "learning_rate": 0.00019342985573821963, "loss": 0.2295, "step": 1153 }, { "epoch": 0.2335559603319166, "grad_norm": 0.3182806372642517, "learning_rate": 0.00019341851203133476, "loss": 0.2379, "step": 1154 }, { "epoch": 0.23375834851244687, "grad_norm": 0.3835807740688324, "learning_rate": 0.0001934071588733451, "loss": 0.2702, "step": 1155 }, { "epoch": 0.23396073669297712, "grad_norm": 0.29284048080444336, "learning_rate": 0.0001933957962653992, "loss": 0.2558, "step": 1156 }, { "epoch": 0.23416312487350738, "grad_norm": 0.2697771489620209, "learning_rate": 0.00019338442420864665, "loss": 0.2362, "step": 1157 }, { "epoch": 0.23436551305403763, "grad_norm": 0.44306814670562744, "learning_rate": 0.0001933730427042379, "loss": 0.2421, "step": 1158 }, { "epoch": 0.2345679012345679, "grad_norm": 0.2870837152004242, "learning_rate": 0.00019336165175332445, "loss": 0.2491, "step": 1159 }, { "epoch": 0.23477028941509817, "grad_norm": 0.2829664647579193, "learning_rate": 0.00019335025135705869, "loss": 0.2482, "step": 1160 }, { "epoch": 0.23497267759562843, "grad_norm": 0.35894840955734253, "learning_rate": 0.00019333884151659402, "loss": 0.2268, "step": 1161 }, { "epoch": 0.23517506577615868, "grad_norm": 0.2977736294269562, "learning_rate": 0.0001933274222330848, "loss": 0.2323, "step": 1162 }, { "epoch": 0.23537745395668894, "grad_norm": 0.3158702552318573, "learning_rate": 0.00019331599350768622, "loss": 0.217, "step": 1163 }, { "epoch": 0.2355798421372192, "grad_norm": 0.252240926027298, "learning_rate": 0.00019330455534155458, "loss": 0.2308, "step": 1164 }, { "epoch": 0.23578223031774945, "grad_norm": 0.36202460527420044, "learning_rate": 
0.00019329310773584708, "loss": 0.2779, "step": 1165 }, { "epoch": 0.2359846184982797, "grad_norm": 0.6302611827850342, "learning_rate": 0.00019328165069172184, "loss": 0.3018, "step": 1166 }, { "epoch": 0.23618700667880996, "grad_norm": 0.2542176842689514, "learning_rate": 0.00019327018421033798, "loss": 0.2328, "step": 1167 }, { "epoch": 0.2363893948593402, "grad_norm": 0.29622212052345276, "learning_rate": 0.00019325870829285554, "loss": 0.2291, "step": 1168 }, { "epoch": 0.23659178303987047, "grad_norm": 0.27297911047935486, "learning_rate": 0.00019324722294043558, "loss": 0.2508, "step": 1169 }, { "epoch": 0.23679417122040072, "grad_norm": 0.4703531563282013, "learning_rate": 0.00019323572815424002, "loss": 0.3078, "step": 1170 }, { "epoch": 0.23699655940093098, "grad_norm": 0.34398144483566284, "learning_rate": 0.0001932242239354318, "loss": 0.2776, "step": 1171 }, { "epoch": 0.23719894758146123, "grad_norm": 0.27496278285980225, "learning_rate": 0.0001932127102851748, "loss": 0.2555, "step": 1172 }, { "epoch": 0.2374013357619915, "grad_norm": 0.2514081597328186, "learning_rate": 0.00019320118720463382, "loss": 0.261, "step": 1173 }, { "epoch": 0.23760372394252174, "grad_norm": 0.37026259303092957, "learning_rate": 0.0001931896546949747, "loss": 0.251, "step": 1174 }, { "epoch": 0.23780611212305203, "grad_norm": 0.28460580110549927, "learning_rate": 0.00019317811275736411, "loss": 0.2342, "step": 1175 }, { "epoch": 0.23800850030358228, "grad_norm": 0.43450552225112915, "learning_rate": 0.0001931665613929698, "loss": 0.2765, "step": 1176 }, { "epoch": 0.23821088848411254, "grad_norm": 0.27132248878479004, "learning_rate": 0.00019315500060296037, "loss": 0.2299, "step": 1177 }, { "epoch": 0.2384132766646428, "grad_norm": 0.340587317943573, "learning_rate": 0.00019314343038850546, "loss": 0.233, "step": 1178 }, { "epoch": 0.23861566484517305, "grad_norm": 0.29796266555786133, "learning_rate": 0.0001931318507507756, "loss": 0.2512, "step": 1179 }, { "epoch": 
0.2388180530257033, "grad_norm": 0.303588330745697, "learning_rate": 0.00019312026169094232, "loss": 0.2563, "step": 1180 }, { "epoch": 0.23902044120623356, "grad_norm": 0.3156834542751312, "learning_rate": 0.00019310866321017804, "loss": 0.2848, "step": 1181 }, { "epoch": 0.2392228293867638, "grad_norm": 0.3846820294857025, "learning_rate": 0.00019309705530965623, "loss": 0.2661, "step": 1182 }, { "epoch": 0.23942521756729407, "grad_norm": 0.3869188129901886, "learning_rate": 0.00019308543799055117, "loss": 0.2486, "step": 1183 }, { "epoch": 0.23962760574782432, "grad_norm": 0.22980014979839325, "learning_rate": 0.00019307381125403827, "loss": 0.2238, "step": 1184 }, { "epoch": 0.23982999392835458, "grad_norm": 0.43149107694625854, "learning_rate": 0.00019306217510129374, "loss": 0.2774, "step": 1185 }, { "epoch": 0.24003238210888483, "grad_norm": 0.3303294777870178, "learning_rate": 0.00019305052953349483, "loss": 0.2472, "step": 1186 }, { "epoch": 0.2402347702894151, "grad_norm": 0.3163663148880005, "learning_rate": 0.0001930388745518197, "loss": 0.2756, "step": 1187 }, { "epoch": 0.24043715846994534, "grad_norm": 0.26152318716049194, "learning_rate": 0.0001930272101574475, "loss": 0.2315, "step": 1188 }, { "epoch": 0.2406395466504756, "grad_norm": 0.3256945013999939, "learning_rate": 0.00019301553635155832, "loss": 0.2446, "step": 1189 }, { "epoch": 0.24084193483100588, "grad_norm": 0.4159514605998993, "learning_rate": 0.00019300385313533313, "loss": 0.2739, "step": 1190 }, { "epoch": 0.24104432301153614, "grad_norm": 0.37670719623565674, "learning_rate": 0.00019299216050995397, "loss": 0.2332, "step": 1191 }, { "epoch": 0.2412467111920664, "grad_norm": 0.35532158613204956, "learning_rate": 0.00019298045847660378, "loss": 0.2533, "step": 1192 }, { "epoch": 0.24144909937259665, "grad_norm": 0.3971513509750366, "learning_rate": 0.00019296874703646642, "loss": 0.2385, "step": 1193 }, { "epoch": 0.2416514875531269, "grad_norm": 0.32217881083488464, "learning_rate": 
0.00019295702619072675, "loss": 0.2215, "step": 1194 }, { "epoch": 0.24185387573365716, "grad_norm": 0.2856425344944, "learning_rate": 0.00019294529594057056, "loss": 0.2388, "step": 1195 }, { "epoch": 0.24205626391418741, "grad_norm": 0.34437721967697144, "learning_rate": 0.0001929335562871846, "loss": 0.2309, "step": 1196 }, { "epoch": 0.24225865209471767, "grad_norm": 0.32238832116127014, "learning_rate": 0.00019292180723175654, "loss": 0.2578, "step": 1197 }, { "epoch": 0.24246104027524792, "grad_norm": 0.27359238266944885, "learning_rate": 0.00019291004877547505, "loss": 0.2259, "step": 1198 }, { "epoch": 0.24266342845577818, "grad_norm": 0.2698918282985687, "learning_rate": 0.0001928982809195297, "loss": 0.198, "step": 1199 }, { "epoch": 0.24286581663630843, "grad_norm": 0.30236271023750305, "learning_rate": 0.00019288650366511108, "loss": 0.234, "step": 1200 }, { "epoch": 0.24286581663630843, "eval_loss": 0.30198293924331665, "eval_runtime": 0.74, "eval_samples_per_second": 6.757, "eval_steps_per_second": 1.351, "step": 1200 }, { "epoch": 0.2430682048168387, "grad_norm": 0.3634655773639679, "learning_rate": 0.00019287471701341064, "loss": 0.2727, "step": 1201 }, { "epoch": 0.24327059299736895, "grad_norm": 0.29206645488739014, "learning_rate": 0.00019286292096562087, "loss": 0.2519, "step": 1202 }, { "epoch": 0.2434729811778992, "grad_norm": 0.38446906208992004, "learning_rate": 0.00019285111552293517, "loss": 0.2559, "step": 1203 }, { "epoch": 0.24367536935842946, "grad_norm": 0.3113996982574463, "learning_rate": 0.0001928393006865479, "loss": 0.2475, "step": 1204 }, { "epoch": 0.24387775753895974, "grad_norm": 0.34893080592155457, "learning_rate": 0.00019282747645765427, "loss": 0.2776, "step": 1205 }, { "epoch": 0.24408014571949, "grad_norm": 0.3176359534263611, "learning_rate": 0.00019281564283745063, "loss": 0.2287, "step": 1206 }, { "epoch": 0.24428253390002025, "grad_norm": 0.3394664525985718, "learning_rate": 0.00019280379982713417, "loss": 0.2335, 
"step": 1207 }, { "epoch": 0.2444849220805505, "grad_norm": 0.35441362857818604, "learning_rate": 0.000192791947427903, "loss": 0.2258, "step": 1208 }, { "epoch": 0.24468731026108076, "grad_norm": 0.35951903462409973, "learning_rate": 0.00019278008564095628, "loss": 0.2358, "step": 1209 }, { "epoch": 0.24488969844161101, "grad_norm": 0.28018155694007874, "learning_rate": 0.00019276821446749398, "loss": 0.2826, "step": 1210 }, { "epoch": 0.24509208662214127, "grad_norm": 0.2939336597919464, "learning_rate": 0.00019275633390871717, "loss": 0.2652, "step": 1211 }, { "epoch": 0.24529447480267152, "grad_norm": 0.27605128288269043, "learning_rate": 0.0001927444439658278, "loss": 0.2196, "step": 1212 }, { "epoch": 0.24549686298320178, "grad_norm": 0.28433987498283386, "learning_rate": 0.00019273254464002867, "loss": 0.2266, "step": 1213 }, { "epoch": 0.24569925116373204, "grad_norm": 0.3328288793563843, "learning_rate": 0.00019272063593252377, "loss": 0.2314, "step": 1214 }, { "epoch": 0.2459016393442623, "grad_norm": 0.3645075261592865, "learning_rate": 0.0001927087178445178, "loss": 0.2484, "step": 1215 }, { "epoch": 0.24610402752479255, "grad_norm": 0.2907272279262543, "learning_rate": 0.00019269679037721654, "loss": 0.2077, "step": 1216 }, { "epoch": 0.2463064157053228, "grad_norm": 0.3099174201488495, "learning_rate": 0.00019268485353182672, "loss": 0.2581, "step": 1217 }, { "epoch": 0.24650880388585306, "grad_norm": 0.308207631111145, "learning_rate": 0.00019267290730955595, "loss": 0.2416, "step": 1218 }, { "epoch": 0.2467111920663833, "grad_norm": 0.2589235007762909, "learning_rate": 0.00019266095171161277, "loss": 0.2285, "step": 1219 }, { "epoch": 0.24691358024691357, "grad_norm": 0.4162542223930359, "learning_rate": 0.00019264898673920683, "loss": 0.2631, "step": 1220 }, { "epoch": 0.24711596842744385, "grad_norm": 0.37113407254219055, "learning_rate": 0.00019263701239354854, "loss": 0.2464, "step": 1221 }, { "epoch": 0.2473183566079741, "grad_norm": 
0.2872184216976166, "learning_rate": 0.0001926250286758494, "loss": 0.25, "step": 1222 }, { "epoch": 0.24752074478850436, "grad_norm": 0.3551543056964874, "learning_rate": 0.0001926130355873217, "loss": 0.2656, "step": 1223 }, { "epoch": 0.24772313296903462, "grad_norm": 0.3972734808921814, "learning_rate": 0.0001926010331291789, "loss": 0.226, "step": 1224 }, { "epoch": 0.24792552114956487, "grad_norm": 0.27993929386138916, "learning_rate": 0.00019258902130263517, "loss": 0.2283, "step": 1225 }, { "epoch": 0.24812790933009513, "grad_norm": 0.26829513907432556, "learning_rate": 0.00019257700010890581, "loss": 0.2235, "step": 1226 }, { "epoch": 0.24833029751062538, "grad_norm": 0.3441103398799896, "learning_rate": 0.00019256496954920697, "loss": 0.2574, "step": 1227 }, { "epoch": 0.24853268569115564, "grad_norm": 0.3490145206451416, "learning_rate": 0.0001925529296247558, "loss": 0.303, "step": 1228 }, { "epoch": 0.2487350738716859, "grad_norm": 0.33735692501068115, "learning_rate": 0.00019254088033677036, "loss": 0.2282, "step": 1229 }, { "epoch": 0.24893746205221615, "grad_norm": 0.2959878742694855, "learning_rate": 0.00019252882168646965, "loss": 0.2375, "step": 1230 }, { "epoch": 0.2491398502327464, "grad_norm": 0.37007415294647217, "learning_rate": 0.00019251675367507368, "loss": 0.2432, "step": 1231 }, { "epoch": 0.24934223841327666, "grad_norm": 0.3428284525871277, "learning_rate": 0.00019250467630380332, "loss": 0.2706, "step": 1232 }, { "epoch": 0.2495446265938069, "grad_norm": 0.3415639400482178, "learning_rate": 0.00019249258957388047, "loss": 0.2563, "step": 1233 }, { "epoch": 0.24974701477433717, "grad_norm": 0.34776055812835693, "learning_rate": 0.0001924804934865279, "loss": 0.28, "step": 1234 }, { "epoch": 0.24994940295486742, "grad_norm": 0.35156211256980896, "learning_rate": 0.0001924683880429694, "loss": 0.2157, "step": 1235 }, { "epoch": 0.2501517911353977, "grad_norm": 0.2750054597854614, "learning_rate": 0.00019245627324442966, "loss": 0.2524, 
"step": 1236 }, { "epoch": 0.25035417931592796, "grad_norm": 0.27282828092575073, "learning_rate": 0.0001924441490921343, "loss": 0.1987, "step": 1237 }, { "epoch": 0.2505565674964582, "grad_norm": 0.3300628960132599, "learning_rate": 0.00019243201558730996, "loss": 0.2745, "step": 1238 }, { "epoch": 0.25075895567698847, "grad_norm": 0.354278028011322, "learning_rate": 0.00019241987273118416, "loss": 0.2715, "step": 1239 }, { "epoch": 0.2509613438575187, "grad_norm": 0.2568090856075287, "learning_rate": 0.0001924077205249854, "loss": 0.2251, "step": 1240 }, { "epoch": 0.251163732038049, "grad_norm": 0.3392896354198456, "learning_rate": 0.00019239555896994308, "loss": 0.224, "step": 1241 }, { "epoch": 0.25136612021857924, "grad_norm": 0.2745809853076935, "learning_rate": 0.0001923833880672876, "loss": 0.2355, "step": 1242 }, { "epoch": 0.2515685083991095, "grad_norm": 0.4213380813598633, "learning_rate": 0.00019237120781825028, "loss": 0.242, "step": 1243 }, { "epoch": 0.25177089657963975, "grad_norm": 0.4063540995121002, "learning_rate": 0.00019235901822406342, "loss": 0.3008, "step": 1244 }, { "epoch": 0.25197328476017, "grad_norm": 0.3177233934402466, "learning_rate": 0.00019234681928596018, "loss": 0.2917, "step": 1245 }, { "epoch": 0.25217567294070026, "grad_norm": 0.2831498086452484, "learning_rate": 0.0001923346110051748, "loss": 0.224, "step": 1246 }, { "epoch": 0.2523780611212305, "grad_norm": 0.26334744691848755, "learning_rate": 0.00019232239338294225, "loss": 0.2505, "step": 1247 }, { "epoch": 0.25258044930176077, "grad_norm": 0.391368567943573, "learning_rate": 0.0001923101664204987, "loss": 0.2583, "step": 1248 }, { "epoch": 0.252782837482291, "grad_norm": 0.44112735986709595, "learning_rate": 0.00019229793011908114, "loss": 0.2568, "step": 1249 }, { "epoch": 0.2529852256628213, "grad_norm": 0.2900398373603821, "learning_rate": 0.00019228568447992746, "loss": 0.2362, "step": 1250 }, { "epoch": 0.2529852256628213, "eval_loss": 0.29207420349121094, 
"eval_runtime": 0.7379, "eval_samples_per_second": 6.776, "eval_steps_per_second": 1.355, "step": 1250 }, { "epoch": 0.25318761384335153, "grad_norm": 0.41520947217941284, "learning_rate": 0.00019227342950427657, "loss": 0.2584, "step": 1251 }, { "epoch": 0.2533900020238818, "grad_norm": 0.2793034613132477, "learning_rate": 0.0001922611651933683, "loss": 0.2594, "step": 1252 }, { "epoch": 0.25359239020441204, "grad_norm": 0.3332253396511078, "learning_rate": 0.00019224889154844342, "loss": 0.2877, "step": 1253 }, { "epoch": 0.2537947783849423, "grad_norm": 0.343176931142807, "learning_rate": 0.00019223660857074364, "loss": 0.2364, "step": 1254 }, { "epoch": 0.25399716656547255, "grad_norm": 0.3649160861968994, "learning_rate": 0.00019222431626151164, "loss": 0.2773, "step": 1255 }, { "epoch": 0.2541995547460028, "grad_norm": 0.29223141074180603, "learning_rate": 0.00019221201462199102, "loss": 0.2687, "step": 1256 }, { "epoch": 0.25440194292653306, "grad_norm": 0.3030182421207428, "learning_rate": 0.00019219970365342635, "loss": 0.2394, "step": 1257 }, { "epoch": 0.2546043311070634, "grad_norm": 0.3496536910533905, "learning_rate": 0.00019218738335706305, "loss": 0.2833, "step": 1258 }, { "epoch": 0.25480671928759363, "grad_norm": 0.28787243366241455, "learning_rate": 0.00019217505373414766, "loss": 0.2556, "step": 1259 }, { "epoch": 0.2550091074681239, "grad_norm": 0.36825138330459595, "learning_rate": 0.0001921627147859275, "loss": 0.2762, "step": 1260 }, { "epoch": 0.25521149564865414, "grad_norm": 0.2672223150730133, "learning_rate": 0.0001921503665136509, "loss": 0.2851, "step": 1261 }, { "epoch": 0.2554138838291844, "grad_norm": 0.2490050494670868, "learning_rate": 0.00019213800891856717, "loss": 0.2449, "step": 1262 }, { "epoch": 0.25561627200971465, "grad_norm": 0.3491511940956116, "learning_rate": 0.0001921256420019265, "loss": 0.2671, "step": 1263 }, { "epoch": 0.2558186601902449, "grad_norm": 0.31115198135375977, "learning_rate": 0.00019211326576497998, 
"loss": 0.1985, "step": 1264 }, { "epoch": 0.25602104837077516, "grad_norm": 0.3278442621231079, "learning_rate": 0.0001921008802089798, "loss": 0.2285, "step": 1265 }, { "epoch": 0.2562234365513054, "grad_norm": 0.41726815700531006, "learning_rate": 0.00019208848533517893, "loss": 0.2733, "step": 1266 }, { "epoch": 0.25642582473183567, "grad_norm": 0.29319825768470764, "learning_rate": 0.00019207608114483145, "loss": 0.2535, "step": 1267 }, { "epoch": 0.2566282129123659, "grad_norm": 0.33001402020454407, "learning_rate": 0.00019206366763919216, "loss": 0.2332, "step": 1268 }, { "epoch": 0.2568306010928962, "grad_norm": 0.3235301375389099, "learning_rate": 0.00019205124481951703, "loss": 0.2793, "step": 1269 }, { "epoch": 0.25703298927342644, "grad_norm": 0.3671470880508423, "learning_rate": 0.0001920388126870628, "loss": 0.2597, "step": 1270 }, { "epoch": 0.2572353774539567, "grad_norm": 0.3819235861301422, "learning_rate": 0.00019202637124308728, "loss": 0.2557, "step": 1271 }, { "epoch": 0.25743776563448695, "grad_norm": 0.39303115010261536, "learning_rate": 0.00019201392048884914, "loss": 0.2645, "step": 1272 }, { "epoch": 0.2576401538150172, "grad_norm": 0.2981061041355133, "learning_rate": 0.000192001460425608, "loss": 0.2781, "step": 1273 }, { "epoch": 0.25784254199554746, "grad_norm": 0.2990420162677765, "learning_rate": 0.00019198899105462447, "loss": 0.2646, "step": 1274 }, { "epoch": 0.2580449301760777, "grad_norm": 0.39371538162231445, "learning_rate": 0.00019197651237716005, "loss": 0.2723, "step": 1275 }, { "epoch": 0.25824731835660797, "grad_norm": 0.3517228364944458, "learning_rate": 0.0001919640243944772, "loss": 0.2718, "step": 1276 }, { "epoch": 0.2584497065371382, "grad_norm": 0.30374854803085327, "learning_rate": 0.00019195152710783933, "loss": 0.2358, "step": 1277 }, { "epoch": 0.2586520947176685, "grad_norm": 0.3416211009025574, "learning_rate": 0.0001919390205185108, "loss": 0.2693, "step": 1278 }, { "epoch": 0.25885448289819873, 
"grad_norm": 0.2955116033554077, "learning_rate": 0.00019192650462775688, "loss": 0.2419, "step": 1279 }, { "epoch": 0.259056871078729, "grad_norm": 0.30653461813926697, "learning_rate": 0.00019191397943684377, "loss": 0.2676, "step": 1280 }, { "epoch": 0.25925925925925924, "grad_norm": 0.23594805598258972, "learning_rate": 0.00019190144494703865, "loss": 0.2407, "step": 1281 }, { "epoch": 0.2594616474397895, "grad_norm": 0.4294579029083252, "learning_rate": 0.00019188890115960967, "loss": 0.2699, "step": 1282 }, { "epoch": 0.25966403562031976, "grad_norm": 0.362417072057724, "learning_rate": 0.00019187634807582587, "loss": 0.2416, "step": 1283 }, { "epoch": 0.25986642380085, "grad_norm": 0.3066248595714569, "learning_rate": 0.00019186378569695716, "loss": 0.2449, "step": 1284 }, { "epoch": 0.26006881198138027, "grad_norm": 0.3117138743400574, "learning_rate": 0.00019185121402427458, "loss": 0.2798, "step": 1285 }, { "epoch": 0.2602712001619105, "grad_norm": 0.33185073733329773, "learning_rate": 0.00019183863305904995, "loss": 0.2752, "step": 1286 }, { "epoch": 0.2604735883424408, "grad_norm": 0.36043572425842285, "learning_rate": 0.00019182604280255604, "loss": 0.2694, "step": 1287 }, { "epoch": 0.26067597652297103, "grad_norm": 0.2410745471715927, "learning_rate": 0.00019181344325606666, "loss": 0.1937, "step": 1288 }, { "epoch": 0.26087836470350134, "grad_norm": 0.29702746868133545, "learning_rate": 0.0001918008344208565, "loss": 0.2393, "step": 1289 }, { "epoch": 0.2610807528840316, "grad_norm": 0.2712313234806061, "learning_rate": 0.00019178821629820117, "loss": 0.2199, "step": 1290 }, { "epoch": 0.26128314106456185, "grad_norm": 0.25044986605644226, "learning_rate": 0.0001917755888893772, "loss": 0.2121, "step": 1291 }, { "epoch": 0.2614855292450921, "grad_norm": 0.2928940951824188, "learning_rate": 0.00019176295219566213, "loss": 0.2684, "step": 1292 }, { "epoch": 0.26168791742562236, "grad_norm": 0.2997845709323883, "learning_rate": 0.00019175030621833446, 
"loss": 0.2613, "step": 1293 }, { "epoch": 0.2618903056061526, "grad_norm": 0.3501720130443573, "learning_rate": 0.0001917376509586735, "loss": 0.2604, "step": 1294 }, { "epoch": 0.2620926937866829, "grad_norm": 0.28727906942367554, "learning_rate": 0.00019172498641795963, "loss": 0.2018, "step": 1295 }, { "epoch": 0.26229508196721313, "grad_norm": 0.2778218984603882, "learning_rate": 0.00019171231259747405, "loss": 0.2164, "step": 1296 }, { "epoch": 0.2624974701477434, "grad_norm": 0.42163240909576416, "learning_rate": 0.00019169962949849904, "loss": 0.2536, "step": 1297 }, { "epoch": 0.26269985832827364, "grad_norm": 0.27423128485679626, "learning_rate": 0.00019168693712231773, "loss": 0.2335, "step": 1298 }, { "epoch": 0.2629022465088039, "grad_norm": 0.3440936505794525, "learning_rate": 0.00019167423547021418, "loss": 0.2326, "step": 1299 }, { "epoch": 0.26310463468933415, "grad_norm": 0.5512051582336426, "learning_rate": 0.00019166152454347336, "loss": 0.2086, "step": 1300 }, { "epoch": 0.26310463468933415, "eval_loss": 0.29028138518333435, "eval_runtime": 0.7404, "eval_samples_per_second": 6.753, "eval_steps_per_second": 1.351, "step": 1300 }, { "epoch": 0.2633070228698644, "grad_norm": 0.48292115330696106, "learning_rate": 0.00019164880434338133, "loss": 0.3138, "step": 1301 }, { "epoch": 0.26350941105039466, "grad_norm": 0.4760967791080475, "learning_rate": 0.00019163607487122494, "loss": 0.2372, "step": 1302 }, { "epoch": 0.2637117992309249, "grad_norm": 0.349447101354599, "learning_rate": 0.00019162333612829198, "loss": 0.263, "step": 1303 }, { "epoch": 0.26391418741145517, "grad_norm": 0.39979079365730286, "learning_rate": 0.00019161058811587126, "loss": 0.2567, "step": 1304 }, { "epoch": 0.2641165755919854, "grad_norm": 0.32025107741355896, "learning_rate": 0.00019159783083525254, "loss": 0.232, "step": 1305 }, { "epoch": 0.2643189637725157, "grad_norm": 0.3130270838737488, "learning_rate": 0.0001915850642877264, "loss": 0.2278, "step": 1306 }, { 
"epoch": 0.26452135195304594, "grad_norm": 0.46600160002708435, "learning_rate": 0.00019157228847458443, "loss": 0.258, "step": 1307 }, { "epoch": 0.2647237401335762, "grad_norm": 0.4704112410545349, "learning_rate": 0.00019155950339711918, "loss": 0.2413, "step": 1308 }, { "epoch": 0.26492612831410645, "grad_norm": 0.6043704748153687, "learning_rate": 0.00019154670905662414, "loss": 0.2561, "step": 1309 }, { "epoch": 0.2651285164946367, "grad_norm": 0.3092261254787445, "learning_rate": 0.00019153390545439362, "loss": 0.2234, "step": 1310 }, { "epoch": 0.26533090467516696, "grad_norm": 0.35605481266975403, "learning_rate": 0.00019152109259172302, "loss": 0.2598, "step": 1311 }, { "epoch": 0.2655332928556972, "grad_norm": 0.375472754240036, "learning_rate": 0.00019150827046990858, "loss": 0.2651, "step": 1312 }, { "epoch": 0.26573568103622747, "grad_norm": 0.41777515411376953, "learning_rate": 0.00019149543909024753, "loss": 0.2597, "step": 1313 }, { "epoch": 0.2659380692167577, "grad_norm": 0.41457483172416687, "learning_rate": 0.000191482598454038, "loss": 0.2745, "step": 1314 }, { "epoch": 0.266140457397288, "grad_norm": 0.3231428265571594, "learning_rate": 0.00019146974856257905, "loss": 0.2431, "step": 1315 }, { "epoch": 0.26634284557781823, "grad_norm": 0.3488149046897888, "learning_rate": 0.00019145688941717075, "loss": 0.2137, "step": 1316 }, { "epoch": 0.2665452337583485, "grad_norm": 0.42718151211738586, "learning_rate": 0.000191444021019114, "loss": 0.2431, "step": 1317 }, { "epoch": 0.26674762193887874, "grad_norm": 0.27386337518692017, "learning_rate": 0.00019143114336971073, "loss": 0.2168, "step": 1318 }, { "epoch": 0.26695001011940905, "grad_norm": 0.3161386251449585, "learning_rate": 0.00019141825647026376, "loss": 0.2723, "step": 1319 }, { "epoch": 0.2671523982999393, "grad_norm": 0.27285152673721313, "learning_rate": 0.0001914053603220768, "loss": 0.2013, "step": 1320 }, { "epoch": 0.26735478648046956, "grad_norm": 0.321103572845459, 
"learning_rate": 0.00019139245492645466, "loss": 0.2282, "step": 1321 }, { "epoch": 0.2675571746609998, "grad_norm": 0.39927050471305847, "learning_rate": 0.00019137954028470284, "loss": 0.2403, "step": 1322 }, { "epoch": 0.2677595628415301, "grad_norm": 0.4967322051525116, "learning_rate": 0.00019136661639812798, "loss": 0.2373, "step": 1323 }, { "epoch": 0.26796195102206033, "grad_norm": 0.3009711503982544, "learning_rate": 0.0001913536832680376, "loss": 0.2455, "step": 1324 }, { "epoch": 0.2681643392025906, "grad_norm": 0.27346163988113403, "learning_rate": 0.00019134074089574007, "loss": 0.2134, "step": 1325 }, { "epoch": 0.26836672738312084, "grad_norm": 0.24805238842964172, "learning_rate": 0.00019132778928254485, "loss": 0.2317, "step": 1326 }, { "epoch": 0.2685691155636511, "grad_norm": 0.251960426568985, "learning_rate": 0.00019131482842976217, "loss": 0.2462, "step": 1327 }, { "epoch": 0.26877150374418135, "grad_norm": 0.34561920166015625, "learning_rate": 0.0001913018583387033, "loss": 0.2778, "step": 1328 }, { "epoch": 0.2689738919247116, "grad_norm": 0.26779308915138245, "learning_rate": 0.00019128887901068045, "loss": 0.2451, "step": 1329 }, { "epoch": 0.26917628010524186, "grad_norm": 0.28858324885368347, "learning_rate": 0.00019127589044700668, "loss": 0.2433, "step": 1330 }, { "epoch": 0.2693786682857721, "grad_norm": 0.3663479685783386, "learning_rate": 0.00019126289264899607, "loss": 0.2349, "step": 1331 }, { "epoch": 0.26958105646630237, "grad_norm": 0.35032492876052856, "learning_rate": 0.0001912498856179636, "loss": 0.241, "step": 1332 }, { "epoch": 0.2697834446468326, "grad_norm": 0.26940011978149414, "learning_rate": 0.00019123686935522516, "loss": 0.2337, "step": 1333 }, { "epoch": 0.2699858328273629, "grad_norm": 0.30461275577545166, "learning_rate": 0.00019122384386209765, "loss": 0.2457, "step": 1334 }, { "epoch": 0.27018822100789314, "grad_norm": 0.27946141362190247, "learning_rate": 0.0001912108091398988, "loss": 0.2095, "step": 1335 
}, { "epoch": 0.2703906091884234, "grad_norm": 0.3439542353153229, "learning_rate": 0.00019119776518994734, "loss": 0.2673, "step": 1336 }, { "epoch": 0.27059299736895365, "grad_norm": 0.6692569255828857, "learning_rate": 0.00019118471201356291, "loss": 0.2584, "step": 1337 }, { "epoch": 0.2707953855494839, "grad_norm": 0.30140796303749084, "learning_rate": 0.00019117164961206614, "loss": 0.2286, "step": 1338 }, { "epoch": 0.27099777373001416, "grad_norm": 0.3491653800010681, "learning_rate": 0.0001911585779867785, "loss": 0.2616, "step": 1339 }, { "epoch": 0.2712001619105444, "grad_norm": 0.27138373255729675, "learning_rate": 0.00019114549713902245, "loss": 0.2468, "step": 1340 }, { "epoch": 0.27140255009107467, "grad_norm": 0.41578251123428345, "learning_rate": 0.0001911324070701214, "loss": 0.2548, "step": 1341 }, { "epoch": 0.2716049382716049, "grad_norm": 0.28390955924987793, "learning_rate": 0.0001911193077813996, "loss": 0.2046, "step": 1342 }, { "epoch": 0.2718073264521352, "grad_norm": 0.32496026158332825, "learning_rate": 0.00019110619927418238, "loss": 0.2447, "step": 1343 }, { "epoch": 0.27200971463266543, "grad_norm": 0.29900750517845154, "learning_rate": 0.0001910930815497959, "loss": 0.2665, "step": 1344 }, { "epoch": 0.2722121028131957, "grad_norm": 0.441050261259079, "learning_rate": 0.00019107995460956723, "loss": 0.265, "step": 1345 }, { "epoch": 0.27241449099372594, "grad_norm": 0.3301616609096527, "learning_rate": 0.00019106681845482444, "loss": 0.2637, "step": 1346 }, { "epoch": 0.2726168791742562, "grad_norm": 0.3114319443702698, "learning_rate": 0.0001910536730868965, "loss": 0.2217, "step": 1347 }, { "epoch": 0.27281926735478645, "grad_norm": 0.38809117674827576, "learning_rate": 0.00019104051850711334, "loss": 0.2534, "step": 1348 }, { "epoch": 0.27302165553531677, "grad_norm": 0.321119487285614, "learning_rate": 0.0001910273547168058, "loss": 0.2569, "step": 1349 }, { "epoch": 0.273224043715847, "grad_norm": 0.2682242691516876, 
"learning_rate": 0.00019101418171730568, "loss": 0.2079, "step": 1350 }, { "epoch": 0.273224043715847, "eval_loss": 0.2806507647037506, "eval_runtime": 0.7397, "eval_samples_per_second": 6.76, "eval_steps_per_second": 1.352, "step": 1350 }, { "epoch": 0.2734264318963773, "grad_norm": 0.32450467348098755, "learning_rate": 0.00019100099950994563, "loss": 0.2446, "step": 1351 }, { "epoch": 0.27362882007690753, "grad_norm": 0.3531615138053894, "learning_rate": 0.00019098780809605933, "loss": 0.2664, "step": 1352 }, { "epoch": 0.2738312082574378, "grad_norm": 0.4027129113674164, "learning_rate": 0.0001909746074769813, "loss": 0.2531, "step": 1353 }, { "epoch": 0.27403359643796804, "grad_norm": 0.27756667137145996, "learning_rate": 0.0001909613976540471, "loss": 0.2147, "step": 1354 }, { "epoch": 0.2742359846184983, "grad_norm": 0.3009014427661896, "learning_rate": 0.00019094817862859312, "loss": 0.2544, "step": 1355 }, { "epoch": 0.27443837279902855, "grad_norm": 0.3375817537307739, "learning_rate": 0.00019093495040195673, "loss": 0.2391, "step": 1356 }, { "epoch": 0.2746407609795588, "grad_norm": 0.2892801761627197, "learning_rate": 0.0001909217129754762, "loss": 0.2123, "step": 1357 }, { "epoch": 0.27484314916008906, "grad_norm": 0.3353901207447052, "learning_rate": 0.00019090846635049085, "loss": 0.2591, "step": 1358 }, { "epoch": 0.2750455373406193, "grad_norm": 0.32220199704170227, "learning_rate": 0.00019089521052834073, "loss": 0.2508, "step": 1359 }, { "epoch": 0.2752479255211496, "grad_norm": 0.3636913299560547, "learning_rate": 0.00019088194551036695, "loss": 0.2253, "step": 1360 }, { "epoch": 0.2754503137016798, "grad_norm": 0.32039812207221985, "learning_rate": 0.00019086867129791154, "loss": 0.2578, "step": 1361 }, { "epoch": 0.2756527018822101, "grad_norm": 0.2603769302368164, "learning_rate": 0.00019085538789231742, "loss": 0.242, "step": 1362 }, { "epoch": 0.27585509006274034, "grad_norm": 0.34952616691589355, "learning_rate": 0.00019084209529492854, 
"loss": 0.2729, "step": 1363 }, { "epoch": 0.2760574782432706, "grad_norm": 0.40078848600387573, "learning_rate": 0.0001908287935070896, "loss": 0.2578, "step": 1364 }, { "epoch": 0.27625986642380085, "grad_norm": 0.3610617518424988, "learning_rate": 0.00019081548253014642, "loss": 0.2614, "step": 1365 }, { "epoch": 0.2764622546043311, "grad_norm": 0.3041462004184723, "learning_rate": 0.00019080216236544567, "loss": 0.2666, "step": 1366 }, { "epoch": 0.27666464278486136, "grad_norm": 0.28576037287712097, "learning_rate": 0.00019078883301433484, "loss": 0.232, "step": 1367 }, { "epoch": 0.2768670309653916, "grad_norm": 0.40742477774620056, "learning_rate": 0.00019077549447816256, "loss": 0.241, "step": 1368 }, { "epoch": 0.27706941914592187, "grad_norm": 0.5294364094734192, "learning_rate": 0.00019076214675827825, "loss": 0.2099, "step": 1369 }, { "epoch": 0.2772718073264521, "grad_norm": 0.32854121923446655, "learning_rate": 0.00019074878985603227, "loss": 0.2925, "step": 1370 }, { "epoch": 0.2774741955069824, "grad_norm": 0.37988781929016113, "learning_rate": 0.00019073542377277597, "loss": 0.2368, "step": 1371 }, { "epoch": 0.27767658368751263, "grad_norm": 0.30840012431144714, "learning_rate": 0.00019072204850986154, "loss": 0.2437, "step": 1372 }, { "epoch": 0.2778789718680429, "grad_norm": 0.3330319821834564, "learning_rate": 0.0001907086640686422, "loss": 0.2713, "step": 1373 }, { "epoch": 0.27808136004857315, "grad_norm": 0.31399691104888916, "learning_rate": 0.000190695270450472, "loss": 0.2816, "step": 1374 }, { "epoch": 0.2782837482291034, "grad_norm": 0.3240007162094116, "learning_rate": 0.000190681867656706, "loss": 0.2752, "step": 1375 }, { "epoch": 0.27848613640963366, "grad_norm": 0.3109317421913147, "learning_rate": 0.00019066845568870014, "loss": 0.2277, "step": 1376 }, { "epoch": 0.2786885245901639, "grad_norm": 0.34070509672164917, "learning_rate": 0.00019065503454781133, "loss": 0.2441, "step": 1377 }, { "epoch": 0.27889091277069417, 
"grad_norm": 0.42925670742988586, "learning_rate": 0.00019064160423539733, "loss": 0.2751, "step": 1378 }, { "epoch": 0.2790933009512244, "grad_norm": 0.3259209990501404, "learning_rate": 0.00019062816475281692, "loss": 0.2704, "step": 1379 }, { "epoch": 0.27929568913175473, "grad_norm": 0.36306875944137573, "learning_rate": 0.00019061471610142976, "loss": 0.2935, "step": 1380 }, { "epoch": 0.279498077312285, "grad_norm": 0.3489514887332916, "learning_rate": 0.00019060125828259641, "loss": 0.2218, "step": 1381 }, { "epoch": 0.27970046549281524, "grad_norm": 0.27014103531837463, "learning_rate": 0.00019058779129767843, "loss": 0.2545, "step": 1382 }, { "epoch": 0.2799028536733455, "grad_norm": 0.30571702122688293, "learning_rate": 0.00019057431514803825, "loss": 0.2527, "step": 1383 }, { "epoch": 0.28010524185387575, "grad_norm": 0.3467295169830322, "learning_rate": 0.00019056082983503924, "loss": 0.239, "step": 1384 }, { "epoch": 0.280307630034406, "grad_norm": 0.301474392414093, "learning_rate": 0.00019054733536004575, "loss": 0.2154, "step": 1385 }, { "epoch": 0.28051001821493626, "grad_norm": 0.3737625479698181, "learning_rate": 0.00019053383172442292, "loss": 0.2965, "step": 1386 }, { "epoch": 0.2807124063954665, "grad_norm": 0.3446069359779358, "learning_rate": 0.00019052031892953698, "loss": 0.2356, "step": 1387 }, { "epoch": 0.2809147945759968, "grad_norm": 0.3060499429702759, "learning_rate": 0.000190506796976755, "loss": 0.2277, "step": 1388 }, { "epoch": 0.28111718275652703, "grad_norm": 0.3208872377872467, "learning_rate": 0.00019049326586744497, "loss": 0.2256, "step": 1389 }, { "epoch": 0.2813195709370573, "grad_norm": 0.5294331312179565, "learning_rate": 0.00019047972560297583, "loss": 0.264, "step": 1390 }, { "epoch": 0.28152195911758754, "grad_norm": 0.5644270181655884, "learning_rate": 0.00019046617618471745, "loss": 0.2347, "step": 1391 }, { "epoch": 0.2817243472981178, "grad_norm": 0.31018948554992676, "learning_rate": 0.0001904526176140406, 
"loss": 0.2504, "step": 1392 }, { "epoch": 0.28192673547864805, "grad_norm": 0.30512624979019165, "learning_rate": 0.00019043904989231701, "loss": 0.2609, "step": 1393 }, { "epoch": 0.2821291236591783, "grad_norm": 0.28403523564338684, "learning_rate": 0.00019042547302091934, "loss": 0.2432, "step": 1394 }, { "epoch": 0.28233151183970856, "grad_norm": 0.31579962372779846, "learning_rate": 0.00019041188700122112, "loss": 0.2721, "step": 1395 }, { "epoch": 0.2825339000202388, "grad_norm": 0.3145096004009247, "learning_rate": 0.00019039829183459687, "loss": 0.2439, "step": 1396 }, { "epoch": 0.28273628820076907, "grad_norm": 0.40980634093284607, "learning_rate": 0.00019038468752242198, "loss": 0.2439, "step": 1397 }, { "epoch": 0.2829386763812993, "grad_norm": 0.47898849844932556, "learning_rate": 0.0001903710740660728, "loss": 0.2627, "step": 1398 }, { "epoch": 0.2831410645618296, "grad_norm": 0.2893619239330292, "learning_rate": 0.00019035745146692658, "loss": 0.2028, "step": 1399 }, { "epoch": 0.28334345274235984, "grad_norm": 0.2524210512638092, "learning_rate": 0.00019034381972636157, "loss": 0.2382, "step": 1400 }, { "epoch": 0.28334345274235984, "eval_loss": 0.28519296646118164, "eval_runtime": 0.736, "eval_samples_per_second": 6.794, "eval_steps_per_second": 1.359, "step": 1400 }, { "epoch": 0.2835458409228901, "grad_norm": 0.3225509524345398, "learning_rate": 0.0001903301788457568, "loss": 0.2582, "step": 1401 }, { "epoch": 0.28374822910342035, "grad_norm": 0.5756785273551941, "learning_rate": 0.00019031652882649241, "loss": 0.2918, "step": 1402 }, { "epoch": 0.2839506172839506, "grad_norm": 0.35944122076034546, "learning_rate": 0.00019030286966994928, "loss": 0.271, "step": 1403 }, { "epoch": 0.28415300546448086, "grad_norm": 0.25696319341659546, "learning_rate": 0.00019028920137750935, "loss": 0.2162, "step": 1404 }, { "epoch": 0.2843553936450111, "grad_norm": 0.41459545493125916, "learning_rate": 0.00019027552395055542, "loss": 0.276, "step": 1405 }, { 
"epoch": 0.28455778182554137, "grad_norm": 0.33863842487335205, "learning_rate": 0.0001902618373904712, "loss": 0.2558, "step": 1406 }, { "epoch": 0.2847601700060716, "grad_norm": 0.3595339059829712, "learning_rate": 0.0001902481416986414, "loss": 0.2439, "step": 1407 }, { "epoch": 0.2849625581866019, "grad_norm": 0.3099048137664795, "learning_rate": 0.00019023443687645158, "loss": 0.2133, "step": 1408 }, { "epoch": 0.28516494636713213, "grad_norm": 0.3222504258155823, "learning_rate": 0.00019022072292528827, "loss": 0.2519, "step": 1409 }, { "epoch": 0.28536733454766244, "grad_norm": 0.2719772458076477, "learning_rate": 0.00019020699984653887, "loss": 0.2447, "step": 1410 }, { "epoch": 0.2855697227281927, "grad_norm": 0.35438042879104614, "learning_rate": 0.00019019326764159176, "loss": 0.2485, "step": 1411 }, { "epoch": 0.28577211090872295, "grad_norm": 0.3307201862335205, "learning_rate": 0.00019017952631183622, "loss": 0.2474, "step": 1412 }, { "epoch": 0.2859744990892532, "grad_norm": 0.32247617840766907, "learning_rate": 0.00019016577585866245, "loss": 0.2476, "step": 1413 }, { "epoch": 0.28617688726978346, "grad_norm": 0.3016018867492676, "learning_rate": 0.0001901520162834616, "loss": 0.258, "step": 1414 }, { "epoch": 0.2863792754503137, "grad_norm": 0.32061678171157837, "learning_rate": 0.00019013824758762565, "loss": 0.2602, "step": 1415 }, { "epoch": 0.286581663630844, "grad_norm": 0.3942313492298126, "learning_rate": 0.00019012446977254767, "loss": 0.2585, "step": 1416 }, { "epoch": 0.28678405181137423, "grad_norm": 0.5417084097862244, "learning_rate": 0.00019011068283962147, "loss": 0.2643, "step": 1417 }, { "epoch": 0.2869864399919045, "grad_norm": 0.5987349152565002, "learning_rate": 0.0001900968867902419, "loss": 0.2477, "step": 1418 }, { "epoch": 0.28718882817243474, "grad_norm": 0.30098897218704224, "learning_rate": 0.00019008308162580474, "loss": 0.278, "step": 1419 }, { "epoch": 0.287391216352965, "grad_norm": 0.27245810627937317, 
"learning_rate": 0.0001900692673477066, "loss": 0.2326, "step": 1420 }, { "epoch": 0.28759360453349525, "grad_norm": 0.2615533173084259, "learning_rate": 0.00019005544395734502, "loss": 0.211, "step": 1421 }, { "epoch": 0.2877959927140255, "grad_norm": 0.3715825378894806, "learning_rate": 0.00019004161145611863, "loss": 0.2878, "step": 1422 }, { "epoch": 0.28799838089455576, "grad_norm": 0.35649946331977844, "learning_rate": 0.00019002776984542675, "loss": 0.2386, "step": 1423 }, { "epoch": 0.288200769075086, "grad_norm": 0.3113993704319, "learning_rate": 0.0001900139191266698, "loss": 0.2379, "step": 1424 }, { "epoch": 0.28840315725561627, "grad_norm": 0.3101493716239929, "learning_rate": 0.00019000005930124898, "loss": 0.2639, "step": 1425 }, { "epoch": 0.2886055454361465, "grad_norm": 0.2791244387626648, "learning_rate": 0.00018998619037056654, "loss": 0.2283, "step": 1426 }, { "epoch": 0.2888079336166768, "grad_norm": 0.3837342858314514, "learning_rate": 0.00018997231233602556, "loss": 0.2673, "step": 1427 }, { "epoch": 0.28901032179720704, "grad_norm": 0.39756709337234497, "learning_rate": 0.00018995842519903012, "loss": 0.2601, "step": 1428 }, { "epoch": 0.2892127099777373, "grad_norm": 0.25341853499412537, "learning_rate": 0.0001899445289609851, "loss": 0.1997, "step": 1429 }, { "epoch": 0.28941509815826755, "grad_norm": 0.2866188585758209, "learning_rate": 0.00018993062362329641, "loss": 0.2308, "step": 1430 }, { "epoch": 0.2896174863387978, "grad_norm": 0.34560084342956543, "learning_rate": 0.0001899167091873709, "loss": 0.2422, "step": 1431 }, { "epoch": 0.28981987451932806, "grad_norm": 0.3137929439544678, "learning_rate": 0.00018990278565461622, "loss": 0.2295, "step": 1432 }, { "epoch": 0.2900222626998583, "grad_norm": 0.35084256529808044, "learning_rate": 0.00018988885302644102, "loss": 0.303, "step": 1433 }, { "epoch": 0.29022465088038857, "grad_norm": 0.36562684178352356, "learning_rate": 0.00018987491130425488, "loss": 0.2403, "step": 1434 }, { 
"epoch": 0.2904270390609188, "grad_norm": 0.2591904103755951, "learning_rate": 0.00018986096048946824, "loss": 0.2006, "step": 1435 }, { "epoch": 0.2906294272414491, "grad_norm": 0.4177074432373047, "learning_rate": 0.00018984700058349252, "loss": 0.2805, "step": 1436 }, { "epoch": 0.29083181542197933, "grad_norm": 0.5934389233589172, "learning_rate": 0.00018983303158774003, "loss": 0.2531, "step": 1437 }, { "epoch": 0.2910342036025096, "grad_norm": 0.3575098216533661, "learning_rate": 0.00018981905350362404, "loss": 0.2335, "step": 1438 }, { "epoch": 0.29123659178303984, "grad_norm": 0.38351020216941833, "learning_rate": 0.00018980506633255864, "loss": 0.2444, "step": 1439 }, { "epoch": 0.29143897996357016, "grad_norm": 0.35039186477661133, "learning_rate": 0.00018979107007595895, "loss": 0.2598, "step": 1440 }, { "epoch": 0.2916413681441004, "grad_norm": 0.297146737575531, "learning_rate": 0.000189777064735241, "loss": 0.2332, "step": 1441 }, { "epoch": 0.29184375632463067, "grad_norm": 0.39954692125320435, "learning_rate": 0.0001897630503118216, "loss": 0.249, "step": 1442 }, { "epoch": 0.2920461445051609, "grad_norm": 0.549950122833252, "learning_rate": 0.0001897490268071187, "loss": 0.2489, "step": 1443 }, { "epoch": 0.2922485326856912, "grad_norm": 0.3395889103412628, "learning_rate": 0.00018973499422255094, "loss": 0.2877, "step": 1444 }, { "epoch": 0.29245092086622143, "grad_norm": 0.26225653290748596, "learning_rate": 0.00018972095255953808, "loss": 0.2412, "step": 1445 }, { "epoch": 0.2926533090467517, "grad_norm": 0.4954787790775299, "learning_rate": 0.00018970690181950066, "loss": 0.2555, "step": 1446 }, { "epoch": 0.29285569722728194, "grad_norm": 0.3388492166996002, "learning_rate": 0.0001896928420038602, "loss": 0.2505, "step": 1447 }, { "epoch": 0.2930580854078122, "grad_norm": 0.5826833844184875, "learning_rate": 0.00018967877311403913, "loss": 0.2618, "step": 1448 }, { "epoch": 0.29326047358834245, "grad_norm": 0.4034349024295807, "learning_rate": 
0.00018966469515146076, "loss": 0.2711, "step": 1449 }, { "epoch": 0.2934628617688727, "grad_norm": 0.468124121427536, "learning_rate": 0.00018965060811754937, "loss": 0.268, "step": 1450 }, { "epoch": 0.2934628617688727, "eval_loss": 0.29375138878822327, "eval_runtime": 0.738, "eval_samples_per_second": 6.775, "eval_steps_per_second": 1.355, "step": 1450 }, { "epoch": 0.29366524994940296, "grad_norm": 0.4334132671356201, "learning_rate": 0.00018963651201373019, "loss": 0.2469, "step": 1451 }, { "epoch": 0.2938676381299332, "grad_norm": 0.3310418426990509, "learning_rate": 0.00018962240684142922, "loss": 0.2579, "step": 1452 }, { "epoch": 0.2940700263104635, "grad_norm": 0.3076673150062561, "learning_rate": 0.00018960829260207356, "loss": 0.2619, "step": 1453 }, { "epoch": 0.29427241449099373, "grad_norm": 0.3559109568595886, "learning_rate": 0.0001895941692970911, "loss": 0.2869, "step": 1454 }, { "epoch": 0.294474802671524, "grad_norm": 0.5222853422164917, "learning_rate": 0.00018958003692791066, "loss": 0.2916, "step": 1455 }, { "epoch": 0.29467719085205424, "grad_norm": 0.3845933675765991, "learning_rate": 0.00018956589549596207, "loss": 0.2368, "step": 1456 }, { "epoch": 0.2948795790325845, "grad_norm": 0.3979237973690033, "learning_rate": 0.00018955174500267594, "loss": 0.3314, "step": 1457 }, { "epoch": 0.29508196721311475, "grad_norm": 0.35677894949913025, "learning_rate": 0.00018953758544948393, "loss": 0.252, "step": 1458 }, { "epoch": 0.295284355393645, "grad_norm": 0.2729237377643585, "learning_rate": 0.00018952341683781856, "loss": 0.2204, "step": 1459 }, { "epoch": 0.29548674357417526, "grad_norm": 0.2631382942199707, "learning_rate": 0.0001895092391691132, "loss": 0.2401, "step": 1460 }, { "epoch": 0.2956891317547055, "grad_norm": 0.33820971846580505, "learning_rate": 0.00018949505244480225, "loss": 0.2214, "step": 1461 }, { "epoch": 0.29589151993523577, "grad_norm": 0.3907553255558014, "learning_rate": 0.00018948085666632092, "loss": 0.2483, "step": 
1462 }, { "epoch": 0.296093908115766, "grad_norm": 0.6589711904525757, "learning_rate": 0.00018946665183510546, "loss": 0.2595, "step": 1463 }, { "epoch": 0.2962962962962963, "grad_norm": 0.3188416659832001, "learning_rate": 0.00018945243795259292, "loss": 0.2417, "step": 1464 }, { "epoch": 0.29649868447682654, "grad_norm": 0.37853628396987915, "learning_rate": 0.0001894382150202213, "loss": 0.2763, "step": 1465 }, { "epoch": 0.2967010726573568, "grad_norm": 0.3539217710494995, "learning_rate": 0.00018942398303942957, "loss": 0.2592, "step": 1466 }, { "epoch": 0.29690346083788705, "grad_norm": 0.4117416441440582, "learning_rate": 0.00018940974201165755, "loss": 0.2304, "step": 1467 }, { "epoch": 0.2971058490184173, "grad_norm": 0.3291616141796112, "learning_rate": 0.00018939549193834601, "loss": 0.2513, "step": 1468 }, { "epoch": 0.29730823719894756, "grad_norm": 0.3089660704135895, "learning_rate": 0.00018938123282093657, "loss": 0.2, "step": 1469 }, { "epoch": 0.2975106253794778, "grad_norm": 0.33169400691986084, "learning_rate": 0.0001893669646608719, "loss": 0.2808, "step": 1470 }, { "epoch": 0.2977130135600081, "grad_norm": 0.26605546474456787, "learning_rate": 0.00018935268745959543, "loss": 0.2435, "step": 1471 }, { "epoch": 0.2979154017405384, "grad_norm": 0.31251972913742065, "learning_rate": 0.00018933840121855165, "loss": 0.2528, "step": 1472 }, { "epoch": 0.29811778992106863, "grad_norm": 0.309332937002182, "learning_rate": 0.00018932410593918583, "loss": 0.2659, "step": 1473 }, { "epoch": 0.2983201781015989, "grad_norm": 0.29334747791290283, "learning_rate": 0.00018930980162294424, "loss": 0.2319, "step": 1474 }, { "epoch": 0.29852256628212914, "grad_norm": 0.3506908416748047, "learning_rate": 0.00018929548827127402, "loss": 0.2367, "step": 1475 }, { "epoch": 0.2987249544626594, "grad_norm": 0.2785925567150116, "learning_rate": 0.00018928116588562332, "loss": 0.2221, "step": 1476 }, { "epoch": 0.29892734264318965, "grad_norm": 0.33908000588417053, 
"learning_rate": 0.00018926683446744103, "loss": 0.2646, "step": 1477 }, { "epoch": 0.2991297308237199, "grad_norm": 0.33341488242149353, "learning_rate": 0.0001892524940181771, "loss": 0.268, "step": 1478 }, { "epoch": 0.29933211900425016, "grad_norm": 0.31938436627388, "learning_rate": 0.00018923814453928234, "loss": 0.2757, "step": 1479 }, { "epoch": 0.2995345071847804, "grad_norm": 0.31264033913612366, "learning_rate": 0.0001892237860322085, "loss": 0.225, "step": 1480 }, { "epoch": 0.2997368953653107, "grad_norm": 0.43807071447372437, "learning_rate": 0.00018920941849840815, "loss": 0.2412, "step": 1481 }, { "epoch": 0.29993928354584093, "grad_norm": 0.32840967178344727, "learning_rate": 0.00018919504193933495, "loss": 0.2412, "step": 1482 }, { "epoch": 0.3001416717263712, "grad_norm": 0.3693227171897888, "learning_rate": 0.00018918065635644332, "loss": 0.2632, "step": 1483 }, { "epoch": 0.30034405990690144, "grad_norm": 0.43262529373168945, "learning_rate": 0.00018916626175118862, "loss": 0.2599, "step": 1484 }, { "epoch": 0.3005464480874317, "grad_norm": 0.39893639087677, "learning_rate": 0.00018915185812502715, "loss": 0.2373, "step": 1485 }, { "epoch": 0.30074883626796195, "grad_norm": 0.30610501766204834, "learning_rate": 0.0001891374454794162, "loss": 0.2421, "step": 1486 }, { "epoch": 0.3009512244484922, "grad_norm": 0.3462240993976593, "learning_rate": 0.00018912302381581374, "loss": 0.2514, "step": 1487 }, { "epoch": 0.30115361262902246, "grad_norm": 0.37143474817276, "learning_rate": 0.00018910859313567895, "loss": 0.2465, "step": 1488 }, { "epoch": 0.3013560008095527, "grad_norm": 0.3381304442882538, "learning_rate": 0.0001890941534404717, "loss": 0.2338, "step": 1489 }, { "epoch": 0.30155838899008297, "grad_norm": 0.3870564103126526, "learning_rate": 0.00018907970473165287, "loss": 0.293, "step": 1490 }, { "epoch": 0.3017607771706132, "grad_norm": 0.44966599345207214, "learning_rate": 0.00018906524701068418, "loss": 0.2567, "step": 1491 }, { 
"epoch": 0.3019631653511435, "grad_norm": 0.39895737171173096, "learning_rate": 0.00018905078027902836, "loss": 0.2578, "step": 1492 }, { "epoch": 0.30216555353167374, "grad_norm": 0.46490851044654846, "learning_rate": 0.000189036304538149, "loss": 0.2288, "step": 1493 }, { "epoch": 0.302367941712204, "grad_norm": 0.3627021312713623, "learning_rate": 0.0001890218197895106, "loss": 0.2695, "step": 1494 }, { "epoch": 0.30257032989273425, "grad_norm": 0.29395443201065063, "learning_rate": 0.00018900732603457855, "loss": 0.2135, "step": 1495 }, { "epoch": 0.3027727180732645, "grad_norm": 0.3824223577976227, "learning_rate": 0.00018899282327481922, "loss": 0.2511, "step": 1496 }, { "epoch": 0.30297510625379476, "grad_norm": 0.31674912571907043, "learning_rate": 0.00018897831151169984, "loss": 0.2589, "step": 1497 }, { "epoch": 0.303177494434325, "grad_norm": 0.34949105978012085, "learning_rate": 0.00018896379074668848, "loss": 0.2262, "step": 1498 }, { "epoch": 0.30337988261485527, "grad_norm": 0.5163675546646118, "learning_rate": 0.0001889492609812543, "loss": 0.2443, "step": 1499 }, { "epoch": 0.3035822707953855, "grad_norm": 0.29069405794143677, "learning_rate": 0.00018893472221686723, "loss": 0.2487, "step": 1500 }, { "epoch": 0.3035822707953855, "eval_loss": 0.2916322648525238, "eval_runtime": 0.7412, "eval_samples_per_second": 6.746, "eval_steps_per_second": 1.349, "step": 1500 }, { "epoch": 0.30378465897591583, "grad_norm": 0.2935831546783447, "learning_rate": 0.0001889201744549981, "loss": 0.2856, "step": 1501 }, { "epoch": 0.3039870471564461, "grad_norm": 0.35391557216644287, "learning_rate": 0.0001889056176971188, "loss": 0.284, "step": 1502 }, { "epoch": 0.30418943533697634, "grad_norm": 0.3323562741279602, "learning_rate": 0.000188891051944702, "loss": 0.247, "step": 1503 }, { "epoch": 0.3043918235175066, "grad_norm": 0.3300694227218628, "learning_rate": 0.0001888764771992212, "loss": 0.2302, "step": 1504 }, { "epoch": 0.30459421169803685, "grad_norm": 
0.3524026572704315, "learning_rate": 0.00018886189346215107, "loss": 0.234, "step": 1505 }, { "epoch": 0.3047965998785671, "grad_norm": 0.4068450331687927, "learning_rate": 0.00018884730073496698, "loss": 0.2716, "step": 1506 }, { "epoch": 0.30499898805909736, "grad_norm": 0.38134053349494934, "learning_rate": 0.00018883269901914522, "loss": 0.2431, "step": 1507 }, { "epoch": 0.3052013762396276, "grad_norm": 0.3229370713233948, "learning_rate": 0.00018881808831616313, "loss": 0.2367, "step": 1508 }, { "epoch": 0.3054037644201579, "grad_norm": 0.2964808940887451, "learning_rate": 0.0001888034686274988, "loss": 0.2336, "step": 1509 }, { "epoch": 0.30560615260068813, "grad_norm": 0.3803769648075104, "learning_rate": 0.00018878883995463133, "loss": 0.2276, "step": 1510 }, { "epoch": 0.3058085407812184, "grad_norm": 0.31368228793144226, "learning_rate": 0.00018877420229904067, "loss": 0.2578, "step": 1511 }, { "epoch": 0.30601092896174864, "grad_norm": 0.30417075753211975, "learning_rate": 0.00018875955566220772, "loss": 0.251, "step": 1512 }, { "epoch": 0.3062133171422789, "grad_norm": 0.3475414216518402, "learning_rate": 0.00018874490004561426, "loss": 0.2544, "step": 1513 }, { "epoch": 0.30641570532280915, "grad_norm": 0.41898611187934875, "learning_rate": 0.000188730235450743, "loss": 0.2621, "step": 1514 }, { "epoch": 0.3066180935033394, "grad_norm": 0.3886573910713196, "learning_rate": 0.0001887155618790776, "loss": 0.2168, "step": 1515 }, { "epoch": 0.30682048168386966, "grad_norm": 0.4358440339565277, "learning_rate": 0.0001887008793321025, "loss": 0.2754, "step": 1516 }, { "epoch": 0.3070228698643999, "grad_norm": 0.32611727714538574, "learning_rate": 0.0001886861878113032, "loss": 0.2672, "step": 1517 }, { "epoch": 0.30722525804493017, "grad_norm": 0.3701517581939697, "learning_rate": 0.00018867148731816592, "loss": 0.2416, "step": 1518 }, { "epoch": 0.3074276462254604, "grad_norm": 0.2883046567440033, "learning_rate": 0.00018865677785417798, "loss": 0.268, 
"step": 1519 }, { "epoch": 0.3076300344059907, "grad_norm": 0.4058697819709778, "learning_rate": 0.00018864205942082757, "loss": 0.2511, "step": 1520 }, { "epoch": 0.30783242258652094, "grad_norm": 0.286826491355896, "learning_rate": 0.00018862733201960365, "loss": 0.232, "step": 1521 }, { "epoch": 0.3080348107670512, "grad_norm": 0.45446181297302246, "learning_rate": 0.00018861259565199626, "loss": 0.2903, "step": 1522 }, { "epoch": 0.30823719894758145, "grad_norm": 0.24897044897079468, "learning_rate": 0.00018859785031949625, "loss": 0.2304, "step": 1523 }, { "epoch": 0.3084395871281117, "grad_norm": 0.3912540078163147, "learning_rate": 0.00018858309602359539, "loss": 0.2663, "step": 1524 }, { "epoch": 0.30864197530864196, "grad_norm": 0.3550991415977478, "learning_rate": 0.00018856833276578635, "loss": 0.2224, "step": 1525 }, { "epoch": 0.3088443634891722, "grad_norm": 0.323539137840271, "learning_rate": 0.00018855356054756273, "loss": 0.2861, "step": 1526 }, { "epoch": 0.30904675166970247, "grad_norm": 0.32665151357650757, "learning_rate": 0.00018853877937041906, "loss": 0.2436, "step": 1527 }, { "epoch": 0.3092491398502327, "grad_norm": 0.373546838760376, "learning_rate": 0.00018852398923585072, "loss": 0.2673, "step": 1528 }, { "epoch": 0.309451528030763, "grad_norm": 0.36496198177337646, "learning_rate": 0.000188509190145354, "loss": 0.2105, "step": 1529 }, { "epoch": 0.30965391621129323, "grad_norm": 0.34335947036743164, "learning_rate": 0.00018849438210042613, "loss": 0.2774, "step": 1530 }, { "epoch": 0.30985630439182354, "grad_norm": 0.29991385340690613, "learning_rate": 0.00018847956510256527, "loss": 0.2592, "step": 1531 }, { "epoch": 0.3100586925723538, "grad_norm": 0.36050549149513245, "learning_rate": 0.00018846473915327041, "loss": 0.2497, "step": 1532 }, { "epoch": 0.31026108075288406, "grad_norm": 0.3393295109272003, "learning_rate": 0.00018844990425404148, "loss": 0.2647, "step": 1533 }, { "epoch": 0.3104634689334143, "grad_norm": 
0.39216938614845276, "learning_rate": 0.00018843506040637934, "loss": 0.2557, "step": 1534 }, { "epoch": 0.31066585711394457, "grad_norm": 0.2916669249534607, "learning_rate": 0.00018842020761178574, "loss": 0.2188, "step": 1535 }, { "epoch": 0.3108682452944748, "grad_norm": 0.35340821743011475, "learning_rate": 0.0001884053458717633, "loss": 0.2276, "step": 1536 }, { "epoch": 0.3110706334750051, "grad_norm": 0.2852288782596588, "learning_rate": 0.00018839047518781561, "loss": 0.2531, "step": 1537 }, { "epoch": 0.31127302165553533, "grad_norm": 0.4118358790874481, "learning_rate": 0.0001883755955614471, "loss": 0.2508, "step": 1538 }, { "epoch": 0.3114754098360656, "grad_norm": 0.3326147198677063, "learning_rate": 0.00018836070699416313, "loss": 0.2347, "step": 1539 }, { "epoch": 0.31167779801659584, "grad_norm": 0.42047885060310364, "learning_rate": 0.00018834580948746997, "loss": 0.2716, "step": 1540 }, { "epoch": 0.3118801861971261, "grad_norm": 0.2908095121383667, "learning_rate": 0.00018833090304287486, "loss": 0.25, "step": 1541 }, { "epoch": 0.31208257437765635, "grad_norm": 0.36547958850860596, "learning_rate": 0.0001883159876618858, "loss": 0.22, "step": 1542 }, { "epoch": 0.3122849625581866, "grad_norm": 0.4909718632698059, "learning_rate": 0.0001883010633460118, "loss": 0.2657, "step": 1543 }, { "epoch": 0.31248735073871686, "grad_norm": 0.34576284885406494, "learning_rate": 0.00018828613009676276, "loss": 0.2642, "step": 1544 }, { "epoch": 0.3126897389192471, "grad_norm": 0.3441828191280365, "learning_rate": 0.0001882711879156494, "loss": 0.2393, "step": 1545 }, { "epoch": 0.3128921270997774, "grad_norm": 0.24068088829517365, "learning_rate": 0.00018825623680418353, "loss": 0.2012, "step": 1546 }, { "epoch": 0.31309451528030763, "grad_norm": 0.24355578422546387, "learning_rate": 0.00018824127676387765, "loss": 0.2314, "step": 1547 }, { "epoch": 0.3132969034608379, "grad_norm": 0.3701528310775757, "learning_rate": 0.00018822630779624528, "loss": 0.2473, 
"step": 1548 }, { "epoch": 0.31349929164136814, "grad_norm": 0.3159331679344177, "learning_rate": 0.00018821132990280086, "loss": 0.2086, "step": 1549 }, { "epoch": 0.3137016798218984, "grad_norm": 0.5335456728935242, "learning_rate": 0.00018819634308505964, "loss": 0.3292, "step": 1550 }, { "epoch": 0.3137016798218984, "eval_loss": 0.29419490694999695, "eval_runtime": 0.738, "eval_samples_per_second": 6.775, "eval_steps_per_second": 1.355, "step": 1550 }, { "epoch": 0.31390406800242865, "grad_norm": 0.3889347314834595, "learning_rate": 0.0001881813473445379, "loss": 0.2466, "step": 1551 }, { "epoch": 0.3141064561829589, "grad_norm": 0.3013499081134796, "learning_rate": 0.00018816634268275267, "loss": 0.2538, "step": 1552 }, { "epoch": 0.31430884436348916, "grad_norm": 0.25156041979789734, "learning_rate": 0.00018815132910122206, "loss": 0.2323, "step": 1553 }, { "epoch": 0.3145112325440194, "grad_norm": 0.3922179341316223, "learning_rate": 0.00018813630660146488, "loss": 0.2784, "step": 1554 }, { "epoch": 0.31471362072454967, "grad_norm": 0.3269888162612915, "learning_rate": 0.00018812127518500106, "loss": 0.2898, "step": 1555 }, { "epoch": 0.3149160089050799, "grad_norm": 0.3011750280857086, "learning_rate": 0.00018810623485335118, "loss": 0.2831, "step": 1556 }, { "epoch": 0.3151183970856102, "grad_norm": 0.2626027762889862, "learning_rate": 0.00018809118560803704, "loss": 0.222, "step": 1557 }, { "epoch": 0.31532078526614044, "grad_norm": 0.32995250821113586, "learning_rate": 0.000188076127450581, "loss": 0.2607, "step": 1558 }, { "epoch": 0.3155231734466707, "grad_norm": 0.30816736817359924, "learning_rate": 0.00018806106038250659, "loss": 0.2375, "step": 1559 }, { "epoch": 0.31572556162720095, "grad_norm": 0.3002629280090332, "learning_rate": 0.00018804598440533808, "loss": 0.2601, "step": 1560 }, { "epoch": 0.3159279498077312, "grad_norm": 0.2814362347126007, "learning_rate": 0.00018803089952060075, "loss": 0.2373, "step": 1561 }, { "epoch": 
0.3161303379882615, "grad_norm": 0.28549739718437195, "learning_rate": 0.00018801580572982068, "loss": 0.2239, "step": 1562 }, { "epoch": 0.31633272616879177, "grad_norm": 0.3680552542209625, "learning_rate": 0.00018800070303452495, "loss": 0.2394, "step": 1563 }, { "epoch": 0.316535114349322, "grad_norm": 0.39102286100387573, "learning_rate": 0.00018798559143624145, "loss": 0.2526, "step": 1564 }, { "epoch": 0.3167375025298523, "grad_norm": 0.26771053671836853, "learning_rate": 0.00018797047093649903, "loss": 0.2856, "step": 1565 }, { "epoch": 0.31693989071038253, "grad_norm": 0.27007582783699036, "learning_rate": 0.00018795534153682745, "loss": 0.2416, "step": 1566 }, { "epoch": 0.3171422788909128, "grad_norm": 0.32626280188560486, "learning_rate": 0.0001879402032387573, "loss": 0.2517, "step": 1567 }, { "epoch": 0.31734466707144304, "grad_norm": 0.3350610136985779, "learning_rate": 0.00018792505604382014, "loss": 0.264, "step": 1568 }, { "epoch": 0.3175470552519733, "grad_norm": 0.3124147653579712, "learning_rate": 0.00018790989995354836, "loss": 0.2744, "step": 1569 }, { "epoch": 0.31774944343250355, "grad_norm": 0.34561726450920105, "learning_rate": 0.0001878947349694754, "loss": 0.2469, "step": 1570 }, { "epoch": 0.3179518316130338, "grad_norm": 1.0323286056518555, "learning_rate": 0.00018787956109313537, "loss": 0.2873, "step": 1571 }, { "epoch": 0.31815421979356406, "grad_norm": 0.34411680698394775, "learning_rate": 0.00018786437832606347, "loss": 0.2407, "step": 1572 }, { "epoch": 0.3183566079740943, "grad_norm": 0.281716912984848, "learning_rate": 0.00018784918666979575, "loss": 0.2429, "step": 1573 }, { "epoch": 0.3185589961546246, "grad_norm": 0.29189565777778625, "learning_rate": 0.00018783398612586908, "loss": 0.2099, "step": 1574 }, { "epoch": 0.31876138433515483, "grad_norm": 0.41523846983909607, "learning_rate": 0.00018781877669582132, "loss": 0.2404, "step": 1575 }, { "epoch": 0.3189637725156851, "grad_norm": 0.34226346015930176, "learning_rate": 
0.00018780355838119122, "loss": 0.2631, "step": 1576 }, { "epoch": 0.31916616069621534, "grad_norm": 0.40481576323509216, "learning_rate": 0.0001877883311835184, "loss": 0.2535, "step": 1577 }, { "epoch": 0.3193685488767456, "grad_norm": 0.42217254638671875, "learning_rate": 0.00018777309510434337, "loss": 0.2723, "step": 1578 }, { "epoch": 0.31957093705727585, "grad_norm": 0.3216110169887543, "learning_rate": 0.00018775785014520758, "loss": 0.2415, "step": 1579 }, { "epoch": 0.3197733252378061, "grad_norm": 0.32365474104881287, "learning_rate": 0.00018774259630765334, "loss": 0.2317, "step": 1580 }, { "epoch": 0.31997571341833636, "grad_norm": 0.32175514101982117, "learning_rate": 0.00018772733359322387, "loss": 0.2323, "step": 1581 }, { "epoch": 0.3201781015988666, "grad_norm": 0.32359227538108826, "learning_rate": 0.00018771206200346333, "loss": 0.2867, "step": 1582 }, { "epoch": 0.32038048977939687, "grad_norm": 0.4200432598590851, "learning_rate": 0.00018769678153991669, "loss": 0.2628, "step": 1583 }, { "epoch": 0.3205828779599271, "grad_norm": 0.35818588733673096, "learning_rate": 0.0001876814922041299, "loss": 0.2187, "step": 1584 }, { "epoch": 0.3207852661404574, "grad_norm": 0.3140599727630615, "learning_rate": 0.00018766619399764972, "loss": 0.2113, "step": 1585 }, { "epoch": 0.32098765432098764, "grad_norm": 0.38742703199386597, "learning_rate": 0.00018765088692202392, "loss": 0.2689, "step": 1586 }, { "epoch": 0.3211900425015179, "grad_norm": 0.3630368411540985, "learning_rate": 0.00018763557097880112, "loss": 0.2831, "step": 1587 }, { "epoch": 0.32139243068204815, "grad_norm": 0.3636877238750458, "learning_rate": 0.00018762024616953075, "loss": 0.2455, "step": 1588 }, { "epoch": 0.3215948188625784, "grad_norm": 0.292085736989975, "learning_rate": 0.00018760491249576332, "loss": 0.2404, "step": 1589 }, { "epoch": 0.32179720704310866, "grad_norm": 0.33262398838996887, "learning_rate": 0.00018758956995905, "loss": 0.2235, "step": 1590 }, { "epoch": 
0.3219995952236389, "grad_norm": 0.4547630548477173, "learning_rate": 0.00018757421856094314, "loss": 0.2362, "step": 1591 }, { "epoch": 0.3222019834041692, "grad_norm": 0.2708628475666046, "learning_rate": 0.00018755885830299568, "loss": 0.2371, "step": 1592 }, { "epoch": 0.3224043715846995, "grad_norm": 0.3246055543422699, "learning_rate": 0.00018754348918676174, "loss": 0.2462, "step": 1593 }, { "epoch": 0.32260675976522973, "grad_norm": 0.26867252588272095, "learning_rate": 0.0001875281112137961, "loss": 0.2465, "step": 1594 }, { "epoch": 0.32280914794576, "grad_norm": 0.4057390093803406, "learning_rate": 0.00018751272438565463, "loss": 0.2211, "step": 1595 }, { "epoch": 0.32301153612629024, "grad_norm": 0.3229082226753235, "learning_rate": 0.00018749732870389392, "loss": 0.269, "step": 1596 }, { "epoch": 0.3232139243068205, "grad_norm": 0.23535144329071045, "learning_rate": 0.00018748192417007164, "loss": 0.2086, "step": 1597 }, { "epoch": 0.32341631248735075, "grad_norm": 0.30807140469551086, "learning_rate": 0.00018746651078574618, "loss": 0.2339, "step": 1598 }, { "epoch": 0.323618700667881, "grad_norm": 0.48715728521347046, "learning_rate": 0.00018745108855247695, "loss": 0.2279, "step": 1599 }, { "epoch": 0.32382108884841126, "grad_norm": 0.3199789822101593, "learning_rate": 0.00018743565747182417, "loss": 0.2642, "step": 1600 }, { "epoch": 0.32382108884841126, "eval_loss": 0.2936403155326843, "eval_runtime": 0.7383, "eval_samples_per_second": 6.772, "eval_steps_per_second": 1.354, "step": 1600 }, { "epoch": 0.3240234770289415, "grad_norm": 0.27700263261795044, "learning_rate": 0.00018742021754534905, "loss": 0.2183, "step": 1601 }, { "epoch": 0.3242258652094718, "grad_norm": 0.32673951983451843, "learning_rate": 0.00018740476877461356, "loss": 0.2448, "step": 1602 }, { "epoch": 0.32442825339000203, "grad_norm": 0.2943725287914276, "learning_rate": 0.00018738931116118074, "loss": 0.2474, "step": 1603 }, { "epoch": 0.3246306415705323, "grad_norm": 
0.3454664647579193, "learning_rate": 0.00018737384470661437, "loss": 0.2397, "step": 1604 }, { "epoch": 0.32483302975106254, "grad_norm": 0.31171998381614685, "learning_rate": 0.0001873583694124792, "loss": 0.2565, "step": 1605 }, { "epoch": 0.3250354179315928, "grad_norm": 0.2882143259048462, "learning_rate": 0.00018734288528034085, "loss": 0.2684, "step": 1606 }, { "epoch": 0.32523780611212305, "grad_norm": 0.3082469403743744, "learning_rate": 0.00018732739231176585, "loss": 0.2119, "step": 1607 }, { "epoch": 0.3254401942926533, "grad_norm": 0.3238343298435211, "learning_rate": 0.00018731189050832158, "loss": 0.214, "step": 1608 }, { "epoch": 0.32564258247318356, "grad_norm": 0.27819204330444336, "learning_rate": 0.00018729637987157643, "loss": 0.2417, "step": 1609 }, { "epoch": 0.3258449706537138, "grad_norm": 0.29245132207870483, "learning_rate": 0.0001872808604030995, "loss": 0.2403, "step": 1610 }, { "epoch": 0.32604735883424407, "grad_norm": 0.3408953547477722, "learning_rate": 0.000187265332104461, "loss": 0.2475, "step": 1611 }, { "epoch": 0.3262497470147743, "grad_norm": 0.32225704193115234, "learning_rate": 0.00018724979497723184, "loss": 0.2317, "step": 1612 }, { "epoch": 0.3264521351953046, "grad_norm": 0.3903481066226959, "learning_rate": 0.00018723424902298392, "loss": 0.2989, "step": 1613 }, { "epoch": 0.32665452337583484, "grad_norm": 0.3457467555999756, "learning_rate": 0.00018721869424329003, "loss": 0.266, "step": 1614 }, { "epoch": 0.3268569115563651, "grad_norm": 0.29828596115112305, "learning_rate": 0.0001872031306397238, "loss": 0.2567, "step": 1615 }, { "epoch": 0.32705929973689535, "grad_norm": 0.44933828711509705, "learning_rate": 0.00018718755821385988, "loss": 0.2429, "step": 1616 }, { "epoch": 0.3272616879174256, "grad_norm": 0.3514581322669983, "learning_rate": 0.00018717197696727366, "loss": 0.2696, "step": 1617 }, { "epoch": 0.32746407609795586, "grad_norm": 0.2809898853302002, "learning_rate": 0.00018715638690154144, "loss": 
0.2069, "step": 1618 }, { "epoch": 0.3276664642784861, "grad_norm": 0.36968308687210083, "learning_rate": 0.00018714078801824059, "loss": 0.2317, "step": 1619 }, { "epoch": 0.32786885245901637, "grad_norm": 0.41571712493896484, "learning_rate": 0.0001871251803189491, "loss": 0.2883, "step": 1620 }, { "epoch": 0.3280712406395466, "grad_norm": 0.38158875703811646, "learning_rate": 0.00018710956380524606, "loss": 0.2533, "step": 1621 }, { "epoch": 0.32827362882007693, "grad_norm": 0.24912859499454498, "learning_rate": 0.00018709393847871143, "loss": 0.2285, "step": 1622 }, { "epoch": 0.3284760170006072, "grad_norm": 0.3398696184158325, "learning_rate": 0.00018707830434092597, "loss": 0.2558, "step": 1623 }, { "epoch": 0.32867840518113745, "grad_norm": 0.3501795530319214, "learning_rate": 0.00018706266139347134, "loss": 0.2625, "step": 1624 }, { "epoch": 0.3288807933616677, "grad_norm": 0.27285391092300415, "learning_rate": 0.0001870470096379302, "loss": 0.2384, "step": 1625 }, { "epoch": 0.32908318154219796, "grad_norm": 0.31238171458244324, "learning_rate": 0.00018703134907588597, "loss": 0.2522, "step": 1626 }, { "epoch": 0.3292855697227282, "grad_norm": 0.3718354105949402, "learning_rate": 0.00018701567970892308, "loss": 0.2297, "step": 1627 }, { "epoch": 0.32948795790325847, "grad_norm": 0.34207820892333984, "learning_rate": 0.00018700000153862675, "loss": 0.3124, "step": 1628 }, { "epoch": 0.3296903460837887, "grad_norm": 0.2899322211742401, "learning_rate": 0.00018698431456658313, "loss": 0.2466, "step": 1629 }, { "epoch": 0.329892734264319, "grad_norm": 0.326610267162323, "learning_rate": 0.00018696861879437932, "loss": 0.2371, "step": 1630 }, { "epoch": 0.33009512244484923, "grad_norm": 0.36713942885398865, "learning_rate": 0.00018695291422360317, "loss": 0.2841, "step": 1631 }, { "epoch": 0.3302975106253795, "grad_norm": 0.3396678864955902, "learning_rate": 0.00018693720085584357, "loss": 0.2434, "step": 1632 }, { "epoch": 0.33049989880590974, "grad_norm": 
0.32362422347068787, "learning_rate": 0.0001869214786926902, "loss": 0.2558, "step": 1633 }, { "epoch": 0.33070228698644, "grad_norm": 0.2856462895870209, "learning_rate": 0.00018690574773573367, "loss": 0.2628, "step": 1634 }, { "epoch": 0.33090467516697025, "grad_norm": 0.36801108717918396, "learning_rate": 0.00018689000798656545, "loss": 0.2754, "step": 1635 }, { "epoch": 0.3311070633475005, "grad_norm": 0.35658085346221924, "learning_rate": 0.000186874259446778, "loss": 0.2912, "step": 1636 }, { "epoch": 0.33130945152803076, "grad_norm": 0.3088845908641815, "learning_rate": 0.0001868585021179645, "loss": 0.2615, "step": 1637 }, { "epoch": 0.331511839708561, "grad_norm": 0.3361416757106781, "learning_rate": 0.00018684273600171918, "loss": 0.2523, "step": 1638 }, { "epoch": 0.3317142278890913, "grad_norm": 0.4012823700904846, "learning_rate": 0.00018682696109963704, "loss": 0.263, "step": 1639 }, { "epoch": 0.33191661606962153, "grad_norm": 0.30794715881347656, "learning_rate": 0.00018681117741331407, "loss": 0.2548, "step": 1640 }, { "epoch": 0.3321190042501518, "grad_norm": 0.539105236530304, "learning_rate": 0.00018679538494434703, "loss": 0.2455, "step": 1641 }, { "epoch": 0.33232139243068204, "grad_norm": 0.2805461287498474, "learning_rate": 0.0001867795836943337, "loss": 0.2171, "step": 1642 }, { "epoch": 0.3325237806112123, "grad_norm": 0.28716808557510376, "learning_rate": 0.00018676377366487265, "loss": 0.261, "step": 1643 }, { "epoch": 0.33272616879174255, "grad_norm": 0.23502685129642487, "learning_rate": 0.00018674795485756337, "loss": 0.2327, "step": 1644 }, { "epoch": 0.3329285569722728, "grad_norm": 0.37822720408439636, "learning_rate": 0.0001867321272740063, "loss": 0.2482, "step": 1645 }, { "epoch": 0.33313094515280306, "grad_norm": 0.2823399305343628, "learning_rate": 0.00018671629091580262, "loss": 0.251, "step": 1646 }, { "epoch": 0.3333333333333333, "grad_norm": 0.34142354130744934, "learning_rate": 0.00018670044578455455, "loss": 0.2245, 
"step": 1647 }, { "epoch": 0.33353572151386357, "grad_norm": 0.2987143099308014, "learning_rate": 0.0001866845918818651, "loss": 0.2458, "step": 1648 }, { "epoch": 0.3337381096943938, "grad_norm": 0.4302227199077606, "learning_rate": 0.00018666872920933823, "loss": 0.2637, "step": 1649 }, { "epoch": 0.3339404978749241, "grad_norm": 0.8706358671188354, "learning_rate": 0.0001866528577685787, "loss": 0.2845, "step": 1650 }, { "epoch": 0.3339404978749241, "eval_loss": 0.27961453795433044, "eval_runtime": 0.7359, "eval_samples_per_second": 6.794, "eval_steps_per_second": 1.359, "step": 1650 }, { "epoch": 0.33414288605545434, "grad_norm": 0.28847336769104004, "learning_rate": 0.00018663697756119232, "loss": 0.2259, "step": 1651 }, { "epoch": 0.3343452742359846, "grad_norm": 0.3432278335094452, "learning_rate": 0.00018662108858878557, "loss": 0.2526, "step": 1652 }, { "epoch": 0.3345476624165149, "grad_norm": 0.4454701244831085, "learning_rate": 0.000186605190852966, "loss": 0.2525, "step": 1653 }, { "epoch": 0.33475005059704516, "grad_norm": 0.498799204826355, "learning_rate": 0.00018658928435534198, "loss": 0.2523, "step": 1654 }, { "epoch": 0.3349524387775754, "grad_norm": 0.3040686547756195, "learning_rate": 0.00018657336909752272, "loss": 0.2291, "step": 1655 }, { "epoch": 0.33515482695810567, "grad_norm": 0.2661318778991699, "learning_rate": 0.00018655744508111837, "loss": 0.1961, "step": 1656 }, { "epoch": 0.3353572151386359, "grad_norm": 0.34656617045402527, "learning_rate": 0.00018654151230774, "loss": 0.2629, "step": 1657 }, { "epoch": 0.3355596033191662, "grad_norm": 0.35558953881263733, "learning_rate": 0.00018652557077899947, "loss": 0.2895, "step": 1658 }, { "epoch": 0.33576199149969643, "grad_norm": 0.2632910907268524, "learning_rate": 0.00018650962049650955, "loss": 0.2367, "step": 1659 }, { "epoch": 0.3359643796802267, "grad_norm": 0.3103640675544739, "learning_rate": 0.000186493661461884, "loss": 0.2895, "step": 1660 }, { "epoch": 0.33616676786075694, 
"grad_norm": 0.31030529737472534, "learning_rate": 0.00018647769367673733, "loss": 0.2499, "step": 1661 }, { "epoch": 0.3363691560412872, "grad_norm": 0.2659652531147003, "learning_rate": 0.00018646171714268504, "loss": 0.2279, "step": 1662 }, { "epoch": 0.33657154422181745, "grad_norm": 0.3482886254787445, "learning_rate": 0.00018644573186134343, "loss": 0.2635, "step": 1663 }, { "epoch": 0.3367739324023477, "grad_norm": 0.39144811034202576, "learning_rate": 0.00018642973783432974, "loss": 0.2736, "step": 1664 }, { "epoch": 0.33697632058287796, "grad_norm": 0.31855225563049316, "learning_rate": 0.00018641373506326207, "loss": 0.2553, "step": 1665 }, { "epoch": 0.3371787087634082, "grad_norm": 0.30389824509620667, "learning_rate": 0.0001863977235497594, "loss": 0.2157, "step": 1666 }, { "epoch": 0.3373810969439385, "grad_norm": 0.3267851769924164, "learning_rate": 0.00018638170329544164, "loss": 0.2581, "step": 1667 }, { "epoch": 0.33758348512446873, "grad_norm": 0.3200203478336334, "learning_rate": 0.00018636567430192953, "loss": 0.2484, "step": 1668 }, { "epoch": 0.337785873304999, "grad_norm": 0.2819439768791199, "learning_rate": 0.00018634963657084472, "loss": 0.2496, "step": 1669 }, { "epoch": 0.33798826148552924, "grad_norm": 0.3534572720527649, "learning_rate": 0.00018633359010380974, "loss": 0.2551, "step": 1670 }, { "epoch": 0.3381906496660595, "grad_norm": 0.3555509150028229, "learning_rate": 0.000186317534902448, "loss": 0.2806, "step": 1671 }, { "epoch": 0.33839303784658975, "grad_norm": 0.46702417731285095, "learning_rate": 0.00018630147096838378, "loss": 0.2207, "step": 1672 }, { "epoch": 0.33859542602712, "grad_norm": 0.31666767597198486, "learning_rate": 0.00018628539830324229, "loss": 0.2332, "step": 1673 }, { "epoch": 0.33879781420765026, "grad_norm": 0.3063281178474426, "learning_rate": 0.0001862693169086496, "loss": 0.2301, "step": 1674 }, { "epoch": 0.3390002023881805, "grad_norm": 0.23433008790016174, "learning_rate": 0.0001862532267862326, 
"loss": 0.2128, "step": 1675 }, { "epoch": 0.33920259056871077, "grad_norm": 0.3376384377479553, "learning_rate": 0.0001862371279376192, "loss": 0.2373, "step": 1676 }, { "epoch": 0.339404978749241, "grad_norm": 0.41629114747047424, "learning_rate": 0.00018622102036443806, "loss": 0.2496, "step": 1677 }, { "epoch": 0.3396073669297713, "grad_norm": 0.40785738825798035, "learning_rate": 0.00018620490406831875, "loss": 0.244, "step": 1678 }, { "epoch": 0.33980975511030154, "grad_norm": 0.3256091773509979, "learning_rate": 0.00018618877905089183, "loss": 0.2055, "step": 1679 }, { "epoch": 0.3400121432908318, "grad_norm": 0.3316114544868469, "learning_rate": 0.0001861726453137886, "loss": 0.207, "step": 1680 }, { "epoch": 0.34021453147136205, "grad_norm": 1.1633223295211792, "learning_rate": 0.00018615650285864132, "loss": 0.2441, "step": 1681 }, { "epoch": 0.3404169196518923, "grad_norm": 0.3376322090625763, "learning_rate": 0.0001861403516870831, "loss": 0.2722, "step": 1682 }, { "epoch": 0.3406193078324226, "grad_norm": 0.33407843112945557, "learning_rate": 0.00018612419180074797, "loss": 0.2484, "step": 1683 }, { "epoch": 0.34082169601295287, "grad_norm": 0.28343215584754944, "learning_rate": 0.0001861080232012708, "loss": 0.2318, "step": 1684 }, { "epoch": 0.3410240841934831, "grad_norm": 0.3230278193950653, "learning_rate": 0.00018609184589028733, "loss": 0.2357, "step": 1685 }, { "epoch": 0.3412264723740134, "grad_norm": 0.31829163432121277, "learning_rate": 0.0001860756598694343, "loss": 0.2468, "step": 1686 }, { "epoch": 0.34142886055454363, "grad_norm": 0.3306484520435333, "learning_rate": 0.00018605946514034915, "loss": 0.2483, "step": 1687 }, { "epoch": 0.3416312487350739, "grad_norm": 0.2926234006881714, "learning_rate": 0.00018604326170467035, "loss": 0.2282, "step": 1688 }, { "epoch": 0.34183363691560414, "grad_norm": 0.6779906749725342, "learning_rate": 0.00018602704956403716, "loss": 0.2533, "step": 1689 }, { "epoch": 0.3420360250961344, "grad_norm": 
0.3214509189128876, "learning_rate": 0.00018601082872008977, "loss": 0.2078, "step": 1690 }, { "epoch": 0.34223841327666465, "grad_norm": 0.2985462248325348, "learning_rate": 0.00018599459917446924, "loss": 0.2648, "step": 1691 }, { "epoch": 0.3424408014571949, "grad_norm": 0.503711462020874, "learning_rate": 0.0001859783609288175, "loss": 0.2725, "step": 1692 }, { "epoch": 0.34264318963772517, "grad_norm": 0.3204715847969055, "learning_rate": 0.0001859621139847773, "loss": 0.2021, "step": 1693 }, { "epoch": 0.3428455778182554, "grad_norm": 0.28608301281929016, "learning_rate": 0.0001859458583439925, "loss": 0.2391, "step": 1694 }, { "epoch": 0.3430479659987857, "grad_norm": 0.3452533483505249, "learning_rate": 0.0001859295940081075, "loss": 0.2597, "step": 1695 }, { "epoch": 0.34325035417931593, "grad_norm": 0.3648858368396759, "learning_rate": 0.00018591332097876782, "loss": 0.2276, "step": 1696 }, { "epoch": 0.3434527423598462, "grad_norm": 0.2962428331375122, "learning_rate": 0.00018589703925761986, "loss": 0.2382, "step": 1697 }, { "epoch": 0.34365513054037644, "grad_norm": 0.33181461691856384, "learning_rate": 0.00018588074884631076, "loss": 0.2436, "step": 1698 }, { "epoch": 0.3438575187209067, "grad_norm": 0.32770097255706787, "learning_rate": 0.00018586444974648858, "loss": 0.241, "step": 1699 }, { "epoch": 0.34405990690143695, "grad_norm": 0.23778030276298523, "learning_rate": 0.00018584814195980238, "loss": 0.2011, "step": 1700 }, { "epoch": 0.34405990690143695, "eval_loss": 0.290479451417923, "eval_runtime": 0.7406, "eval_samples_per_second": 6.752, "eval_steps_per_second": 1.35, "step": 1700 }, { "epoch": 0.3442622950819672, "grad_norm": 0.40289005637168884, "learning_rate": 0.00018583182548790196, "loss": 0.2699, "step": 1701 }, { "epoch": 0.34446468326249746, "grad_norm": 0.46312880516052246, "learning_rate": 0.00018581550033243806, "loss": 0.2294, "step": 1702 }, { "epoch": 0.3446670714430277, "grad_norm": 0.35846683382987976, "learning_rate": 
0.00018579916649506229, "loss": 0.2481, "step": 1703 }, { "epoch": 0.344869459623558, "grad_norm": 0.5839173197746277, "learning_rate": 0.00018578282397742712, "loss": 0.2502, "step": 1704 }, { "epoch": 0.3450718478040882, "grad_norm": 0.3601457476615906, "learning_rate": 0.00018576647278118594, "loss": 0.2289, "step": 1705 }, { "epoch": 0.3452742359846185, "grad_norm": 0.32447540760040283, "learning_rate": 0.000185750112907993, "loss": 0.2528, "step": 1706 }, { "epoch": 0.34547662416514874, "grad_norm": 0.3305363059043884, "learning_rate": 0.0001857337443595034, "loss": 0.302, "step": 1707 }, { "epoch": 0.345679012345679, "grad_norm": 0.27065616846084595, "learning_rate": 0.00018571736713737314, "loss": 0.2285, "step": 1708 }, { "epoch": 0.34588140052620925, "grad_norm": 0.32808080315589905, "learning_rate": 0.00018570098124325908, "loss": 0.2727, "step": 1709 }, { "epoch": 0.3460837887067395, "grad_norm": 0.29432907700538635, "learning_rate": 0.00018568458667881895, "loss": 0.2145, "step": 1710 }, { "epoch": 0.34628617688726976, "grad_norm": 0.3078475296497345, "learning_rate": 0.00018566818344571147, "loss": 0.2661, "step": 1711 }, { "epoch": 0.3464885650678, "grad_norm": 0.3330211341381073, "learning_rate": 0.0001856517715455961, "loss": 0.2525, "step": 1712 }, { "epoch": 0.3466909532483303, "grad_norm": 0.3153764605522156, "learning_rate": 0.0001856353509801332, "loss": 0.2511, "step": 1713 }, { "epoch": 0.3468933414288606, "grad_norm": 0.3954264521598816, "learning_rate": 0.00018561892175098405, "loss": 0.2523, "step": 1714 }, { "epoch": 0.34709572960939084, "grad_norm": 0.2736445367336273, "learning_rate": 0.0001856024838598108, "loss": 0.2146, "step": 1715 }, { "epoch": 0.3472981177899211, "grad_norm": 0.3401740789413452, "learning_rate": 0.0001855860373082764, "loss": 0.2577, "step": 1716 }, { "epoch": 0.34750050597045135, "grad_norm": 0.2959592044353485, "learning_rate": 0.00018556958209804485, "loss": 0.2301, "step": 1717 }, { "epoch": 
0.3477028941509816, "grad_norm": 0.28072482347488403, "learning_rate": 0.00018555311823078083, "loss": 0.2558, "step": 1718 }, { "epoch": 0.34790528233151186, "grad_norm": 0.3037206530570984, "learning_rate": 0.00018553664570815001, "loss": 0.264, "step": 1719 }, { "epoch": 0.3481076705120421, "grad_norm": 0.27355295419692993, "learning_rate": 0.0001855201645318189, "loss": 0.2362, "step": 1720 }, { "epoch": 0.34831005869257237, "grad_norm": 0.35578373074531555, "learning_rate": 0.0001855036747034549, "loss": 0.2477, "step": 1721 }, { "epoch": 0.3485124468731026, "grad_norm": 0.4559978246688843, "learning_rate": 0.00018548717622472627, "loss": 0.2813, "step": 1722 }, { "epoch": 0.3487148350536329, "grad_norm": 0.42498183250427246, "learning_rate": 0.00018547066909730214, "loss": 0.2262, "step": 1723 }, { "epoch": 0.34891722323416313, "grad_norm": 0.5447008013725281, "learning_rate": 0.00018545415332285256, "loss": 0.2371, "step": 1724 }, { "epoch": 0.3491196114146934, "grad_norm": 0.3454398214817047, "learning_rate": 0.00018543762890304842, "loss": 0.2485, "step": 1725 }, { "epoch": 0.34932199959522364, "grad_norm": 0.3223482072353363, "learning_rate": 0.00018542109583956148, "loss": 0.2096, "step": 1726 }, { "epoch": 0.3495243877757539, "grad_norm": 0.31731921434402466, "learning_rate": 0.00018540455413406433, "loss": 0.2593, "step": 1727 }, { "epoch": 0.34972677595628415, "grad_norm": 0.33122220635414124, "learning_rate": 0.00018538800378823056, "loss": 0.2643, "step": 1728 }, { "epoch": 0.3499291641368144, "grad_norm": 0.2749335467815399, "learning_rate": 0.00018537144480373455, "loss": 0.2337, "step": 1729 }, { "epoch": 0.35013155231734466, "grad_norm": 0.39072299003601074, "learning_rate": 0.00018535487718225152, "loss": 0.2268, "step": 1730 }, { "epoch": 0.3503339404978749, "grad_norm": 0.32879638671875, "learning_rate": 0.00018533830092545763, "loss": 0.2519, "step": 1731 }, { "epoch": 0.3505363286784052, "grad_norm": 0.340533584356308, "learning_rate": 
0.00018532171603502992, "loss": 0.2244, "step": 1732 }, { "epoch": 0.35073871685893543, "grad_norm": 0.2613264322280884, "learning_rate": 0.00018530512251264624, "loss": 0.246, "step": 1733 }, { "epoch": 0.3509411050394657, "grad_norm": 0.33618271350860596, "learning_rate": 0.00018528852035998536, "loss": 0.2353, "step": 1734 }, { "epoch": 0.35114349321999594, "grad_norm": 0.36929893493652344, "learning_rate": 0.00018527190957872694, "loss": 0.272, "step": 1735 }, { "epoch": 0.3513458814005262, "grad_norm": 0.3761570453643799, "learning_rate": 0.00018525529017055143, "loss": 0.2537, "step": 1736 }, { "epoch": 0.35154826958105645, "grad_norm": 0.2979852855205536, "learning_rate": 0.00018523866213714023, "loss": 0.2541, "step": 1737 }, { "epoch": 0.3517506577615867, "grad_norm": 0.30869945883750916, "learning_rate": 0.0001852220254801756, "loss": 0.2288, "step": 1738 }, { "epoch": 0.35195304594211696, "grad_norm": 0.2735971510410309, "learning_rate": 0.00018520538020134065, "loss": 0.2211, "step": 1739 }, { "epoch": 0.3521554341226472, "grad_norm": 0.38951921463012695, "learning_rate": 0.0001851887263023194, "loss": 0.2706, "step": 1740 }, { "epoch": 0.35235782230317747, "grad_norm": 0.32086440920829773, "learning_rate": 0.00018517206378479667, "loss": 0.27, "step": 1741 }, { "epoch": 0.3525602104837077, "grad_norm": 0.3548159897327423, "learning_rate": 0.00018515539265045826, "loss": 0.2511, "step": 1742 }, { "epoch": 0.35276259866423804, "grad_norm": 0.2878701090812683, "learning_rate": 0.00018513871290099074, "loss": 0.2381, "step": 1743 }, { "epoch": 0.3529649868447683, "grad_norm": 0.3361879587173462, "learning_rate": 0.00018512202453808158, "loss": 0.2539, "step": 1744 }, { "epoch": 0.35316737502529855, "grad_norm": 0.31610074639320374, "learning_rate": 0.00018510532756341918, "loss": 0.2477, "step": 1745 }, { "epoch": 0.3533697632058288, "grad_norm": 0.3475480079650879, "learning_rate": 0.00018508862197869273, "loss": 0.2321, "step": 1746 }, { "epoch": 
0.35357215138635906, "grad_norm": 0.3138309121131897, "learning_rate": 0.0001850719077855923, "loss": 0.23, "step": 1747 }, { "epoch": 0.3537745395668893, "grad_norm": 0.3435781002044678, "learning_rate": 0.00018505518498580892, "loss": 0.2569, "step": 1748 }, { "epoch": 0.35397692774741957, "grad_norm": 0.3510514497756958, "learning_rate": 0.00018503845358103438, "loss": 0.2504, "step": 1749 }, { "epoch": 0.3541793159279498, "grad_norm": 0.3832964599132538, "learning_rate": 0.00018502171357296144, "loss": 0.2639, "step": 1750 }, { "epoch": 0.3541793159279498, "eval_loss": 0.28545942902565, "eval_runtime": 0.7382, "eval_samples_per_second": 6.773, "eval_steps_per_second": 1.355, "step": 1750 }, { "epoch": 0.3543817041084801, "grad_norm": 0.932768702507019, "learning_rate": 0.0001850049649632836, "loss": 0.2413, "step": 1751 }, { "epoch": 0.35458409228901033, "grad_norm": 0.30881059169769287, "learning_rate": 0.00018498820775369538, "loss": 0.2755, "step": 1752 }, { "epoch": 0.3547864804695406, "grad_norm": 0.35876527428627014, "learning_rate": 0.00018497144194589207, "loss": 0.2547, "step": 1753 }, { "epoch": 0.35498886865007084, "grad_norm": 0.45719102025032043, "learning_rate": 0.00018495466754156982, "loss": 0.2675, "step": 1754 }, { "epoch": 0.3551912568306011, "grad_norm": 0.3853405714035034, "learning_rate": 0.00018493788454242575, "loss": 0.2257, "step": 1755 }, { "epoch": 0.35539364501113135, "grad_norm": 0.4291214346885681, "learning_rate": 0.00018492109295015777, "loss": 0.2542, "step": 1756 }, { "epoch": 0.3555960331916616, "grad_norm": 0.3692077398300171, "learning_rate": 0.0001849042927664647, "loss": 0.2405, "step": 1757 }, { "epoch": 0.35579842137219186, "grad_norm": 0.36899515986442566, "learning_rate": 0.00018488748399304617, "loss": 0.2731, "step": 1758 }, { "epoch": 0.3560008095527221, "grad_norm": 0.28955820202827454, "learning_rate": 0.00018487066663160269, "loss": 0.2448, "step": 1759 }, { "epoch": 0.3562031977332524, "grad_norm": 
0.27147674560546875, "learning_rate": 0.0001848538406838357, "loss": 0.2389, "step": 1760 }, { "epoch": 0.35640558591378263, "grad_norm": 0.38604146242141724, "learning_rate": 0.0001848370061514475, "loss": 0.2672, "step": 1761 }, { "epoch": 0.3566079740943129, "grad_norm": 0.3210741877555847, "learning_rate": 0.0001848201630361412, "loss": 0.2054, "step": 1762 }, { "epoch": 0.35681036227484314, "grad_norm": 0.312847763299942, "learning_rate": 0.0001848033113396208, "loss": 0.2201, "step": 1763 }, { "epoch": 0.3570127504553734, "grad_norm": 0.38909921050071716, "learning_rate": 0.00018478645106359117, "loss": 0.2775, "step": 1764 }, { "epoch": 0.35721513863590365, "grad_norm": 0.35675913095474243, "learning_rate": 0.0001847695822097581, "loss": 0.2529, "step": 1765 }, { "epoch": 0.3574175268164339, "grad_norm": 0.31625452637672424, "learning_rate": 0.0001847527047798282, "loss": 0.247, "step": 1766 }, { "epoch": 0.35761991499696416, "grad_norm": 0.31915339827537537, "learning_rate": 0.00018473581877550887, "loss": 0.2495, "step": 1767 }, { "epoch": 0.3578223031774944, "grad_norm": 0.2950085997581482, "learning_rate": 0.00018471892419850855, "loss": 0.2239, "step": 1768 }, { "epoch": 0.35802469135802467, "grad_norm": 0.35980451107025146, "learning_rate": 0.00018470202105053644, "loss": 0.2518, "step": 1769 }, { "epoch": 0.3582270795385549, "grad_norm": 0.40969300270080566, "learning_rate": 0.0001846851093333026, "loss": 0.2701, "step": 1770 }, { "epoch": 0.3584294677190852, "grad_norm": 0.2965695559978485, "learning_rate": 0.000184668189048518, "loss": 0.2833, "step": 1771 }, { "epoch": 0.35863185589961544, "grad_norm": 0.28182271122932434, "learning_rate": 0.00018465126019789443, "loss": 0.2136, "step": 1772 }, { "epoch": 0.3588342440801457, "grad_norm": 0.3636777698993683, "learning_rate": 0.0001846343227831446, "loss": 0.2936, "step": 1773 }, { "epoch": 0.359036632260676, "grad_norm": 0.3793594539165497, "learning_rate": 0.00018461737680598202, "loss": 0.2448, 
"step": 1774 }, { "epoch": 0.35923902044120626, "grad_norm": 0.3261548578739166, "learning_rate": 0.00018460042226812115, "loss": 0.2335, "step": 1775 }, { "epoch": 0.3594414086217365, "grad_norm": 0.2846795916557312, "learning_rate": 0.00018458345917127727, "loss": 0.247, "step": 1776 }, { "epoch": 0.35964379680226677, "grad_norm": 0.32100164890289307, "learning_rate": 0.0001845664875171665, "loss": 0.2537, "step": 1777 }, { "epoch": 0.359846184982797, "grad_norm": 0.2889302372932434, "learning_rate": 0.00018454950730750587, "loss": 0.2233, "step": 1778 }, { "epoch": 0.3600485731633273, "grad_norm": 0.36676448583602905, "learning_rate": 0.00018453251854401326, "loss": 0.2355, "step": 1779 }, { "epoch": 0.36025096134385753, "grad_norm": 0.3235335052013397, "learning_rate": 0.00018451552122840742, "loss": 0.2381, "step": 1780 }, { "epoch": 0.3604533495243878, "grad_norm": 0.30230575799942017, "learning_rate": 0.00018449851536240798, "loss": 0.2659, "step": 1781 }, { "epoch": 0.36065573770491804, "grad_norm": 0.4702809751033783, "learning_rate": 0.00018448150094773532, "loss": 0.2251, "step": 1782 }, { "epoch": 0.3608581258854483, "grad_norm": 0.2967616021633148, "learning_rate": 0.00018446447798611088, "loss": 0.2537, "step": 1783 }, { "epoch": 0.36106051406597856, "grad_norm": 0.296118825674057, "learning_rate": 0.00018444744647925685, "loss": 0.258, "step": 1784 }, { "epoch": 0.3612629022465088, "grad_norm": 0.35841524600982666, "learning_rate": 0.00018443040642889628, "loss": 0.2287, "step": 1785 }, { "epoch": 0.36146529042703907, "grad_norm": 0.2674766182899475, "learning_rate": 0.00018441335783675312, "loss": 0.2381, "step": 1786 }, { "epoch": 0.3616676786075693, "grad_norm": 0.305122435092926, "learning_rate": 0.0001843963007045521, "loss": 0.2657, "step": 1787 }, { "epoch": 0.3618700667880996, "grad_norm": 0.35348016023635864, "learning_rate": 0.00018437923503401897, "loss": 0.2608, "step": 1788 }, { "epoch": 0.36207245496862983, "grad_norm": 
0.3226334750652313, "learning_rate": 0.00018436216082688022, "loss": 0.2273, "step": 1789 }, { "epoch": 0.3622748431491601, "grad_norm": 0.24315498769283295, "learning_rate": 0.00018434507808486324, "loss": 0.2292, "step": 1790 }, { "epoch": 0.36247723132969034, "grad_norm": 0.3406563103199005, "learning_rate": 0.00018432798680969627, "loss": 0.251, "step": 1791 }, { "epoch": 0.3626796195102206, "grad_norm": 0.3662257194519043, "learning_rate": 0.00018431088700310844, "loss": 0.2665, "step": 1792 }, { "epoch": 0.36288200769075085, "grad_norm": 0.2890268862247467, "learning_rate": 0.00018429377866682972, "loss": 0.2508, "step": 1793 }, { "epoch": 0.3630843958712811, "grad_norm": 0.2680732011795044, "learning_rate": 0.000184276661802591, "loss": 0.2556, "step": 1794 }, { "epoch": 0.36328678405181136, "grad_norm": 0.42836645245552063, "learning_rate": 0.00018425953641212393, "loss": 0.2372, "step": 1795 }, { "epoch": 0.3634891722323416, "grad_norm": 0.4649638533592224, "learning_rate": 0.00018424240249716108, "loss": 0.2675, "step": 1796 }, { "epoch": 0.3636915604128719, "grad_norm": 0.2570657432079315, "learning_rate": 0.00018422526005943586, "loss": 0.2549, "step": 1797 }, { "epoch": 0.36389394859340213, "grad_norm": 0.3064950406551361, "learning_rate": 0.00018420810910068264, "loss": 0.2694, "step": 1798 }, { "epoch": 0.3640963367739324, "grad_norm": 0.28641650080680847, "learning_rate": 0.0001841909496226365, "loss": 0.2657, "step": 1799 }, { "epoch": 0.36429872495446264, "grad_norm": 0.2810456156730652, "learning_rate": 0.00018417378162703348, "loss": 0.2133, "step": 1800 }, { "epoch": 0.36429872495446264, "eval_loss": 0.28418654203414917, "eval_runtime": 0.7405, "eval_samples_per_second": 6.752, "eval_steps_per_second": 1.35, "step": 1800 }, { "epoch": 0.3645011131349929, "grad_norm": 0.2836998403072357, "learning_rate": 0.00018415660511561047, "loss": 0.2408, "step": 1801 }, { "epoch": 0.36470350131552315, "grad_norm": 0.26643380522727966, "learning_rate": 
0.00018413942009010522, "loss": 0.2094, "step": 1802 }, { "epoch": 0.3649058894960534, "grad_norm": 0.2961254417896271, "learning_rate": 0.00018412222655225628, "loss": 0.2621, "step": 1803 }, { "epoch": 0.3651082776765837, "grad_norm": 0.3298720121383667, "learning_rate": 0.00018410502450380315, "loss": 0.2449, "step": 1804 }, { "epoch": 0.36531066585711397, "grad_norm": 0.34053587913513184, "learning_rate": 0.00018408781394648615, "loss": 0.2536, "step": 1805 }, { "epoch": 0.3655130540376442, "grad_norm": 0.3451336622238159, "learning_rate": 0.00018407059488204645, "loss": 0.2532, "step": 1806 }, { "epoch": 0.3657154422181745, "grad_norm": 0.33622947335243225, "learning_rate": 0.00018405336731222612, "loss": 0.2062, "step": 1807 }, { "epoch": 0.36591783039870474, "grad_norm": 0.30621781945228577, "learning_rate": 0.00018403613123876803, "loss": 0.241, "step": 1808 }, { "epoch": 0.366120218579235, "grad_norm": 0.32667985558509827, "learning_rate": 0.000184018886663416, "loss": 0.2363, "step": 1809 }, { "epoch": 0.36632260675976525, "grad_norm": 0.31269723176956177, "learning_rate": 0.00018400163358791454, "loss": 0.2572, "step": 1810 }, { "epoch": 0.3665249949402955, "grad_norm": 0.31308773159980774, "learning_rate": 0.00018398437201400927, "loss": 0.2629, "step": 1811 }, { "epoch": 0.36672738312082576, "grad_norm": 0.23163512349128723, "learning_rate": 0.0001839671019434465, "loss": 0.1939, "step": 1812 }, { "epoch": 0.366929771301356, "grad_norm": 0.2652641832828522, "learning_rate": 0.00018394982337797337, "loss": 0.2445, "step": 1813 }, { "epoch": 0.36713215948188627, "grad_norm": 0.4329833388328552, "learning_rate": 0.00018393253631933797, "loss": 0.2726, "step": 1814 }, { "epoch": 0.3673345476624165, "grad_norm": 0.44865334033966064, "learning_rate": 0.0001839152407692893, "loss": 0.2265, "step": 1815 }, { "epoch": 0.3675369358429468, "grad_norm": 0.26356157660484314, "learning_rate": 0.000183897936729577, "loss": 0.214, "step": 1816 }, { "epoch": 
0.36773932402347703, "grad_norm": 0.2965501844882965, "learning_rate": 0.00018388062420195188, "loss": 0.1911, "step": 1817 }, { "epoch": 0.3679417122040073, "grad_norm": 0.2967815399169922, "learning_rate": 0.00018386330318816529, "loss": 0.2478, "step": 1818 }, { "epoch": 0.36814410038453754, "grad_norm": 0.37267395853996277, "learning_rate": 0.00018384597368996966, "loss": 0.2583, "step": 1819 }, { "epoch": 0.3683464885650678, "grad_norm": 0.36238357424736023, "learning_rate": 0.00018382863570911822, "loss": 0.2703, "step": 1820 }, { "epoch": 0.36854887674559805, "grad_norm": 0.3597848415374756, "learning_rate": 0.00018381128924736502, "loss": 0.277, "step": 1821 }, { "epoch": 0.3687512649261283, "grad_norm": 0.37893056869506836, "learning_rate": 0.00018379393430646498, "loss": 0.2414, "step": 1822 }, { "epoch": 0.36895365310665856, "grad_norm": 0.2763799726963043, "learning_rate": 0.00018377657088817392, "loss": 0.2366, "step": 1823 }, { "epoch": 0.3691560412871888, "grad_norm": 0.32323357462882996, "learning_rate": 0.00018375919899424846, "loss": 0.2512, "step": 1824 }, { "epoch": 0.3693584294677191, "grad_norm": 0.2714237868785858, "learning_rate": 0.00018374181862644613, "loss": 0.242, "step": 1825 }, { "epoch": 0.36956081764824933, "grad_norm": 0.557920515537262, "learning_rate": 0.00018372442978652532, "loss": 0.2205, "step": 1826 }, { "epoch": 0.3697632058287796, "grad_norm": 0.2829962372779846, "learning_rate": 0.00018370703247624516, "loss": 0.2467, "step": 1827 }, { "epoch": 0.36996559400930984, "grad_norm": 0.2736676037311554, "learning_rate": 0.00018368962669736578, "loss": 0.2482, "step": 1828 }, { "epoch": 0.3701679821898401, "grad_norm": 0.3360370397567749, "learning_rate": 0.00018367221245164816, "loss": 0.2753, "step": 1829 }, { "epoch": 0.37037037037037035, "grad_norm": 0.40492188930511475, "learning_rate": 0.000183654789740854, "loss": 0.2671, "step": 1830 }, { "epoch": 0.3705727585509006, "grad_norm": 0.3024072051048279, "learning_rate": 
0.00018363735856674604, "loss": 0.2373, "step": 1831 }, { "epoch": 0.37077514673143086, "grad_norm": 0.3565376102924347, "learning_rate": 0.0001836199189310877, "loss": 0.2321, "step": 1832 }, { "epoch": 0.3709775349119611, "grad_norm": 0.4333427846431732, "learning_rate": 0.00018360247083564342, "loss": 0.2743, "step": 1833 }, { "epoch": 0.3711799230924914, "grad_norm": 0.3357686996459961, "learning_rate": 0.00018358501428217833, "loss": 0.2359, "step": 1834 }, { "epoch": 0.3713823112730217, "grad_norm": 0.30689799785614014, "learning_rate": 0.00018356754927245856, "loss": 0.258, "step": 1835 }, { "epoch": 0.37158469945355194, "grad_norm": 0.7203855514526367, "learning_rate": 0.00018355007580825102, "loss": 0.2673, "step": 1836 }, { "epoch": 0.3717870876340822, "grad_norm": 0.5838192701339722, "learning_rate": 0.0001835325938913235, "loss": 0.2674, "step": 1837 }, { "epoch": 0.37198947581461245, "grad_norm": 0.2569965124130249, "learning_rate": 0.0001835151035234446, "loss": 0.229, "step": 1838 }, { "epoch": 0.3721918639951427, "grad_norm": 0.3677656352519989, "learning_rate": 0.00018349760470638384, "loss": 0.2847, "step": 1839 }, { "epoch": 0.37239425217567296, "grad_norm": 0.32607802748680115, "learning_rate": 0.00018348009744191158, "loss": 0.2258, "step": 1840 }, { "epoch": 0.3725966403562032, "grad_norm": 0.8226847648620605, "learning_rate": 0.00018346258173179903, "loss": 0.2736, "step": 1841 }, { "epoch": 0.37279902853673347, "grad_norm": 0.33003148436546326, "learning_rate": 0.00018344505757781818, "loss": 0.2587, "step": 1842 }, { "epoch": 0.3730014167172637, "grad_norm": 0.6836985945701599, "learning_rate": 0.000183427524981742, "loss": 0.2281, "step": 1843 }, { "epoch": 0.373203804897794, "grad_norm": 0.4346601963043213, "learning_rate": 0.00018340998394534425, "loss": 0.2798, "step": 1844 }, { "epoch": 0.37340619307832423, "grad_norm": 0.28121423721313477, "learning_rate": 0.0001833924344703995, "loss": 0.2677, "step": 1845 }, { "epoch": 
0.3736085812588545, "grad_norm": 0.45256420969963074, "learning_rate": 0.00018337487655868331, "loss": 0.2561, "step": 1846 }, { "epoch": 0.37381096943938474, "grad_norm": 0.5170138478279114, "learning_rate": 0.00018335731021197193, "loss": 0.3101, "step": 1847 }, { "epoch": 0.374013357619915, "grad_norm": 0.3438572585582733, "learning_rate": 0.00018333973543204255, "loss": 0.2484, "step": 1848 }, { "epoch": 0.37421574580044525, "grad_norm": 0.6279707551002502, "learning_rate": 0.00018332215222067322, "loss": 0.2347, "step": 1849 }, { "epoch": 0.3744181339809755, "grad_norm": 0.3407716155052185, "learning_rate": 0.0001833045605796428, "loss": 0.2515, "step": 1850 }, { "epoch": 0.3744181339809755, "eval_loss": 0.28794625401496887, "eval_runtime": 0.7392, "eval_samples_per_second": 6.764, "eval_steps_per_second": 1.353, "step": 1850 }, { "epoch": 0.37462052216150576, "grad_norm": 0.2511065900325775, "learning_rate": 0.00018328696051073107, "loss": 0.2173, "step": 1851 }, { "epoch": 0.374822910342036, "grad_norm": 0.2791818380355835, "learning_rate": 0.00018326935201571859, "loss": 0.2783, "step": 1852 }, { "epoch": 0.3750252985225663, "grad_norm": 0.3694707155227661, "learning_rate": 0.0001832517350963868, "loss": 0.2367, "step": 1853 }, { "epoch": 0.37522768670309653, "grad_norm": 0.4342188537120819, "learning_rate": 0.000183234109754518, "loss": 0.2433, "step": 1854 }, { "epoch": 0.3754300748836268, "grad_norm": 0.35381773114204407, "learning_rate": 0.00018321647599189538, "loss": 0.2492, "step": 1855 }, { "epoch": 0.37563246306415704, "grad_norm": 0.3225228488445282, "learning_rate": 0.00018319883381030287, "loss": 0.214, "step": 1856 }, { "epoch": 0.3758348512446873, "grad_norm": 0.3866134285926819, "learning_rate": 0.00018318118321152533, "loss": 0.2737, "step": 1857 }, { "epoch": 0.37603723942521755, "grad_norm": 0.28414440155029297, "learning_rate": 0.00018316352419734853, "loss": 0.2119, "step": 1858 }, { "epoch": 0.3762396276057478, "grad_norm": 
0.3543158173561096, "learning_rate": 0.00018314585676955893, "loss": 0.2344, "step": 1859 }, { "epoch": 0.37644201578627806, "grad_norm": 0.37424933910369873, "learning_rate": 0.00018312818092994403, "loss": 0.2577, "step": 1860 }, { "epoch": 0.3766444039668083, "grad_norm": 0.48105794191360474, "learning_rate": 0.00018311049668029197, "loss": 0.2546, "step": 1861 }, { "epoch": 0.37684679214733857, "grad_norm": 0.3142531216144562, "learning_rate": 0.000183092804022392, "loss": 0.2466, "step": 1862 }, { "epoch": 0.3770491803278688, "grad_norm": 0.3777507245540619, "learning_rate": 0.00018307510295803396, "loss": 0.2528, "step": 1863 }, { "epoch": 0.3772515685083991, "grad_norm": 0.35614216327667236, "learning_rate": 0.0001830573934890087, "loss": 0.2479, "step": 1864 }, { "epoch": 0.3774539566889294, "grad_norm": 0.40795329213142395, "learning_rate": 0.00018303967561710788, "loss": 0.2684, "step": 1865 }, { "epoch": 0.37765634486945965, "grad_norm": 0.3819682002067566, "learning_rate": 0.000183021949344124, "loss": 0.2542, "step": 1866 }, { "epoch": 0.3778587330499899, "grad_norm": 0.3602333962917328, "learning_rate": 0.00018300421467185046, "loss": 0.2412, "step": 1867 }, { "epoch": 0.37806112123052016, "grad_norm": 0.41661393642425537, "learning_rate": 0.0001829864716020814, "loss": 0.239, "step": 1868 }, { "epoch": 0.3782635094110504, "grad_norm": 0.40309882164001465, "learning_rate": 0.00018296872013661192, "loss": 0.2956, "step": 1869 }, { "epoch": 0.37846589759158067, "grad_norm": 0.3767727017402649, "learning_rate": 0.0001829509602772379, "loss": 0.2564, "step": 1870 }, { "epoch": 0.3786682857721109, "grad_norm": 0.3340590000152588, "learning_rate": 0.00018293319202575614, "loss": 0.2595, "step": 1871 }, { "epoch": 0.3788706739526412, "grad_norm": 0.4940049648284912, "learning_rate": 0.0001829154153839642, "loss": 0.2399, "step": 1872 }, { "epoch": 0.37907306213317143, "grad_norm": 0.2716989815235138, "learning_rate": 0.00018289763035366055, "loss": 0.2129, 
"step": 1873 }, { "epoch": 0.3792754503137017, "grad_norm": 0.32507073879241943, "learning_rate": 0.00018287983693664455, "loss": 0.2391, "step": 1874 }, { "epoch": 0.37947783849423194, "grad_norm": 0.3758697211742401, "learning_rate": 0.00018286203513471623, "loss": 0.241, "step": 1875 }, { "epoch": 0.3796802266747622, "grad_norm": 0.3964853584766388, "learning_rate": 0.00018284422494967668, "loss": 0.2686, "step": 1876 }, { "epoch": 0.37988261485529246, "grad_norm": 0.5915670394897461, "learning_rate": 0.00018282640638332773, "loss": 0.2551, "step": 1877 }, { "epoch": 0.3800850030358227, "grad_norm": 0.3817998468875885, "learning_rate": 0.00018280857943747206, "loss": 0.2885, "step": 1878 }, { "epoch": 0.38028739121635297, "grad_norm": 0.31637680530548096, "learning_rate": 0.00018279074411391323, "loss": 0.2691, "step": 1879 }, { "epoch": 0.3804897793968832, "grad_norm": 0.3863159120082855, "learning_rate": 0.00018277290041445563, "loss": 0.2769, "step": 1880 }, { "epoch": 0.3806921675774135, "grad_norm": 0.28158068656921387, "learning_rate": 0.0001827550483409045, "loss": 0.2668, "step": 1881 }, { "epoch": 0.38089455575794373, "grad_norm": 0.39321863651275635, "learning_rate": 0.0001827371878950659, "loss": 0.2705, "step": 1882 }, { "epoch": 0.381096943938474, "grad_norm": 0.3414781987667084, "learning_rate": 0.00018271931907874677, "loss": 0.2484, "step": 1883 }, { "epoch": 0.38129933211900424, "grad_norm": 0.2941926419734955, "learning_rate": 0.00018270144189375492, "loss": 0.229, "step": 1884 }, { "epoch": 0.3815017202995345, "grad_norm": 0.30828848481178284, "learning_rate": 0.00018268355634189893, "loss": 0.2382, "step": 1885 }, { "epoch": 0.38170410848006475, "grad_norm": 0.3570677936077118, "learning_rate": 0.00018266566242498833, "loss": 0.2914, "step": 1886 }, { "epoch": 0.381906496660595, "grad_norm": 0.33216559886932373, "learning_rate": 0.0001826477601448334, "loss": 0.2525, "step": 1887 }, { "epoch": 0.38210888484112526, "grad_norm": 
0.37334170937538147, "learning_rate": 0.0001826298495032453, "loss": 0.2152, "step": 1888 }, { "epoch": 0.3823112730216555, "grad_norm": 0.3412468731403351, "learning_rate": 0.00018261193050203605, "loss": 0.2453, "step": 1889 }, { "epoch": 0.3825136612021858, "grad_norm": 0.255756676197052, "learning_rate": 0.0001825940031430185, "loss": 0.2203, "step": 1890 }, { "epoch": 0.38271604938271603, "grad_norm": 0.29467645287513733, "learning_rate": 0.0001825760674280064, "loss": 0.2491, "step": 1891 }, { "epoch": 0.3829184375632463, "grad_norm": 0.4214814305305481, "learning_rate": 0.00018255812335881425, "loss": 0.273, "step": 1892 }, { "epoch": 0.38312082574377654, "grad_norm": 0.43483030796051025, "learning_rate": 0.0001825401709372574, "loss": 0.2466, "step": 1893 }, { "epoch": 0.3833232139243068, "grad_norm": 0.31902021169662476, "learning_rate": 0.0001825222101651522, "loss": 0.2536, "step": 1894 }, { "epoch": 0.3835256021048371, "grad_norm": 0.39684993028640747, "learning_rate": 0.00018250424104431564, "loss": 0.2377, "step": 1895 }, { "epoch": 0.38372799028536736, "grad_norm": 0.5531018376350403, "learning_rate": 0.00018248626357656567, "loss": 0.241, "step": 1896 }, { "epoch": 0.3839303784658976, "grad_norm": 0.3912695348262787, "learning_rate": 0.0001824682777637211, "loss": 0.2672, "step": 1897 }, { "epoch": 0.38413276664642787, "grad_norm": 0.2787422835826874, "learning_rate": 0.0001824502836076015, "loss": 0.2097, "step": 1898 }, { "epoch": 0.3843351548269581, "grad_norm": 0.28876063227653503, "learning_rate": 0.00018243228111002732, "loss": 0.2439, "step": 1899 }, { "epoch": 0.3845375430074884, "grad_norm": 0.3721954822540283, "learning_rate": 0.00018241427027281993, "loss": 0.2368, "step": 1900 }, { "epoch": 0.3845375430074884, "eval_loss": 0.2891212999820709, "eval_runtime": 0.7385, "eval_samples_per_second": 6.77, "eval_steps_per_second": 1.354, "step": 1900 }, { "epoch": 0.38473993118801864, "grad_norm": 0.43678271770477295, "learning_rate": 
0.00018239625109780144, "loss": 0.2922, "step": 1901 }, { "epoch": 0.3849423193685489, "grad_norm": 0.34848496317863464, "learning_rate": 0.0001823782235867948, "loss": 0.2649, "step": 1902 }, { "epoch": 0.38514470754907915, "grad_norm": 0.4435858726501465, "learning_rate": 0.00018236018774162388, "loss": 0.2728, "step": 1903 }, { "epoch": 0.3853470957296094, "grad_norm": 0.44345182180404663, "learning_rate": 0.00018234214356411342, "loss": 0.2716, "step": 1904 }, { "epoch": 0.38554948391013966, "grad_norm": 0.38408511877059937, "learning_rate": 0.00018232409105608884, "loss": 0.2434, "step": 1905 }, { "epoch": 0.3857518720906699, "grad_norm": 0.2690526843070984, "learning_rate": 0.0001823060302193765, "loss": 0.2581, "step": 1906 }, { "epoch": 0.38595426027120017, "grad_norm": 0.32186561822891235, "learning_rate": 0.00018228796105580373, "loss": 0.254, "step": 1907 }, { "epoch": 0.3861566484517304, "grad_norm": 0.3032004237174988, "learning_rate": 0.00018226988356719845, "loss": 0.2353, "step": 1908 }, { "epoch": 0.3863590366322607, "grad_norm": 0.3549427390098572, "learning_rate": 0.0001822517977553896, "loss": 0.2407, "step": 1909 }, { "epoch": 0.38656142481279093, "grad_norm": 0.3449702858924866, "learning_rate": 0.00018223370362220696, "loss": 0.2516, "step": 1910 }, { "epoch": 0.3867638129933212, "grad_norm": 0.39067935943603516, "learning_rate": 0.00018221560116948103, "loss": 0.2618, "step": 1911 }, { "epoch": 0.38696620117385144, "grad_norm": 0.37322261929512024, "learning_rate": 0.00018219749039904322, "loss": 0.226, "step": 1912 }, { "epoch": 0.3871685893543817, "grad_norm": 0.4211459457874298, "learning_rate": 0.00018217937131272585, "loss": 0.2813, "step": 1913 }, { "epoch": 0.38737097753491195, "grad_norm": 0.3087145686149597, "learning_rate": 0.00018216124391236198, "loss": 0.2307, "step": 1914 }, { "epoch": 0.3875733657154422, "grad_norm": 0.3727726936340332, "learning_rate": 0.00018214310819978556, "loss": 0.265, "step": 1915 }, { "epoch": 
0.38777575389597246, "grad_norm": 0.3550623655319214, "learning_rate": 0.00018212496417683137, "loss": 0.2585, "step": 1916 }, { "epoch": 0.3879781420765027, "grad_norm": 0.30683794617652893, "learning_rate": 0.000182106811845335, "loss": 0.2422, "step": 1917 }, { "epoch": 0.388180530257033, "grad_norm": 0.4710239768028259, "learning_rate": 0.000182088651207133, "loss": 0.2756, "step": 1918 }, { "epoch": 0.38838291843756323, "grad_norm": 0.30055004358291626, "learning_rate": 0.0001820704822640626, "loss": 0.233, "step": 1919 }, { "epoch": 0.3885853066180935, "grad_norm": 0.33683738112449646, "learning_rate": 0.00018205230501796196, "loss": 0.2428, "step": 1920 }, { "epoch": 0.38878769479862374, "grad_norm": 0.25107863545417786, "learning_rate": 0.00018203411947067006, "loss": 0.2183, "step": 1921 }, { "epoch": 0.388990082979154, "grad_norm": 0.30623263120651245, "learning_rate": 0.00018201592562402672, "loss": 0.2546, "step": 1922 }, { "epoch": 0.38919247115968425, "grad_norm": 0.2566131353378296, "learning_rate": 0.0001819977234798726, "loss": 0.2203, "step": 1923 }, { "epoch": 0.3893948593402145, "grad_norm": 0.3132251501083374, "learning_rate": 0.00018197951304004922, "loss": 0.2521, "step": 1924 }, { "epoch": 0.3895972475207448, "grad_norm": 0.29413077235221863, "learning_rate": 0.00018196129430639896, "loss": 0.2248, "step": 1925 }, { "epoch": 0.38979963570127507, "grad_norm": 0.32894453406333923, "learning_rate": 0.0001819430672807649, "loss": 0.2401, "step": 1926 }, { "epoch": 0.3900020238818053, "grad_norm": 0.26279136538505554, "learning_rate": 0.0001819248319649911, "loss": 0.1968, "step": 1927 }, { "epoch": 0.3902044120623356, "grad_norm": 0.24191400408744812, "learning_rate": 0.00018190658836092244, "loss": 0.2482, "step": 1928 }, { "epoch": 0.39040680024286584, "grad_norm": 0.2685995399951935, "learning_rate": 0.00018188833647040463, "loss": 0.2259, "step": 1929 }, { "epoch": 0.3906091884233961, "grad_norm": 0.4994000196456909, "learning_rate": 
0.00018187007629528416, "loss": 0.2561, "step": 1930 }, { "epoch": 0.39081157660392635, "grad_norm": 0.29093116521835327, "learning_rate": 0.00018185180783740842, "loss": 0.2539, "step": 1931 }, { "epoch": 0.3910139647844566, "grad_norm": 0.49854952096939087, "learning_rate": 0.00018183353109862561, "loss": 0.2879, "step": 1932 }, { "epoch": 0.39121635296498686, "grad_norm": 0.3427339494228363, "learning_rate": 0.00018181524608078484, "loss": 0.2422, "step": 1933 }, { "epoch": 0.3914187411455171, "grad_norm": 0.32090428471565247, "learning_rate": 0.0001817969527857359, "loss": 0.2734, "step": 1934 }, { "epoch": 0.39162112932604737, "grad_norm": 0.33059003949165344, "learning_rate": 0.00018177865121532963, "loss": 0.2627, "step": 1935 }, { "epoch": 0.3918235175065776, "grad_norm": 0.3862765431404114, "learning_rate": 0.00018176034137141746, "loss": 0.2851, "step": 1936 }, { "epoch": 0.3920259056871079, "grad_norm": 0.31288978457450867, "learning_rate": 0.00018174202325585184, "loss": 0.234, "step": 1937 }, { "epoch": 0.39222829386763813, "grad_norm": 0.2682187855243683, "learning_rate": 0.00018172369687048608, "loss": 0.2177, "step": 1938 }, { "epoch": 0.3924306820481684, "grad_norm": 0.2934335172176361, "learning_rate": 0.00018170536221717416, "loss": 0.223, "step": 1939 }, { "epoch": 0.39263307022869864, "grad_norm": 0.5119560360908508, "learning_rate": 0.00018168701929777102, "loss": 0.2685, "step": 1940 }, { "epoch": 0.3928354584092289, "grad_norm": 0.889604926109314, "learning_rate": 0.00018166866811413236, "loss": 0.2363, "step": 1941 }, { "epoch": 0.39303784658975915, "grad_norm": 0.3027733862400055, "learning_rate": 0.00018165030866811486, "loss": 0.2128, "step": 1942 }, { "epoch": 0.3932402347702894, "grad_norm": 0.2894009053707123, "learning_rate": 0.00018163194096157582, "loss": 0.2565, "step": 1943 }, { "epoch": 0.39344262295081966, "grad_norm": 0.3722357451915741, "learning_rate": 0.0001816135649963736, "loss": 0.2587, "step": 1944 }, { "epoch": 
0.3936450111313499, "grad_norm": 0.4462873339653015, "learning_rate": 0.00018159518077436718, "loss": 0.3092, "step": 1945 }, { "epoch": 0.3938473993118802, "grad_norm": 0.5751633048057556, "learning_rate": 0.0001815767882974166, "loss": 0.2906, "step": 1946 }, { "epoch": 0.39404978749241043, "grad_norm": 0.47575053572654724, "learning_rate": 0.00018155838756738252, "loss": 0.2553, "step": 1947 }, { "epoch": 0.3942521756729407, "grad_norm": 2.4225292205810547, "learning_rate": 0.00018153997858612656, "loss": 0.2262, "step": 1948 }, { "epoch": 0.39445456385347094, "grad_norm": 0.41146135330200195, "learning_rate": 0.00018152156135551117, "loss": 0.2422, "step": 1949 }, { "epoch": 0.3946569520340012, "grad_norm": 0.3842962682247162, "learning_rate": 0.00018150313587739957, "loss": 0.2319, "step": 1950 }, { "epoch": 0.3946569520340012, "eval_loss": 0.29187315702438354, "eval_runtime": 0.7369, "eval_samples_per_second": 6.785, "eval_steps_per_second": 1.357, "step": 1950 }, { "epoch": 0.39485934021453145, "grad_norm": 0.3393075168132782, "learning_rate": 0.00018148470215365595, "loss": 0.2476, "step": 1951 }, { "epoch": 0.3950617283950617, "grad_norm": 0.7554660439491272, "learning_rate": 0.00018146626018614512, "loss": 0.2346, "step": 1952 }, { "epoch": 0.39526411657559196, "grad_norm": 0.2916422188282013, "learning_rate": 0.00018144780997673293, "loss": 0.2481, "step": 1953 }, { "epoch": 0.3954665047561222, "grad_norm": 0.4447515308856964, "learning_rate": 0.00018142935152728592, "loss": 0.2622, "step": 1954 }, { "epoch": 0.39566889293665247, "grad_norm": 0.3644491732120514, "learning_rate": 0.00018141088483967157, "loss": 0.2683, "step": 1955 }, { "epoch": 0.3958712811171828, "grad_norm": 0.43378445506095886, "learning_rate": 0.00018139240991575813, "loss": 0.2068, "step": 1956 }, { "epoch": 0.39607366929771304, "grad_norm": 0.3398344814777374, "learning_rate": 0.00018137392675741468, "loss": 0.1931, "step": 1957 }, { "epoch": 0.3962760574782433, "grad_norm": 
0.6017144322395325, "learning_rate": 0.0001813554353665112, "loss": 0.2184, "step": 1958 }, { "epoch": 0.39647844565877355, "grad_norm": 0.36190178990364075, "learning_rate": 0.00018133693574491836, "loss": 0.2764, "step": 1959 }, { "epoch": 0.3966808338393038, "grad_norm": 0.33858707547187805, "learning_rate": 0.00018131842789450786, "loss": 0.252, "step": 1960 }, { "epoch": 0.39688322201983406, "grad_norm": 0.3058442771434784, "learning_rate": 0.00018129991181715208, "loss": 0.19, "step": 1961 }, { "epoch": 0.3970856102003643, "grad_norm": 0.5234771370887756, "learning_rate": 0.00018128138751472432, "loss": 0.2875, "step": 1962 }, { "epoch": 0.39728799838089457, "grad_norm": 0.3918607532978058, "learning_rate": 0.00018126285498909863, "loss": 0.2565, "step": 1963 }, { "epoch": 0.3974903865614248, "grad_norm": 0.4046613872051239, "learning_rate": 0.00018124431424214996, "loss": 0.2803, "step": 1964 }, { "epoch": 0.3976927747419551, "grad_norm": 0.3878602087497711, "learning_rate": 0.00018122576527575404, "loss": 0.2708, "step": 1965 }, { "epoch": 0.39789516292248533, "grad_norm": 0.3302357792854309, "learning_rate": 0.0001812072080917875, "loss": 0.2407, "step": 1966 }, { "epoch": 0.3980975511030156, "grad_norm": 0.38548722863197327, "learning_rate": 0.00018118864269212775, "loss": 0.2543, "step": 1967 }, { "epoch": 0.39829993928354585, "grad_norm": 0.5989793539047241, "learning_rate": 0.00018117006907865298, "loss": 0.2317, "step": 1968 }, { "epoch": 0.3985023274640761, "grad_norm": 0.38723450899124146, "learning_rate": 0.0001811514872532424, "loss": 0.2418, "step": 1969 }, { "epoch": 0.39870471564460636, "grad_norm": 0.303592711687088, "learning_rate": 0.0001811328972177758, "loss": 0.2282, "step": 1970 }, { "epoch": 0.3989071038251366, "grad_norm": 1.0240917205810547, "learning_rate": 0.000181114298974134, "loss": 0.2696, "step": 1971 }, { "epoch": 0.39910949200566687, "grad_norm": 0.2980985641479492, "learning_rate": 0.0001810956925241986, "loss": 0.2568, 
"step": 1972 }, { "epoch": 0.3993118801861971, "grad_norm": 0.34745925664901733, "learning_rate": 0.0001810770778698519, "loss": 0.2424, "step": 1973 }, { "epoch": 0.3995142683667274, "grad_norm": 0.4588114321231842, "learning_rate": 0.0001810584550129772, "loss": 0.2843, "step": 1974 }, { "epoch": 0.39971665654725763, "grad_norm": 0.407896488904953, "learning_rate": 0.00018103982395545855, "loss": 0.2606, "step": 1975 }, { "epoch": 0.3999190447277879, "grad_norm": 0.41062796115875244, "learning_rate": 0.00018102118469918085, "loss": 0.2513, "step": 1976 }, { "epoch": 0.40012143290831814, "grad_norm": 0.30710867047309875, "learning_rate": 0.00018100253724602988, "loss": 0.2434, "step": 1977 }, { "epoch": 0.4003238210888484, "grad_norm": 0.3533878028392792, "learning_rate": 0.0001809838815978921, "loss": 0.2552, "step": 1978 }, { "epoch": 0.40052620926937865, "grad_norm": 0.442613810300827, "learning_rate": 0.00018096521775665494, "loss": 0.2837, "step": 1979 }, { "epoch": 0.4007285974499089, "grad_norm": 0.42794176936149597, "learning_rate": 0.0001809465457242066, "loss": 0.3078, "step": 1980 }, { "epoch": 0.40093098563043916, "grad_norm": 0.41974005103111267, "learning_rate": 0.00018092786550243613, "loss": 0.2543, "step": 1981 }, { "epoch": 0.4011333738109694, "grad_norm": 0.34940439462661743, "learning_rate": 0.00018090917709323337, "loss": 0.2334, "step": 1982 }, { "epoch": 0.4013357619914997, "grad_norm": 0.39555102586746216, "learning_rate": 0.0001808904804984891, "loss": 0.2469, "step": 1983 }, { "epoch": 0.40153815017202993, "grad_norm": 0.4463624656200409, "learning_rate": 0.00018087177572009475, "loss": 0.2559, "step": 1984 }, { "epoch": 0.4017405383525602, "grad_norm": 0.3513191342353821, "learning_rate": 0.00018085306275994272, "loss": 0.2247, "step": 1985 }, { "epoch": 0.4019429265330905, "grad_norm": 0.38531386852264404, "learning_rate": 0.00018083434161992616, "loss": 0.2478, "step": 1986 }, { "epoch": 0.40214531471362075, "grad_norm": 
0.34127408266067505, "learning_rate": 0.00018081561230193913, "loss": 0.2543, "step": 1987 }, { "epoch": 0.402347702894151, "grad_norm": 0.2705632746219635, "learning_rate": 0.00018079687480787642, "loss": 0.1989, "step": 1988 }, { "epoch": 0.40255009107468126, "grad_norm": 0.3331010043621063, "learning_rate": 0.00018077812913963373, "loss": 0.203, "step": 1989 }, { "epoch": 0.4027524792552115, "grad_norm": 1.0123331546783447, "learning_rate": 0.0001807593752991075, "loss": 0.2963, "step": 1990 }, { "epoch": 0.40295486743574177, "grad_norm": 0.46535953879356384, "learning_rate": 0.00018074061328819508, "loss": 0.2546, "step": 1991 }, { "epoch": 0.403157255616272, "grad_norm": 0.2680860459804535, "learning_rate": 0.00018072184310879462, "loss": 0.2216, "step": 1992 }, { "epoch": 0.4033596437968023, "grad_norm": 0.32052627205848694, "learning_rate": 0.00018070306476280508, "loss": 0.234, "step": 1993 }, { "epoch": 0.40356203197733254, "grad_norm": 0.37091732025146484, "learning_rate": 0.00018068427825212625, "loss": 0.2721, "step": 1994 }, { "epoch": 0.4037644201578628, "grad_norm": 0.37389910221099854, "learning_rate": 0.00018066548357865874, "loss": 0.2477, "step": 1995 }, { "epoch": 0.40396680833839305, "grad_norm": 0.3704164922237396, "learning_rate": 0.00018064668074430404, "loss": 0.2748, "step": 1996 }, { "epoch": 0.4041691965189233, "grad_norm": 0.45123517513275146, "learning_rate": 0.0001806278697509644, "loss": 0.2488, "step": 1997 }, { "epoch": 0.40437158469945356, "grad_norm": 0.3480460047721863, "learning_rate": 0.00018060905060054289, "loss": 0.2332, "step": 1998 }, { "epoch": 0.4045739728799838, "grad_norm": 0.35321128368377686, "learning_rate": 0.0001805902232949435, "loss": 0.2777, "step": 1999 }, { "epoch": 0.40477636106051407, "grad_norm": 0.45171496272087097, "learning_rate": 0.0001805713878360709, "loss": 0.2721, "step": 2000 }, { "epoch": 0.40477636106051407, "eval_loss": 0.28785374760627747, "eval_runtime": 0.7406, "eval_samples_per_second": 
6.751, "eval_steps_per_second": 1.35, "step": 2000 }, { "epoch": 0.4049787492410443, "grad_norm": 0.42943838238716125, "learning_rate": 0.00018055254422583074, "loss": 0.2848, "step": 2001 }, { "epoch": 0.4051811374215746, "grad_norm": 0.32872962951660156, "learning_rate": 0.00018053369246612936, "loss": 0.2504, "step": 2002 }, { "epoch": 0.40538352560210483, "grad_norm": 0.3315301537513733, "learning_rate": 0.00018051483255887403, "loss": 0.2573, "step": 2003 }, { "epoch": 0.4055859137826351, "grad_norm": 0.4020966589450836, "learning_rate": 0.00018049596450597278, "loss": 0.3004, "step": 2004 }, { "epoch": 0.40578830196316534, "grad_norm": 0.3283519148826599, "learning_rate": 0.00018047708830933444, "loss": 0.2762, "step": 2005 }, { "epoch": 0.4059906901436956, "grad_norm": 0.32964780926704407, "learning_rate": 0.00018045820397086875, "loss": 0.2672, "step": 2006 }, { "epoch": 0.40619307832422585, "grad_norm": 0.42414024472236633, "learning_rate": 0.00018043931149248622, "loss": 0.2627, "step": 2007 }, { "epoch": 0.4063954665047561, "grad_norm": 0.26213163137435913, "learning_rate": 0.0001804204108760982, "loss": 0.2386, "step": 2008 }, { "epoch": 0.40659785468528636, "grad_norm": 0.28232935070991516, "learning_rate": 0.00018040150212361687, "loss": 0.2458, "step": 2009 }, { "epoch": 0.4068002428658166, "grad_norm": 0.5508570075035095, "learning_rate": 0.00018038258523695518, "loss": 0.285, "step": 2010 }, { "epoch": 0.4070026310463469, "grad_norm": 0.275995135307312, "learning_rate": 0.00018036366021802693, "loss": 0.2674, "step": 2011 }, { "epoch": 0.40720501922687713, "grad_norm": 0.3155290186405182, "learning_rate": 0.00018034472706874682, "loss": 0.2601, "step": 2012 }, { "epoch": 0.4074074074074074, "grad_norm": 0.32950180768966675, "learning_rate": 0.00018032578579103029, "loss": 0.2688, "step": 2013 }, { "epoch": 0.40760979558793764, "grad_norm": 0.3493838608264923, "learning_rate": 0.00018030683638679354, "loss": 0.2314, "step": 2014 }, { "epoch": 
0.4078121837684679, "grad_norm": 0.3500867486000061, "learning_rate": 0.00018028787885795378, "loss": 0.2472, "step": 2015 }, { "epoch": 0.4080145719489982, "grad_norm": 0.528800368309021, "learning_rate": 0.00018026891320642888, "loss": 0.237, "step": 2016 }, { "epoch": 0.40821696012952846, "grad_norm": 0.5186204314231873, "learning_rate": 0.0001802499394341376, "loss": 0.2233, "step": 2017 }, { "epoch": 0.4084193483100587, "grad_norm": 0.3000638484954834, "learning_rate": 0.0001802309575429995, "loss": 0.2269, "step": 2018 }, { "epoch": 0.40862173649058897, "grad_norm": 0.33724966645240784, "learning_rate": 0.00018021196753493496, "loss": 0.2404, "step": 2019 }, { "epoch": 0.4088241246711192, "grad_norm": 0.41299405694007874, "learning_rate": 0.00018019296941186523, "loss": 0.2333, "step": 2020 }, { "epoch": 0.4090265128516495, "grad_norm": 0.33299964666366577, "learning_rate": 0.00018017396317571228, "loss": 0.2553, "step": 2021 }, { "epoch": 0.40922890103217974, "grad_norm": 0.3150463402271271, "learning_rate": 0.00018015494882839898, "loss": 0.2884, "step": 2022 }, { "epoch": 0.40943128921271, "grad_norm": 0.31459060311317444, "learning_rate": 0.00018013592637184904, "loss": 0.2183, "step": 2023 }, { "epoch": 0.40963367739324025, "grad_norm": 0.45589613914489746, "learning_rate": 0.00018011689580798695, "loss": 0.2286, "step": 2024 }, { "epoch": 0.4098360655737705, "grad_norm": 0.2886069416999817, "learning_rate": 0.00018009785713873794, "loss": 0.2158, "step": 2025 }, { "epoch": 0.41003845375430076, "grad_norm": 0.444607138633728, "learning_rate": 0.00018007881036602823, "loss": 0.2677, "step": 2026 }, { "epoch": 0.410240841934831, "grad_norm": 0.3646940290927887, "learning_rate": 0.00018005975549178476, "loss": 0.2754, "step": 2027 }, { "epoch": 0.41044323011536127, "grad_norm": 0.2616381347179413, "learning_rate": 0.00018004069251793524, "loss": 0.2218, "step": 2028 }, { "epoch": 0.4106456182958915, "grad_norm": 0.42195913195610046, "learning_rate": 
0.00018002162144640837, "loss": 0.2607, "step": 2029 }, { "epoch": 0.4108480064764218, "grad_norm": 0.3766661286354065, "learning_rate": 0.00018000254227913348, "loss": 0.2566, "step": 2030 }, { "epoch": 0.41105039465695203, "grad_norm": 0.3197172284126282, "learning_rate": 0.00017998345501804078, "loss": 0.2309, "step": 2031 }, { "epoch": 0.4112527828374823, "grad_norm": 0.29483306407928467, "learning_rate": 0.0001799643596650614, "loss": 0.2379, "step": 2032 }, { "epoch": 0.41145517101801254, "grad_norm": 0.2921574115753174, "learning_rate": 0.00017994525622212713, "loss": 0.2266, "step": 2033 }, { "epoch": 0.4116575591985428, "grad_norm": 0.2598446011543274, "learning_rate": 0.00017992614469117073, "loss": 0.2166, "step": 2034 }, { "epoch": 0.41185994737907305, "grad_norm": 0.3084995150566101, "learning_rate": 0.00017990702507412565, "loss": 0.2619, "step": 2035 }, { "epoch": 0.4120623355596033, "grad_norm": 0.4501848816871643, "learning_rate": 0.0001798878973729262, "loss": 0.265, "step": 2036 }, { "epoch": 0.41226472374013357, "grad_norm": 0.4476367235183716, "learning_rate": 0.0001798687615895076, "loss": 0.2292, "step": 2037 }, { "epoch": 0.4124671119206638, "grad_norm": 0.3379128873348236, "learning_rate": 0.00017984961772580572, "loss": 0.2448, "step": 2038 }, { "epoch": 0.4126695001011941, "grad_norm": 0.3354593813419342, "learning_rate": 0.00017983046578375737, "loss": 0.2529, "step": 2039 }, { "epoch": 0.41287188828172433, "grad_norm": 0.377542644739151, "learning_rate": 0.0001798113057653002, "loss": 0.2685, "step": 2040 }, { "epoch": 0.4130742764622546, "grad_norm": 0.3194352984428406, "learning_rate": 0.0001797921376723725, "loss": 0.2338, "step": 2041 }, { "epoch": 0.41327666464278484, "grad_norm": 0.3700343072414398, "learning_rate": 0.00017977296150691356, "loss": 0.2454, "step": 2042 }, { "epoch": 0.4134790528233151, "grad_norm": 0.5020773410797119, "learning_rate": 0.00017975377727086347, "loss": 0.2591, "step": 2043 }, { "epoch": 
0.41368144100384535, "grad_norm": 0.37990298867225647, "learning_rate": 0.000179734584966163, "loss": 0.2663, "step": 2044 }, { "epoch": 0.4138838291843756, "grad_norm": 0.3366142213344574, "learning_rate": 0.00017971538459475388, "loss": 0.2282, "step": 2045 }, { "epoch": 0.41408621736490586, "grad_norm": 0.328730970621109, "learning_rate": 0.00017969617615857858, "loss": 0.2102, "step": 2046 }, { "epoch": 0.4142886055454362, "grad_norm": 0.3981629014015198, "learning_rate": 0.00017967695965958046, "loss": 0.2718, "step": 2047 }, { "epoch": 0.41449099372596643, "grad_norm": 0.43636375665664673, "learning_rate": 0.00017965773509970355, "loss": 0.2602, "step": 2048 }, { "epoch": 0.4146933819064967, "grad_norm": 0.5631572604179382, "learning_rate": 0.00017963850248089286, "loss": 0.2629, "step": 2049 }, { "epoch": 0.41489577008702694, "grad_norm": 0.4168608486652374, "learning_rate": 0.00017961926180509415, "loss": 0.2745, "step": 2050 }, { "epoch": 0.41489577008702694, "eval_loss": 0.2666853070259094, "eval_runtime": 0.7389, "eval_samples_per_second": 6.767, "eval_steps_per_second": 1.353, "step": 2050 }, { "epoch": 0.4150981582675572, "grad_norm": 0.35379037261009216, "learning_rate": 0.00017960001307425395, "loss": 0.2712, "step": 2051 }, { "epoch": 0.41530054644808745, "grad_norm": 0.31800132989883423, "learning_rate": 0.00017958075629031966, "loss": 0.2797, "step": 2052 }, { "epoch": 0.4155029346286177, "grad_norm": 0.3469184935092926, "learning_rate": 0.00017956149145523947, "loss": 0.2749, "step": 2053 }, { "epoch": 0.41570532280914796, "grad_norm": 0.38023021817207336, "learning_rate": 0.00017954221857096242, "loss": 0.2419, "step": 2054 }, { "epoch": 0.4159077109896782, "grad_norm": 0.32849445939064026, "learning_rate": 0.0001795229376394383, "loss": 0.2509, "step": 2055 }, { "epoch": 0.41611009917020847, "grad_norm": 0.36450543999671936, "learning_rate": 0.0001795036486626178, "loss": 0.2962, "step": 2056 }, { "epoch": 0.4163124873507387, "grad_norm": 
0.3229864537715912, "learning_rate": 0.00017948435164245234, "loss": 0.2398, "step": 2057 }, { "epoch": 0.416514875531269, "grad_norm": 0.2975095510482788, "learning_rate": 0.00017946504658089422, "loss": 0.2286, "step": 2058 }, { "epoch": 0.41671726371179924, "grad_norm": 0.3617817759513855, "learning_rate": 0.00017944573347989645, "loss": 0.2797, "step": 2059 }, { "epoch": 0.4169196518923295, "grad_norm": 0.466573566198349, "learning_rate": 0.00017942641234141302, "loss": 0.2173, "step": 2060 }, { "epoch": 0.41712204007285975, "grad_norm": 0.29917478561401367, "learning_rate": 0.0001794070831673986, "loss": 0.2679, "step": 2061 }, { "epoch": 0.41732442825339, "grad_norm": 0.36867937445640564, "learning_rate": 0.00017938774595980872, "loss": 0.2855, "step": 2062 }, { "epoch": 0.41752681643392026, "grad_norm": 0.27075591683387756, "learning_rate": 0.0001793684007205997, "loss": 0.2606, "step": 2063 }, { "epoch": 0.4177292046144505, "grad_norm": 0.3169737756252289, "learning_rate": 0.00017934904745172872, "loss": 0.2165, "step": 2064 }, { "epoch": 0.41793159279498077, "grad_norm": 0.292100191116333, "learning_rate": 0.00017932968615515365, "loss": 0.2559, "step": 2065 }, { "epoch": 0.418133980975511, "grad_norm": 0.4538803696632385, "learning_rate": 0.0001793103168328334, "loss": 0.2537, "step": 2066 }, { "epoch": 0.4183363691560413, "grad_norm": 0.5876901745796204, "learning_rate": 0.00017929093948672748, "loss": 0.2414, "step": 2067 }, { "epoch": 0.41853875733657153, "grad_norm": 0.8129988312721252, "learning_rate": 0.00017927155411879628, "loss": 0.2363, "step": 2068 }, { "epoch": 0.4187411455171018, "grad_norm": 0.45293712615966797, "learning_rate": 0.00017925216073100102, "loss": 0.2158, "step": 2069 }, { "epoch": 0.41894353369763204, "grad_norm": 0.28010156750679016, "learning_rate": 0.00017923275932530373, "loss": 0.2707, "step": 2070 }, { "epoch": 0.4191459218781623, "grad_norm": 0.34166184067726135, "learning_rate": 0.00017921334990366722, "loss": 0.2437, 
"step": 2071 }, { "epoch": 0.41934831005869255, "grad_norm": 0.3896978199481964, "learning_rate": 0.00017919393246805513, "loss": 0.2076, "step": 2072 }, { "epoch": 0.4195506982392228, "grad_norm": 0.6046048402786255, "learning_rate": 0.00017917450702043195, "loss": 0.2931, "step": 2073 }, { "epoch": 0.41975308641975306, "grad_norm": 0.3625839948654175, "learning_rate": 0.0001791550735627629, "loss": 0.217, "step": 2074 }, { "epoch": 0.4199554746002833, "grad_norm": 0.37617138028144836, "learning_rate": 0.00017913563209701408, "loss": 0.2681, "step": 2075 }, { "epoch": 0.4201578627808136, "grad_norm": 0.40031054615974426, "learning_rate": 0.00017911618262515238, "loss": 0.2421, "step": 2076 }, { "epoch": 0.4203602509613439, "grad_norm": 0.40487319231033325, "learning_rate": 0.00017909672514914546, "loss": 0.2537, "step": 2077 }, { "epoch": 0.42056263914187414, "grad_norm": 0.39844515919685364, "learning_rate": 0.00017907725967096182, "loss": 0.2642, "step": 2078 }, { "epoch": 0.4207650273224044, "grad_norm": 0.3935433626174927, "learning_rate": 0.00017905778619257086, "loss": 0.2726, "step": 2079 }, { "epoch": 0.42096741550293465, "grad_norm": 0.3426590859889984, "learning_rate": 0.00017903830471594257, "loss": 0.2064, "step": 2080 }, { "epoch": 0.4211698036834649, "grad_norm": 0.3915655314922333, "learning_rate": 0.000179018815243048, "loss": 0.2312, "step": 2081 }, { "epoch": 0.42137219186399516, "grad_norm": 0.470255970954895, "learning_rate": 0.00017899931777585882, "loss": 0.2045, "step": 2082 }, { "epoch": 0.4215745800445254, "grad_norm": 0.3128467798233032, "learning_rate": 0.00017897981231634758, "loss": 0.2497, "step": 2083 }, { "epoch": 0.42177696822505567, "grad_norm": 0.31205683946609497, "learning_rate": 0.00017896029886648766, "loss": 0.23, "step": 2084 }, { "epoch": 0.4219793564055859, "grad_norm": 0.40596142411231995, "learning_rate": 0.00017894077742825325, "loss": 0.2658, "step": 2085 }, { "epoch": 0.4221817445861162, "grad_norm": 
0.2824687957763672, "learning_rate": 0.00017892124800361926, "loss": 0.232, "step": 2086 }, { "epoch": 0.42238413276664644, "grad_norm": 0.4363657236099243, "learning_rate": 0.00017890171059456155, "loss": 0.2639, "step": 2087 }, { "epoch": 0.4225865209471767, "grad_norm": 0.5147897601127625, "learning_rate": 0.0001788821652030566, "loss": 0.2583, "step": 2088 }, { "epoch": 0.42278890912770695, "grad_norm": 0.3708350658416748, "learning_rate": 0.00017886261183108193, "loss": 0.2607, "step": 2089 }, { "epoch": 0.4229912973082372, "grad_norm": 0.28474700450897217, "learning_rate": 0.00017884305048061568, "loss": 0.2268, "step": 2090 }, { "epoch": 0.42319368548876746, "grad_norm": 0.3953563868999481, "learning_rate": 0.0001788234811536369, "loss": 0.2706, "step": 2091 }, { "epoch": 0.4233960736692977, "grad_norm": 0.2877426743507385, "learning_rate": 0.00017880390385212534, "loss": 0.2428, "step": 2092 }, { "epoch": 0.42359846184982797, "grad_norm": 0.2851710319519043, "learning_rate": 0.0001787843185780617, "loss": 0.237, "step": 2093 }, { "epoch": 0.4238008500303582, "grad_norm": 0.3443615734577179, "learning_rate": 0.00017876472533342734, "loss": 0.2469, "step": 2094 }, { "epoch": 0.4240032382108885, "grad_norm": 0.2852996289730072, "learning_rate": 0.00017874512412020458, "loss": 0.2365, "step": 2095 }, { "epoch": 0.42420562639141873, "grad_norm": 0.3117428719997406, "learning_rate": 0.0001787255149403764, "loss": 0.2594, "step": 2096 }, { "epoch": 0.424408014571949, "grad_norm": 0.32744285464286804, "learning_rate": 0.0001787058977959267, "loss": 0.2664, "step": 2097 }, { "epoch": 0.42461040275247924, "grad_norm": 0.4537400007247925, "learning_rate": 0.00017868627268884007, "loss": 0.2401, "step": 2098 }, { "epoch": 0.4248127909330095, "grad_norm": 0.3057894706726074, "learning_rate": 0.00017866663962110203, "loss": 0.2438, "step": 2099 }, { "epoch": 0.42501517911353975, "grad_norm": 0.45849281549453735, "learning_rate": 0.00017864699859469887, "loss": 0.2275, 
"step": 2100 }, { "epoch": 0.42501517911353975, "eval_loss": 0.2693714201450348, "eval_runtime": 0.7384, "eval_samples_per_second": 6.771, "eval_steps_per_second": 1.354, "step": 2100 }, { "epoch": 0.42521756729407, "grad_norm": 0.45448294281959534, "learning_rate": 0.0001786273496116176, "loss": 0.2915, "step": 2101 }, { "epoch": 0.42541995547460026, "grad_norm": 0.36509305238723755, "learning_rate": 0.0001786076926738461, "loss": 0.2425, "step": 2102 }, { "epoch": 0.4256223436551305, "grad_norm": 0.33645161986351013, "learning_rate": 0.00017858802778337313, "loss": 0.2239, "step": 2103 }, { "epoch": 0.4258247318356608, "grad_norm": 0.31513741612434387, "learning_rate": 0.0001785683549421881, "loss": 0.2711, "step": 2104 }, { "epoch": 0.42602712001619103, "grad_norm": 0.3990754783153534, "learning_rate": 0.0001785486741522813, "loss": 0.2172, "step": 2105 }, { "epoch": 0.4262295081967213, "grad_norm": 0.34632688760757446, "learning_rate": 0.00017852898541564387, "loss": 0.2578, "step": 2106 }, { "epoch": 0.4264318963772516, "grad_norm": 0.3364465832710266, "learning_rate": 0.0001785092887342677, "loss": 0.2357, "step": 2107 }, { "epoch": 0.42663428455778185, "grad_norm": 0.3629503548145294, "learning_rate": 0.0001784895841101455, "loss": 0.2397, "step": 2108 }, { "epoch": 0.4268366727383121, "grad_norm": 0.34377363324165344, "learning_rate": 0.00017846987154527072, "loss": 0.2656, "step": 2109 }, { "epoch": 0.42703906091884236, "grad_norm": 0.2795635461807251, "learning_rate": 0.00017845015104163775, "loss": 0.2153, "step": 2110 }, { "epoch": 0.4272414490993726, "grad_norm": 0.3671300411224365, "learning_rate": 0.0001784304226012416, "loss": 0.2438, "step": 2111 }, { "epoch": 0.42744383727990287, "grad_norm": 0.3172593414783478, "learning_rate": 0.00017841068622607832, "loss": 0.2426, "step": 2112 }, { "epoch": 0.4276462254604331, "grad_norm": 0.39299485087394714, "learning_rate": 0.00017839094191814453, "loss": 0.2618, "step": 2113 }, { "epoch": 
0.4278486136409634, "grad_norm": 0.44122424721717834, "learning_rate": 0.00017837118967943782, "loss": 0.281, "step": 2114 }, { "epoch": 0.42805100182149364, "grad_norm": 0.2724647521972656, "learning_rate": 0.00017835142951195642, "loss": 0.25, "step": 2115 }, { "epoch": 0.4282533900020239, "grad_norm": 0.2748100161552429, "learning_rate": 0.00017833166141769958, "loss": 0.2344, "step": 2116 }, { "epoch": 0.42845577818255415, "grad_norm": 0.3923550844192505, "learning_rate": 0.00017831188539866712, "loss": 0.2644, "step": 2117 }, { "epoch": 0.4286581663630844, "grad_norm": 0.34341961145401, "learning_rate": 0.00017829210145685982, "loss": 0.2778, "step": 2118 }, { "epoch": 0.42886055454361466, "grad_norm": 0.3682166635990143, "learning_rate": 0.00017827230959427919, "loss": 0.2611, "step": 2119 }, { "epoch": 0.4290629427241449, "grad_norm": 0.3362092971801758, "learning_rate": 0.0001782525098129276, "loss": 0.2391, "step": 2120 }, { "epoch": 0.42926533090467517, "grad_norm": 0.33998823165893555, "learning_rate": 0.00017823270211480817, "loss": 0.2168, "step": 2121 }, { "epoch": 0.4294677190852054, "grad_norm": 0.30269312858581543, "learning_rate": 0.00017821288650192481, "loss": 0.2279, "step": 2122 }, { "epoch": 0.4296701072657357, "grad_norm": 0.4033576250076294, "learning_rate": 0.00017819306297628225, "loss": 0.2308, "step": 2123 }, { "epoch": 0.42987249544626593, "grad_norm": 0.2952467203140259, "learning_rate": 0.00017817323153988606, "loss": 0.2606, "step": 2124 }, { "epoch": 0.4300748836267962, "grad_norm": 0.3474556505680084, "learning_rate": 0.0001781533921947426, "loss": 0.2605, "step": 2125 }, { "epoch": 0.43027727180732644, "grad_norm": 0.30628398060798645, "learning_rate": 0.00017813354494285896, "loss": 0.2623, "step": 2126 }, { "epoch": 0.4304796599878567, "grad_norm": 0.26096710562705994, "learning_rate": 0.00017811368978624305, "loss": 0.2445, "step": 2127 }, { "epoch": 0.43068204816838696, "grad_norm": 0.32468295097351074, "learning_rate": 
0.00017809382672690367, "loss": 0.2433, "step": 2128 }, { "epoch": 0.4308844363489172, "grad_norm": 0.3501925468444824, "learning_rate": 0.00017807395576685035, "loss": 0.2197, "step": 2129 }, { "epoch": 0.43108682452944747, "grad_norm": 0.335664838552475, "learning_rate": 0.0001780540769080934, "loss": 0.2278, "step": 2130 }, { "epoch": 0.4312892127099777, "grad_norm": 0.5295902490615845, "learning_rate": 0.00017803419015264394, "loss": 0.2839, "step": 2131 }, { "epoch": 0.431491600890508, "grad_norm": 0.5748984217643738, "learning_rate": 0.00017801429550251392, "loss": 0.2706, "step": 2132 }, { "epoch": 0.43169398907103823, "grad_norm": 0.3495369553565979, "learning_rate": 0.0001779943929597161, "loss": 0.2406, "step": 2133 }, { "epoch": 0.4318963772515685, "grad_norm": 0.419474333524704, "learning_rate": 0.00017797448252626397, "loss": 0.2339, "step": 2134 }, { "epoch": 0.43209876543209874, "grad_norm": 0.3343390226364136, "learning_rate": 0.00017795456420417188, "loss": 0.2389, "step": 2135 }, { "epoch": 0.432301153612629, "grad_norm": 0.41015762090682983, "learning_rate": 0.00017793463799545495, "loss": 0.2492, "step": 2136 }, { "epoch": 0.43250354179315925, "grad_norm": 0.28410017490386963, "learning_rate": 0.0001779147039021291, "loss": 0.233, "step": 2137 }, { "epoch": 0.43270592997368956, "grad_norm": 0.3253934979438782, "learning_rate": 0.00017789476192621106, "loss": 0.2742, "step": 2138 }, { "epoch": 0.4329083181542198, "grad_norm": 0.29073867201805115, "learning_rate": 0.00017787481206971837, "loss": 0.2235, "step": 2139 }, { "epoch": 0.4331107063347501, "grad_norm": 0.31343135237693787, "learning_rate": 0.0001778548543346693, "loss": 0.2774, "step": 2140 }, { "epoch": 0.43331309451528033, "grad_norm": 0.31908923387527466, "learning_rate": 0.000177834888723083, "loss": 0.264, "step": 2141 }, { "epoch": 0.4335154826958106, "grad_norm": 0.25621846318244934, "learning_rate": 0.00017781491523697937, "loss": 0.2283, "step": 2142 }, { "epoch": 
0.43371787087634084, "grad_norm": 0.47303399443626404, "learning_rate": 0.00017779493387837914, "loss": 0.2955, "step": 2143 }, { "epoch": 0.4339202590568711, "grad_norm": 0.604739248752594, "learning_rate": 0.00017777494464930378, "loss": 0.2493, "step": 2144 }, { "epoch": 0.43412264723740135, "grad_norm": 0.31334736943244934, "learning_rate": 0.0001777549475517756, "loss": 0.2581, "step": 2145 }, { "epoch": 0.4343250354179316, "grad_norm": 0.2978392243385315, "learning_rate": 0.0001777349425878177, "loss": 0.2616, "step": 2146 }, { "epoch": 0.43452742359846186, "grad_norm": 0.39638951420783997, "learning_rate": 0.00017771492975945396, "loss": 0.2632, "step": 2147 }, { "epoch": 0.4347298117789921, "grad_norm": 0.32750117778778076, "learning_rate": 0.00017769490906870909, "loss": 0.2686, "step": 2148 }, { "epoch": 0.43493219995952237, "grad_norm": 0.3518666625022888, "learning_rate": 0.00017767488051760857, "loss": 0.2759, "step": 2149 }, { "epoch": 0.4351345881400526, "grad_norm": 0.3919273912906647, "learning_rate": 0.00017765484410817866, "loss": 0.2458, "step": 2150 }, { "epoch": 0.4351345881400526, "eval_loss": 0.27276766300201416, "eval_runtime": 0.7375, "eval_samples_per_second": 6.779, "eval_steps_per_second": 1.356, "step": 2150 }, { "epoch": 0.4353369763205829, "grad_norm": 0.312533438205719, "learning_rate": 0.00017763479984244645, "loss": 0.2375, "step": 2151 }, { "epoch": 0.43553936450111314, "grad_norm": 0.441134512424469, "learning_rate": 0.00017761474772243983, "loss": 0.2061, "step": 2152 }, { "epoch": 0.4357417526816434, "grad_norm": 0.36551475524902344, "learning_rate": 0.00017759468775018742, "loss": 0.2307, "step": 2153 }, { "epoch": 0.43594414086217365, "grad_norm": 0.35309749841690063, "learning_rate": 0.00017757461992771867, "loss": 0.2429, "step": 2154 }, { "epoch": 0.4361465290427039, "grad_norm": 0.2728305757045746, "learning_rate": 0.00017755454425706388, "loss": 0.2372, "step": 2155 }, { "epoch": 0.43634891722323416, "grad_norm": 
0.3202503025531769, "learning_rate": 0.00017753446074025408, "loss": 0.2282, "step": 2156 }, { "epoch": 0.4365513054037644, "grad_norm": 0.36079493165016174, "learning_rate": 0.00017751436937932108, "loss": 0.2268, "step": 2157 }, { "epoch": 0.43675369358429467, "grad_norm": 0.3829249441623688, "learning_rate": 0.00017749427017629756, "loss": 0.2148, "step": 2158 }, { "epoch": 0.4369560817648249, "grad_norm": 0.39769890904426575, "learning_rate": 0.0001774741631332169, "loss": 0.275, "step": 2159 }, { "epoch": 0.4371584699453552, "grad_norm": 0.3986724615097046, "learning_rate": 0.00017745404825211336, "loss": 0.2757, "step": 2160 }, { "epoch": 0.43736085812588543, "grad_norm": 0.30949562788009644, "learning_rate": 0.00017743392553502192, "loss": 0.2538, "step": 2161 }, { "epoch": 0.4375632463064157, "grad_norm": 0.2870640754699707, "learning_rate": 0.0001774137949839784, "loss": 0.2631, "step": 2162 }, { "epoch": 0.43776563448694594, "grad_norm": 0.3055521249771118, "learning_rate": 0.0001773936566010194, "loss": 0.2433, "step": 2163 }, { "epoch": 0.4379680226674762, "grad_norm": 0.2762567400932312, "learning_rate": 0.0001773735103881823, "loss": 0.261, "step": 2164 }, { "epoch": 0.43817041084800645, "grad_norm": 0.2516343891620636, "learning_rate": 0.00017735335634750532, "loss": 0.2424, "step": 2165 }, { "epoch": 0.4383727990285367, "grad_norm": 0.29465994238853455, "learning_rate": 0.0001773331944810274, "loss": 0.2296, "step": 2166 }, { "epoch": 0.43857518720906696, "grad_norm": 0.24924996495246887, "learning_rate": 0.00017731302479078828, "loss": 0.2154, "step": 2167 }, { "epoch": 0.4387775753895973, "grad_norm": 0.46507659554481506, "learning_rate": 0.00017729284727882857, "loss": 0.2635, "step": 2168 }, { "epoch": 0.43897996357012753, "grad_norm": 0.3225403428077698, "learning_rate": 0.0001772726619471896, "loss": 0.2381, "step": 2169 }, { "epoch": 0.4391823517506578, "grad_norm": 0.32947832345962524, "learning_rate": 0.0001772524687979135, "loss": 0.2426, 
"step": 2170 }, { "epoch": 0.43938473993118804, "grad_norm": 0.29286065697669983, "learning_rate": 0.0001772322678330432, "loss": 0.2385, "step": 2171 }, { "epoch": 0.4395871281117183, "grad_norm": 0.40721961855888367, "learning_rate": 0.0001772120590546224, "loss": 0.2746, "step": 2172 }, { "epoch": 0.43978951629224855, "grad_norm": 0.2583456337451935, "learning_rate": 0.0001771918424646957, "loss": 0.2177, "step": 2173 }, { "epoch": 0.4399919044727788, "grad_norm": 0.3217853903770447, "learning_rate": 0.00017717161806530833, "loss": 0.2292, "step": 2174 }, { "epoch": 0.44019429265330906, "grad_norm": 0.32177701592445374, "learning_rate": 0.00017715138585850637, "loss": 0.2568, "step": 2175 }, { "epoch": 0.4403966808338393, "grad_norm": 0.3047245740890503, "learning_rate": 0.00017713114584633674, "loss": 0.2221, "step": 2176 }, { "epoch": 0.44059906901436957, "grad_norm": 0.43895235657691956, "learning_rate": 0.00017711089803084713, "loss": 0.2433, "step": 2177 }, { "epoch": 0.4408014571948998, "grad_norm": 0.3079501688480377, "learning_rate": 0.00017709064241408593, "loss": 0.2418, "step": 2178 }, { "epoch": 0.4410038453754301, "grad_norm": 0.3755057752132416, "learning_rate": 0.00017707037899810247, "loss": 0.2585, "step": 2179 }, { "epoch": 0.44120623355596034, "grad_norm": 0.3147794306278229, "learning_rate": 0.00017705010778494673, "loss": 0.2534, "step": 2180 }, { "epoch": 0.4414086217364906, "grad_norm": 0.37766438722610474, "learning_rate": 0.00017702982877666957, "loss": 0.252, "step": 2181 }, { "epoch": 0.44161100991702085, "grad_norm": 0.36453086137771606, "learning_rate": 0.0001770095419753226, "loss": 0.258, "step": 2182 }, { "epoch": 0.4418133980975511, "grad_norm": 0.2642430067062378, "learning_rate": 0.0001769892473829582, "loss": 0.2284, "step": 2183 }, { "epoch": 0.44201578627808136, "grad_norm": 0.3428244888782501, "learning_rate": 0.00017696894500162963, "loss": 0.2241, "step": 2184 }, { "epoch": 0.4422181744586116, "grad_norm": 
0.32250645756721497, "learning_rate": 0.0001769486348333908, "loss": 0.2849, "step": 2185 }, { "epoch": 0.44242056263914187, "grad_norm": 0.7376700043678284, "learning_rate": 0.00017692831688029655, "loss": 0.2689, "step": 2186 }, { "epoch": 0.4426229508196721, "grad_norm": 0.39100563526153564, "learning_rate": 0.00017690799114440236, "loss": 0.2506, "step": 2187 }, { "epoch": 0.4428253390002024, "grad_norm": 0.3266545236110687, "learning_rate": 0.00017688765762776464, "loss": 0.2703, "step": 2188 }, { "epoch": 0.44302772718073263, "grad_norm": 0.391176700592041, "learning_rate": 0.00017686731633244045, "loss": 0.2712, "step": 2189 }, { "epoch": 0.4432301153612629, "grad_norm": 0.30557316541671753, "learning_rate": 0.00017684696726048778, "loss": 0.2413, "step": 2190 }, { "epoch": 0.44343250354179314, "grad_norm": 0.30448246002197266, "learning_rate": 0.00017682661041396532, "loss": 0.2763, "step": 2191 }, { "epoch": 0.4436348917223234, "grad_norm": 0.38532236218452454, "learning_rate": 0.00017680624579493253, "loss": 0.2951, "step": 2192 }, { "epoch": 0.44383727990285365, "grad_norm": 0.29205942153930664, "learning_rate": 0.0001767858734054497, "loss": 0.2443, "step": 2193 }, { "epoch": 0.4440396680833839, "grad_norm": 0.3226570188999176, "learning_rate": 0.00017676549324757793, "loss": 0.2426, "step": 2194 }, { "epoch": 0.44424205626391416, "grad_norm": 0.3055272400379181, "learning_rate": 0.00017674510532337905, "loss": 0.2766, "step": 2195 }, { "epoch": 0.4444444444444444, "grad_norm": 0.3346841335296631, "learning_rate": 0.00017672470963491567, "loss": 0.2532, "step": 2196 }, { "epoch": 0.4446468326249747, "grad_norm": 0.36648398637771606, "learning_rate": 0.00017670430618425123, "loss": 0.2539, "step": 2197 }, { "epoch": 0.444849220805505, "grad_norm": 0.25422319769859314, "learning_rate": 0.00017668389497344997, "loss": 0.2566, "step": 2198 }, { "epoch": 0.44505160898603524, "grad_norm": 0.3028642237186432, "learning_rate": 0.00017666347600457685, "loss": 
0.2367, "step": 2199 }, { "epoch": 0.4452539971665655, "grad_norm": 0.4553399085998535, "learning_rate": 0.0001766430492796976, "loss": 0.2821, "step": 2200 }, { "epoch": 0.4452539971665655, "eval_loss": 0.270407497882843, "eval_runtime": 0.7397, "eval_samples_per_second": 6.76, "eval_steps_per_second": 1.352, "step": 2200 }, { "epoch": 0.44545638534709575, "grad_norm": 0.28558349609375, "learning_rate": 0.00017662261480087886, "loss": 0.2549, "step": 2201 }, { "epoch": 0.445658773527626, "grad_norm": 0.32957684993743896, "learning_rate": 0.00017660217257018794, "loss": 0.2448, "step": 2202 }, { "epoch": 0.44586116170815626, "grad_norm": 0.4825969934463501, "learning_rate": 0.00017658172258969298, "loss": 0.2355, "step": 2203 }, { "epoch": 0.4460635498886865, "grad_norm": 0.2827821969985962, "learning_rate": 0.00017656126486146291, "loss": 0.2386, "step": 2204 }, { "epoch": 0.44626593806921677, "grad_norm": 0.3403480350971222, "learning_rate": 0.0001765407993875674, "loss": 0.2504, "step": 2205 }, { "epoch": 0.446468326249747, "grad_norm": 0.2827328145503998, "learning_rate": 0.00017652032617007692, "loss": 0.2382, "step": 2206 }, { "epoch": 0.4466707144302773, "grad_norm": 0.3415543735027313, "learning_rate": 0.0001764998452110628, "loss": 0.2456, "step": 2207 }, { "epoch": 0.44687310261080754, "grad_norm": 0.3274790644645691, "learning_rate": 0.000176479356512597, "loss": 0.25, "step": 2208 }, { "epoch": 0.4470754907913378, "grad_norm": 0.27415189146995544, "learning_rate": 0.0001764588600767524, "loss": 0.2204, "step": 2209 }, { "epoch": 0.44727787897186805, "grad_norm": 0.29590359330177307, "learning_rate": 0.00017643835590560266, "loss": 0.2628, "step": 2210 }, { "epoch": 0.4474802671523983, "grad_norm": 0.3007771968841553, "learning_rate": 0.00017641784400122208, "loss": 0.2488, "step": 2211 }, { "epoch": 0.44768265533292856, "grad_norm": 0.4987753927707672, "learning_rate": 0.00017639732436568588, "loss": 0.291, "step": 2212 }, { "epoch": 0.4478850435134588, 
"grad_norm": 0.36342155933380127, "learning_rate": 0.00017637679700107005, "loss": 0.2815, "step": 2213 }, { "epoch": 0.44808743169398907, "grad_norm": 0.3229970932006836, "learning_rate": 0.0001763562619094513, "loss": 0.2731, "step": 2214 }, { "epoch": 0.4482898198745193, "grad_norm": 0.32807472348213196, "learning_rate": 0.0001763357190929072, "loss": 0.2226, "step": 2215 }, { "epoch": 0.4484922080550496, "grad_norm": 0.4111528694629669, "learning_rate": 0.000176315168553516, "loss": 0.2287, "step": 2216 }, { "epoch": 0.44869459623557983, "grad_norm": 0.2823032736778259, "learning_rate": 0.0001762946102933568, "loss": 0.2528, "step": 2217 }, { "epoch": 0.4488969844161101, "grad_norm": 0.3747027814388275, "learning_rate": 0.0001762740443145095, "loss": 0.2593, "step": 2218 }, { "epoch": 0.44909937259664034, "grad_norm": 0.3702718913555145, "learning_rate": 0.00017625347061905476, "loss": 0.2749, "step": 2219 }, { "epoch": 0.4493017607771706, "grad_norm": 0.29596519470214844, "learning_rate": 0.00017623288920907393, "loss": 0.2209, "step": 2220 }, { "epoch": 0.44950414895770086, "grad_norm": 0.32838091254234314, "learning_rate": 0.0001762123000866493, "loss": 0.2444, "step": 2221 }, { "epoch": 0.4497065371382311, "grad_norm": 0.37883222103118896, "learning_rate": 0.0001761917032538638, "loss": 0.2952, "step": 2222 }, { "epoch": 0.44990892531876137, "grad_norm": 0.37547382712364197, "learning_rate": 0.00017617109871280126, "loss": 0.2471, "step": 2223 }, { "epoch": 0.4501113134992916, "grad_norm": 0.41560691595077515, "learning_rate": 0.0001761504864655462, "loss": 0.2471, "step": 2224 }, { "epoch": 0.4503137016798219, "grad_norm": 0.41966041922569275, "learning_rate": 0.00017612986651418397, "loss": 0.2409, "step": 2225 }, { "epoch": 0.45051608986035213, "grad_norm": 0.3438867926597595, "learning_rate": 0.00017610923886080064, "loss": 0.269, "step": 2226 }, { "epoch": 0.4507184780408824, "grad_norm": 0.32733553647994995, "learning_rate": 0.00017608860350748316, 
"loss": 0.29, "step": 2227 }, { "epoch": 0.4509208662214127, "grad_norm": 0.48707279562950134, "learning_rate": 0.00017606796045631918, "loss": 0.2627, "step": 2228 }, { "epoch": 0.45112325440194295, "grad_norm": 0.29957085847854614, "learning_rate": 0.0001760473097093971, "loss": 0.2679, "step": 2229 }, { "epoch": 0.4513256425824732, "grad_norm": 0.26773086190223694, "learning_rate": 0.00017602665126880616, "loss": 0.2451, "step": 2230 }, { "epoch": 0.45152803076300346, "grad_norm": 0.3124980628490448, "learning_rate": 0.00017600598513663643, "loss": 0.2856, "step": 2231 }, { "epoch": 0.4517304189435337, "grad_norm": 0.2624861001968384, "learning_rate": 0.00017598531131497863, "loss": 0.2523, "step": 2232 }, { "epoch": 0.451932807124064, "grad_norm": 0.26772695779800415, "learning_rate": 0.00017596462980592432, "loss": 0.2321, "step": 2233 }, { "epoch": 0.45213519530459423, "grad_norm": 0.3250735104084015, "learning_rate": 0.00017594394061156584, "loss": 0.2724, "step": 2234 }, { "epoch": 0.4523375834851245, "grad_norm": 0.3289940059185028, "learning_rate": 0.00017592324373399637, "loss": 0.249, "step": 2235 }, { "epoch": 0.45253997166565474, "grad_norm": 0.4136817455291748, "learning_rate": 0.00017590253917530973, "loss": 0.2712, "step": 2236 }, { "epoch": 0.452742359846185, "grad_norm": 0.31983017921447754, "learning_rate": 0.00017588182693760058, "loss": 0.2446, "step": 2237 }, { "epoch": 0.45294474802671525, "grad_norm": 0.3201993405818939, "learning_rate": 0.00017586110702296447, "loss": 0.2634, "step": 2238 }, { "epoch": 0.4531471362072455, "grad_norm": 0.32686853408813477, "learning_rate": 0.00017584037943349748, "loss": 0.2521, "step": 2239 }, { "epoch": 0.45334952438777576, "grad_norm": 0.3686739206314087, "learning_rate": 0.0001758196441712967, "loss": 0.2539, "step": 2240 }, { "epoch": 0.453551912568306, "grad_norm": 0.36773359775543213, "learning_rate": 0.00017579890123845993, "loss": 0.2652, "step": 2241 }, { "epoch": 0.45375430074883627, "grad_norm": 
0.3526698350906372, "learning_rate": 0.00017577815063708565, "loss": 0.2548, "step": 2242 }, { "epoch": 0.4539566889293665, "grad_norm": 0.301490843296051, "learning_rate": 0.0001757573923692732, "loss": 0.2734, "step": 2243 }, { "epoch": 0.4541590771098968, "grad_norm": 0.26612064242362976, "learning_rate": 0.00017573662643712276, "loss": 0.2493, "step": 2244 }, { "epoch": 0.45436146529042704, "grad_norm": 0.38326704502105713, "learning_rate": 0.0001757158528427351, "loss": 0.2493, "step": 2245 }, { "epoch": 0.4545638534709573, "grad_norm": 0.34822413325309753, "learning_rate": 0.00017569507158821197, "loss": 0.2568, "step": 2246 }, { "epoch": 0.45476624165148755, "grad_norm": 0.27359241247177124, "learning_rate": 0.0001756742826756557, "loss": 0.2329, "step": 2247 }, { "epoch": 0.4549686298320178, "grad_norm": 0.28838682174682617, "learning_rate": 0.0001756534861071696, "loss": 0.2284, "step": 2248 }, { "epoch": 0.45517101801254806, "grad_norm": 0.32817342877388, "learning_rate": 0.00017563268188485758, "loss": 0.2332, "step": 2249 }, { "epoch": 0.4553734061930783, "grad_norm": 0.2918015122413635, "learning_rate": 0.00017561187001082442, "loss": 0.262, "step": 2250 }, { "epoch": 0.4553734061930783, "eval_loss": 0.2709426283836365, "eval_runtime": 0.7401, "eval_samples_per_second": 6.756, "eval_steps_per_second": 1.351, "step": 2250 }, { "epoch": 0.45557579437360857, "grad_norm": 0.2817946970462799, "learning_rate": 0.00017559105048717562, "loss": 0.2314, "step": 2251 }, { "epoch": 0.4557781825541388, "grad_norm": 0.3302723467350006, "learning_rate": 0.0001755702233160175, "loss": 0.2695, "step": 2252 }, { "epoch": 0.4559805707346691, "grad_norm": 0.29945623874664307, "learning_rate": 0.00017554938849945716, "loss": 0.2501, "step": 2253 }, { "epoch": 0.45618295891519933, "grad_norm": 0.2748788297176361, "learning_rate": 0.0001755285460396024, "loss": 0.2396, "step": 2254 }, { "epoch": 0.4563853470957296, "grad_norm": 0.2754972577095032, "learning_rate": 
0.0001755076959385619, "loss": 0.2274, "step": 2255 }, { "epoch": 0.45658773527625984, "grad_norm": 0.34508219361305237, "learning_rate": 0.00017548683819844496, "loss": 0.2576, "step": 2256 }, { "epoch": 0.4567901234567901, "grad_norm": 0.29633739590644836, "learning_rate": 0.00017546597282136185, "loss": 0.2708, "step": 2257 }, { "epoch": 0.45699251163732035, "grad_norm": 0.27955976128578186, "learning_rate": 0.00017544509980942346, "loss": 0.2663, "step": 2258 }, { "epoch": 0.45719489981785066, "grad_norm": 0.31975439190864563, "learning_rate": 0.0001754242191647415, "loss": 0.2182, "step": 2259 }, { "epoch": 0.4573972879983809, "grad_norm": 0.3412756323814392, "learning_rate": 0.00017540333088942846, "loss": 0.2395, "step": 2260 }, { "epoch": 0.4575996761789112, "grad_norm": 0.2998964786529541, "learning_rate": 0.00017538243498559759, "loss": 0.2522, "step": 2261 }, { "epoch": 0.45780206435944143, "grad_norm": 0.665895938873291, "learning_rate": 0.00017536153145536294, "loss": 0.2933, "step": 2262 }, { "epoch": 0.4580044525399717, "grad_norm": 0.3041466176509857, "learning_rate": 0.0001753406203008393, "loss": 0.2466, "step": 2263 }, { "epoch": 0.45820684072050194, "grad_norm": 0.5185272097587585, "learning_rate": 0.00017531970152414222, "loss": 0.2446, "step": 2264 }, { "epoch": 0.4584092289010322, "grad_norm": 0.2728477120399475, "learning_rate": 0.00017529877512738806, "loss": 0.2084, "step": 2265 }, { "epoch": 0.45861161708156245, "grad_norm": 0.3017343282699585, "learning_rate": 0.00017527784111269395, "loss": 0.276, "step": 2266 }, { "epoch": 0.4588140052620927, "grad_norm": 0.32620134949684143, "learning_rate": 0.00017525689948217775, "loss": 0.22, "step": 2267 }, { "epoch": 0.45901639344262296, "grad_norm": 0.34241124987602234, "learning_rate": 0.00017523595023795813, "loss": 0.2587, "step": 2268 }, { "epoch": 0.4592187816231532, "grad_norm": 0.4101926386356354, "learning_rate": 0.00017521499338215454, "loss": 0.2947, "step": 2269 }, { "epoch": 
0.45942116980368347, "grad_norm": 0.5489742755889893, "learning_rate": 0.00017519402891688708, "loss": 0.2311, "step": 2270 }, { "epoch": 0.4596235579842137, "grad_norm": 0.3954886794090271, "learning_rate": 0.00017517305684427677, "loss": 0.2739, "step": 2271 }, { "epoch": 0.459825946164744, "grad_norm": 0.3286878764629364, "learning_rate": 0.00017515207716644539, "loss": 0.2892, "step": 2272 }, { "epoch": 0.46002833434527424, "grad_norm": 0.36722058057785034, "learning_rate": 0.0001751310898855154, "loss": 0.2398, "step": 2273 }, { "epoch": 0.4602307225258045, "grad_norm": 0.30072125792503357, "learning_rate": 0.0001751100950036101, "loss": 0.2548, "step": 2274 }, { "epoch": 0.46043311070633475, "grad_norm": 0.30294889211654663, "learning_rate": 0.0001750890925228535, "loss": 0.268, "step": 2275 }, { "epoch": 0.460635498886865, "grad_norm": 0.28848111629486084, "learning_rate": 0.00017506808244537037, "loss": 0.2599, "step": 2276 }, { "epoch": 0.46083788706739526, "grad_norm": 0.31934136152267456, "learning_rate": 0.00017504706477328635, "loss": 0.2682, "step": 2277 }, { "epoch": 0.4610402752479255, "grad_norm": 0.29640549421310425, "learning_rate": 0.0001750260395087278, "loss": 0.2309, "step": 2278 }, { "epoch": 0.46124266342845577, "grad_norm": 0.37822410464286804, "learning_rate": 0.0001750050066538218, "loss": 0.2387, "step": 2279 }, { "epoch": 0.461445051608986, "grad_norm": 0.31998032331466675, "learning_rate": 0.00017498396621069625, "loss": 0.2472, "step": 2280 }, { "epoch": 0.4616474397895163, "grad_norm": 0.4722261428833008, "learning_rate": 0.00017496291818147982, "loss": 0.251, "step": 2281 }, { "epoch": 0.46184982797004653, "grad_norm": 0.36852967739105225, "learning_rate": 0.00017494186256830188, "loss": 0.2541, "step": 2282 }, { "epoch": 0.4620522161505768, "grad_norm": 0.2886607348918915, "learning_rate": 0.00017492079937329264, "loss": 0.2552, "step": 2283 }, { "epoch": 0.46225460433110704, "grad_norm": 0.3052384555339813, "learning_rate": 
0.00017489972859858306, "loss": 0.2354, "step": 2284 }, { "epoch": 0.4624569925116373, "grad_norm": 0.29837775230407715, "learning_rate": 0.00017487865024630485, "loss": 0.238, "step": 2285 }, { "epoch": 0.46265938069216755, "grad_norm": 0.30569151043891907, "learning_rate": 0.0001748575643185905, "loss": 0.2933, "step": 2286 }, { "epoch": 0.4628617688726978, "grad_norm": 0.3457178771495819, "learning_rate": 0.0001748364708175733, "loss": 0.2378, "step": 2287 }, { "epoch": 0.46306415705322806, "grad_norm": 0.2781577706336975, "learning_rate": 0.00017481536974538718, "loss": 0.2398, "step": 2288 }, { "epoch": 0.4632665452337584, "grad_norm": 0.3738473951816559, "learning_rate": 0.000174794261104167, "loss": 0.2376, "step": 2289 }, { "epoch": 0.46346893341428863, "grad_norm": 0.3048925995826721, "learning_rate": 0.0001747731448960483, "loss": 0.2325, "step": 2290 }, { "epoch": 0.4636713215948189, "grad_norm": 0.27175387740135193, "learning_rate": 0.00017475202112316737, "loss": 0.2007, "step": 2291 }, { "epoch": 0.46387370977534914, "grad_norm": 0.26002347469329834, "learning_rate": 0.0001747308897876613, "loss": 0.227, "step": 2292 }, { "epoch": 0.4640760979558794, "grad_norm": 0.33025866746902466, "learning_rate": 0.00017470975089166793, "loss": 0.2397, "step": 2293 }, { "epoch": 0.46427848613640965, "grad_norm": 0.4511990249156952, "learning_rate": 0.00017468860443732592, "loss": 0.2607, "step": 2294 }, { "epoch": 0.4644808743169399, "grad_norm": 0.3069620132446289, "learning_rate": 0.0001746674504267746, "loss": 0.2274, "step": 2295 }, { "epoch": 0.46468326249747016, "grad_norm": 0.3227700889110565, "learning_rate": 0.00017464628886215415, "loss": 0.258, "step": 2296 }, { "epoch": 0.4648856506780004, "grad_norm": 0.3729799687862396, "learning_rate": 0.00017462511974560542, "loss": 0.2519, "step": 2297 }, { "epoch": 0.4650880388585307, "grad_norm": 0.3248327076435089, "learning_rate": 0.0001746039430792701, "loss": 0.2428, "step": 2298 }, { "epoch": 
0.4652904270390609, "grad_norm": 0.34883835911750793, "learning_rate": 0.00017458275886529062, "loss": 0.2611, "step": 2299 }, { "epoch": 0.4654928152195912, "grad_norm": 0.3219202756881714, "learning_rate": 0.0001745615671058102, "loss": 0.2113, "step": 2300 }, { "epoch": 0.4654928152195912, "eval_loss": 0.2663731575012207, "eval_runtime": 0.7416, "eval_samples_per_second": 6.742, "eval_steps_per_second": 1.348, "step": 2300 }, { "epoch": 0.46569520340012144, "grad_norm": 0.34709736704826355, "learning_rate": 0.0001745403678029728, "loss": 0.213, "step": 2301 }, { "epoch": 0.4658975915806517, "grad_norm": 0.30065563321113586, "learning_rate": 0.00017451916095892312, "loss": 0.2546, "step": 2302 }, { "epoch": 0.46609997976118195, "grad_norm": 0.35152703523635864, "learning_rate": 0.00017449794657580664, "loss": 0.2203, "step": 2303 }, { "epoch": 0.4663023679417122, "grad_norm": 0.32369470596313477, "learning_rate": 0.00017447672465576965, "loss": 0.2377, "step": 2304 }, { "epoch": 0.46650475612224246, "grad_norm": 0.2950402796268463, "learning_rate": 0.0001744554952009591, "loss": 0.2326, "step": 2305 }, { "epoch": 0.4667071443027727, "grad_norm": 0.4171277582645416, "learning_rate": 0.0001744342582135228, "loss": 0.264, "step": 2306 }, { "epoch": 0.46690953248330297, "grad_norm": 0.27577680349349976, "learning_rate": 0.00017441301369560934, "loss": 0.2414, "step": 2307 }, { "epoch": 0.4671119206638332, "grad_norm": 0.4021974205970764, "learning_rate": 0.0001743917616493679, "loss": 0.27, "step": 2308 }, { "epoch": 0.4673143088443635, "grad_norm": 0.32153424620628357, "learning_rate": 0.00017437050207694865, "loss": 0.2423, "step": 2309 }, { "epoch": 0.46751669702489373, "grad_norm": 0.29214033484458923, "learning_rate": 0.00017434923498050233, "loss": 0.2429, "step": 2310 }, { "epoch": 0.467719085205424, "grad_norm": 0.29247456789016724, "learning_rate": 0.00017432796036218054, "loss": 0.1918, "step": 2311 }, { "epoch": 0.46792147338595425, "grad_norm": 
0.332529217004776, "learning_rate": 0.00017430667822413567, "loss": 0.2558, "step": 2312 }, { "epoch": 0.4681238615664845, "grad_norm": 0.2968290150165558, "learning_rate": 0.00017428538856852077, "loss": 0.2213, "step": 2313 }, { "epoch": 0.46832624974701476, "grad_norm": 0.48056352138519287, "learning_rate": 0.0001742640913974897, "loss": 0.1798, "step": 2314 }, { "epoch": 0.468528637927545, "grad_norm": 0.40288710594177246, "learning_rate": 0.00017424278671319713, "loss": 0.2906, "step": 2315 }, { "epoch": 0.46873102610807527, "grad_norm": 0.37656524777412415, "learning_rate": 0.00017422147451779844, "loss": 0.2925, "step": 2316 }, { "epoch": 0.4689334142886055, "grad_norm": 0.30084243416786194, "learning_rate": 0.00017420015481344972, "loss": 0.2435, "step": 2317 }, { "epoch": 0.4691358024691358, "grad_norm": 0.25637879967689514, "learning_rate": 0.00017417882760230793, "loss": 0.218, "step": 2318 }, { "epoch": 0.4693381906496661, "grad_norm": 0.3462231159210205, "learning_rate": 0.00017415749288653072, "loss": 0.2463, "step": 2319 }, { "epoch": 0.46954057883019634, "grad_norm": 0.31451940536499023, "learning_rate": 0.0001741361506682765, "loss": 0.2761, "step": 2320 }, { "epoch": 0.4697429670107266, "grad_norm": 0.468211829662323, "learning_rate": 0.00017411480094970444, "loss": 0.2239, "step": 2321 }, { "epoch": 0.46994535519125685, "grad_norm": 0.3163670599460602, "learning_rate": 0.00017409344373297452, "loss": 0.2576, "step": 2322 }, { "epoch": 0.4701477433717871, "grad_norm": 0.2964145839214325, "learning_rate": 0.00017407207902024737, "loss": 0.2495, "step": 2323 }, { "epoch": 0.47035013155231736, "grad_norm": 0.3863260746002197, "learning_rate": 0.00017405070681368457, "loss": 0.2539, "step": 2324 }, { "epoch": 0.4705525197328476, "grad_norm": 0.30294448137283325, "learning_rate": 0.0001740293271154482, "loss": 0.2599, "step": 2325 }, { "epoch": 0.4707549079133779, "grad_norm": 0.3313601016998291, "learning_rate": 0.0001740079399277013, "loss": 0.258, 
"step": 2326 }, { "epoch": 0.47095729609390813, "grad_norm": 0.3621000051498413, "learning_rate": 0.00017398654525260763, "loss": 0.238, "step": 2327 }, { "epoch": 0.4711596842744384, "grad_norm": 0.36232197284698486, "learning_rate": 0.0001739651430923316, "loss": 0.2701, "step": 2328 }, { "epoch": 0.47136207245496864, "grad_norm": 0.34678053855895996, "learning_rate": 0.00017394373344903853, "loss": 0.2305, "step": 2329 }, { "epoch": 0.4715644606354989, "grad_norm": 0.29713359475135803, "learning_rate": 0.00017392231632489439, "loss": 0.2345, "step": 2330 }, { "epoch": 0.47176684881602915, "grad_norm": 0.3039023280143738, "learning_rate": 0.00017390089172206592, "loss": 0.2383, "step": 2331 }, { "epoch": 0.4719692369965594, "grad_norm": 0.264533668756485, "learning_rate": 0.0001738794596427207, "loss": 0.2304, "step": 2332 }, { "epoch": 0.47217162517708966, "grad_norm": 0.272783100605011, "learning_rate": 0.00017385802008902692, "loss": 0.2483, "step": 2333 }, { "epoch": 0.4723740133576199, "grad_norm": 0.28291958570480347, "learning_rate": 0.00017383657306315367, "loss": 0.2146, "step": 2334 }, { "epoch": 0.47257640153815017, "grad_norm": 0.3644791841506958, "learning_rate": 0.00017381511856727068, "loss": 0.2394, "step": 2335 }, { "epoch": 0.4727787897186804, "grad_norm": 0.35561367869377136, "learning_rate": 0.00017379365660354857, "loss": 0.2201, "step": 2336 }, { "epoch": 0.4729811778992107, "grad_norm": 0.30612918734550476, "learning_rate": 0.00017377218717415857, "loss": 0.228, "step": 2337 }, { "epoch": 0.47318356607974094, "grad_norm": 0.4196929931640625, "learning_rate": 0.00017375071028127276, "loss": 0.2628, "step": 2338 }, { "epoch": 0.4733859542602712, "grad_norm": 0.3164200782775879, "learning_rate": 0.00017372922592706397, "loss": 0.2569, "step": 2339 }, { "epoch": 0.47358834244080145, "grad_norm": 0.3175007700920105, "learning_rate": 0.00017370773411370572, "loss": 0.2318, "step": 2340 }, { "epoch": 0.4737907306213317, "grad_norm": 
0.3435089588165283, "learning_rate": 0.00017368623484337233, "loss": 0.2441, "step": 2341 }, { "epoch": 0.47399311880186196, "grad_norm": 0.2463127225637436, "learning_rate": 0.00017366472811823888, "loss": 0.2352, "step": 2342 }, { "epoch": 0.4741955069823922, "grad_norm": 0.4272097051143646, "learning_rate": 0.00017364321394048118, "loss": 0.2282, "step": 2343 }, { "epoch": 0.47439789516292247, "grad_norm": 0.45957881212234497, "learning_rate": 0.0001736216923122758, "loss": 0.2859, "step": 2344 }, { "epoch": 0.4746002833434527, "grad_norm": 0.36976000666618347, "learning_rate": 0.00017360016323580014, "loss": 0.1975, "step": 2345 }, { "epoch": 0.474802671523983, "grad_norm": 0.287800669670105, "learning_rate": 0.00017357862671323225, "loss": 0.2337, "step": 2346 }, { "epoch": 0.47500505970451323, "grad_norm": 0.36222925782203674, "learning_rate": 0.00017355708274675093, "loss": 0.2198, "step": 2347 }, { "epoch": 0.4752074478850435, "grad_norm": 0.3630425035953522, "learning_rate": 0.00017353553133853583, "loss": 0.246, "step": 2348 }, { "epoch": 0.47540983606557374, "grad_norm": 0.35259371995925903, "learning_rate": 0.00017351397249076725, "loss": 0.2544, "step": 2349 }, { "epoch": 0.47561222424610405, "grad_norm": 0.43726104497909546, "learning_rate": 0.00017349240620562632, "loss": 0.1957, "step": 2350 }, { "epoch": 0.47561222424610405, "eval_loss": 0.27133333683013916, "eval_runtime": 0.7399, "eval_samples_per_second": 6.757, "eval_steps_per_second": 1.351, "step": 2350 }, { "epoch": 0.4758146124266343, "grad_norm": 0.3979310691356659, "learning_rate": 0.00017347083248529484, "loss": 0.2692, "step": 2351 }, { "epoch": 0.47601700060716456, "grad_norm": 0.3271339237689972, "learning_rate": 0.00017344925133195552, "loss": 0.2406, "step": 2352 }, { "epoch": 0.4762193887876948, "grad_norm": 0.2918234169483185, "learning_rate": 0.00017342766274779157, "loss": 0.212, "step": 2353 }, { "epoch": 0.4764217769682251, "grad_norm": 0.28919458389282227, "learning_rate": 
0.00017340606673498722, "loss": 0.2696, "step": 2354 }, { "epoch": 0.47662416514875533, "grad_norm": 0.3120443522930145, "learning_rate": 0.00017338446329572723, "loss": 0.228, "step": 2355 }, { "epoch": 0.4768265533292856, "grad_norm": 0.28186094760894775, "learning_rate": 0.00017336285243219732, "loss": 0.2372, "step": 2356 }, { "epoch": 0.47702894150981584, "grad_norm": 0.29053986072540283, "learning_rate": 0.00017334123414658377, "loss": 0.2802, "step": 2357 }, { "epoch": 0.4772313296903461, "grad_norm": 0.36644598841667175, "learning_rate": 0.00017331960844107369, "loss": 0.2704, "step": 2358 }, { "epoch": 0.47743371787087635, "grad_norm": 0.37110552191734314, "learning_rate": 0.00017329797531785495, "loss": 0.2791, "step": 2359 }, { "epoch": 0.4776361060514066, "grad_norm": 0.2938483953475952, "learning_rate": 0.0001732763347791162, "loss": 0.2663, "step": 2360 }, { "epoch": 0.47783849423193686, "grad_norm": 0.27444276213645935, "learning_rate": 0.00017325468682704678, "loss": 0.2137, "step": 2361 }, { "epoch": 0.4780408824124671, "grad_norm": 0.30051189661026, "learning_rate": 0.0001732330314638368, "loss": 0.2213, "step": 2362 }, { "epoch": 0.47824327059299737, "grad_norm": 0.38845130801200867, "learning_rate": 0.00017321136869167712, "loss": 0.2404, "step": 2363 }, { "epoch": 0.4784456587735276, "grad_norm": 0.30238163471221924, "learning_rate": 0.00017318969851275935, "loss": 0.2617, "step": 2364 }, { "epoch": 0.4786480469540579, "grad_norm": 0.2759285867214203, "learning_rate": 0.00017316802092927586, "loss": 0.2482, "step": 2365 }, { "epoch": 0.47885043513458814, "grad_norm": 0.27550262212753296, "learning_rate": 0.00017314633594341973, "loss": 0.2125, "step": 2366 }, { "epoch": 0.4790528233151184, "grad_norm": 0.33247604966163635, "learning_rate": 0.00017312464355738488, "loss": 0.2808, "step": 2367 }, { "epoch": 0.47925521149564865, "grad_norm": 0.3416070342063904, "learning_rate": 0.00017310294377336587, "loss": 0.2645, "step": 2368 }, { "epoch": 
0.4794575996761789, "grad_norm": 0.3552078604698181, "learning_rate": 0.00017308123659355804, "loss": 0.28, "step": 2369 }, { "epoch": 0.47965998785670916, "grad_norm": 0.2823755443096161, "learning_rate": 0.00017305952202015755, "loss": 0.2343, "step": 2370 }, { "epoch": 0.4798623760372394, "grad_norm": 0.31195518374443054, "learning_rate": 0.00017303780005536123, "loss": 0.2446, "step": 2371 }, { "epoch": 0.48006476421776967, "grad_norm": 0.3804989755153656, "learning_rate": 0.0001730160707013667, "loss": 0.274, "step": 2372 }, { "epoch": 0.4802671523982999, "grad_norm": 0.2393997609615326, "learning_rate": 0.00017299433396037223, "loss": 0.22, "step": 2373 }, { "epoch": 0.4804695405788302, "grad_norm": 0.3902396261692047, "learning_rate": 0.000172972589834577, "loss": 0.2347, "step": 2374 }, { "epoch": 0.48067192875936043, "grad_norm": 0.3158196210861206, "learning_rate": 0.00017295083832618083, "loss": 0.2477, "step": 2375 }, { "epoch": 0.4808743169398907, "grad_norm": 0.29111531376838684, "learning_rate": 0.0001729290794373843, "loss": 0.2271, "step": 2376 }, { "epoch": 0.48107670512042094, "grad_norm": 0.28990867733955383, "learning_rate": 0.00017290731317038874, "loss": 0.24, "step": 2377 }, { "epoch": 0.4812790933009512, "grad_norm": 0.3305584788322449, "learning_rate": 0.00017288553952739627, "loss": 0.2249, "step": 2378 }, { "epoch": 0.48148148148148145, "grad_norm": 0.5030553340911865, "learning_rate": 0.00017286375851060964, "loss": 0.2334, "step": 2379 }, { "epoch": 0.48168386966201177, "grad_norm": 0.267829567193985, "learning_rate": 0.0001728419701222325, "loss": 0.2448, "step": 2380 }, { "epoch": 0.481886257842542, "grad_norm": 0.33796215057373047, "learning_rate": 0.00017282017436446917, "loss": 0.2248, "step": 2381 }, { "epoch": 0.4820886460230723, "grad_norm": 0.29112884402275085, "learning_rate": 0.00017279837123952466, "loss": 0.2443, "step": 2382 }, { "epoch": 0.48229103420360253, "grad_norm": 0.27587100863456726, "learning_rate": 
0.0001727765607496048, "loss": 0.2304, "step": 2383 }, { "epoch": 0.4824934223841328, "grad_norm": 0.4192744493484497, "learning_rate": 0.0001727547428969162, "loss": 0.2686, "step": 2384 }, { "epoch": 0.48269581056466304, "grad_norm": 0.3476436138153076, "learning_rate": 0.0001727329176836661, "loss": 0.2703, "step": 2385 }, { "epoch": 0.4828981987451933, "grad_norm": 0.28923192620277405, "learning_rate": 0.0001727110851120626, "loss": 0.2387, "step": 2386 }, { "epoch": 0.48310058692572355, "grad_norm": 0.31386253237724304, "learning_rate": 0.00017268924518431438, "loss": 0.2524, "step": 2387 }, { "epoch": 0.4833029751062538, "grad_norm": 0.28253522515296936, "learning_rate": 0.0001726673979026311, "loss": 0.2235, "step": 2388 }, { "epoch": 0.48350536328678406, "grad_norm": 0.3415163457393646, "learning_rate": 0.00017264554326922298, "loss": 0.2787, "step": 2389 }, { "epoch": 0.4837077514673143, "grad_norm": 0.3251768946647644, "learning_rate": 0.00017262368128630106, "loss": 0.2546, "step": 2390 }, { "epoch": 0.4839101396478446, "grad_norm": 0.3223573863506317, "learning_rate": 0.0001726018119560771, "loss": 0.2708, "step": 2391 }, { "epoch": 0.48411252782837483, "grad_norm": 0.30109891295433044, "learning_rate": 0.0001725799352807636, "loss": 0.2552, "step": 2392 }, { "epoch": 0.4843149160089051, "grad_norm": 0.45630934834480286, "learning_rate": 0.00017255805126257384, "loss": 0.2713, "step": 2393 }, { "epoch": 0.48451730418943534, "grad_norm": 0.3417312800884247, "learning_rate": 0.00017253615990372176, "loss": 0.294, "step": 2394 }, { "epoch": 0.4847196923699656, "grad_norm": 0.3401585519313812, "learning_rate": 0.00017251426120642216, "loss": 0.2577, "step": 2395 }, { "epoch": 0.48492208055049585, "grad_norm": 0.32333528995513916, "learning_rate": 0.0001724923551728905, "loss": 0.2418, "step": 2396 }, { "epoch": 0.4851244687310261, "grad_norm": 0.24791204929351807, "learning_rate": 0.000172470441805343, "loss": 0.2308, "step": 2397 }, { "epoch": 
0.48532685691155636, "grad_norm": 0.26590603590011597, "learning_rate": 0.0001724485211059966, "loss": 0.2324, "step": 2398 }, { "epoch": 0.4855292450920866, "grad_norm": 0.29200440645217896, "learning_rate": 0.00017242659307706903, "loss": 0.238, "step": 2399 }, { "epoch": 0.48573163327261687, "grad_norm": 0.5061826109886169, "learning_rate": 0.00017240465772077877, "loss": 0.2549, "step": 2400 }, { "epoch": 0.48573163327261687, "eval_loss": 0.26132091879844666, "eval_runtime": 0.7361, "eval_samples_per_second": 6.793, "eval_steps_per_second": 1.359, "step": 2400 }, { "epoch": 0.4859340214531471, "grad_norm": 0.3257822096347809, "learning_rate": 0.00017238271503934493, "loss": 0.2536, "step": 2401 }, { "epoch": 0.4861364096336774, "grad_norm": 0.30192455649375916, "learning_rate": 0.00017236076503498752, "loss": 0.2279, "step": 2402 }, { "epoch": 0.48633879781420764, "grad_norm": 0.3379042446613312, "learning_rate": 0.00017233880770992717, "loss": 0.27, "step": 2403 }, { "epoch": 0.4865411859947379, "grad_norm": 0.3168107867240906, "learning_rate": 0.00017231684306638528, "loss": 0.2254, "step": 2404 }, { "epoch": 0.48674357417526815, "grad_norm": 0.36106303334236145, "learning_rate": 0.00017229487110658403, "loss": 0.2457, "step": 2405 }, { "epoch": 0.4869459623557984, "grad_norm": 0.26096561551094055, "learning_rate": 0.0001722728918327463, "loss": 0.2176, "step": 2406 }, { "epoch": 0.48714835053632866, "grad_norm": 0.4154165983200073, "learning_rate": 0.00017225090524709575, "loss": 0.2241, "step": 2407 }, { "epoch": 0.4873507387168589, "grad_norm": 0.32560572028160095, "learning_rate": 0.0001722289113518567, "loss": 0.2302, "step": 2408 }, { "epoch": 0.48755312689738917, "grad_norm": 0.3593572676181793, "learning_rate": 0.00017220691014925427, "loss": 0.2349, "step": 2409 }, { "epoch": 0.4877555150779195, "grad_norm": 0.4322805106639862, "learning_rate": 0.00017218490164151438, "loss": 0.2485, "step": 2410 }, { "epoch": 0.48795790325844973, "grad_norm": 
0.5639968514442444, "learning_rate": 0.00017216288583086353, "loss": 0.2509, "step": 2411 }, { "epoch": 0.48816029143898, "grad_norm": 0.31465572118759155, "learning_rate": 0.0001721408627195291, "loss": 0.2316, "step": 2412 }, { "epoch": 0.48836267961951024, "grad_norm": 0.29703637957572937, "learning_rate": 0.00017211883230973916, "loss": 0.2652, "step": 2413 }, { "epoch": 0.4885650678000405, "grad_norm": 0.3234601616859436, "learning_rate": 0.0001720967946037225, "loss": 0.2635, "step": 2414 }, { "epoch": 0.48876745598057075, "grad_norm": 0.3324073553085327, "learning_rate": 0.00017207474960370865, "loss": 0.233, "step": 2415 }, { "epoch": 0.488969844161101, "grad_norm": 0.3848069906234741, "learning_rate": 0.0001720526973119279, "loss": 0.325, "step": 2416 }, { "epoch": 0.48917223234163126, "grad_norm": 0.36358702182769775, "learning_rate": 0.0001720306377306113, "loss": 0.2692, "step": 2417 }, { "epoch": 0.4893746205221615, "grad_norm": 0.2500711679458618, "learning_rate": 0.00017200857086199057, "loss": 0.2353, "step": 2418 }, { "epoch": 0.4895770087026918, "grad_norm": 0.24969623982906342, "learning_rate": 0.0001719864967082982, "loss": 0.2451, "step": 2419 }, { "epoch": 0.48977939688322203, "grad_norm": 0.2897688150405884, "learning_rate": 0.00017196441527176748, "loss": 0.2484, "step": 2420 }, { "epoch": 0.4899817850637523, "grad_norm": 0.32977643609046936, "learning_rate": 0.0001719423265546323, "loss": 0.2536, "step": 2421 }, { "epoch": 0.49018417324428254, "grad_norm": 0.286429226398468, "learning_rate": 0.00017192023055912742, "loss": 0.2234, "step": 2422 }, { "epoch": 0.4903865614248128, "grad_norm": 0.31992316246032715, "learning_rate": 0.00017189812728748828, "loss": 0.2045, "step": 2423 }, { "epoch": 0.49058894960534305, "grad_norm": 0.26441943645477295, "learning_rate": 0.00017187601674195098, "loss": 0.1897, "step": 2424 }, { "epoch": 0.4907913377858733, "grad_norm": 0.27432650327682495, "learning_rate": 0.00017185389892475256, "loss": 0.2453, 
"step": 2425 }, { "epoch": 0.49099372596640356, "grad_norm": 0.4113869071006775, "learning_rate": 0.0001718317738381306, "loss": 0.2395, "step": 2426 }, { "epoch": 0.4911961141469338, "grad_norm": 0.40150976181030273, "learning_rate": 0.0001718096414843234, "loss": 0.2889, "step": 2427 }, { "epoch": 0.49139850232746407, "grad_norm": 0.37498939037323, "learning_rate": 0.00017178750186557025, "loss": 0.2515, "step": 2428 }, { "epoch": 0.4916008905079943, "grad_norm": 0.3221639096736908, "learning_rate": 0.0001717653549841109, "loss": 0.2316, "step": 2429 }, { "epoch": 0.4918032786885246, "grad_norm": 0.3239342272281647, "learning_rate": 0.00017174320084218593, "loss": 0.294, "step": 2430 }, { "epoch": 0.49200566686905484, "grad_norm": 0.3614806532859802, "learning_rate": 0.00017172103944203672, "loss": 0.2425, "step": 2431 }, { "epoch": 0.4922080550495851, "grad_norm": 0.3906191885471344, "learning_rate": 0.0001716988707859053, "loss": 0.2704, "step": 2432 }, { "epoch": 0.49241044323011535, "grad_norm": 0.35471323132514954, "learning_rate": 0.00017167669487603443, "loss": 0.256, "step": 2433 }, { "epoch": 0.4926128314106456, "grad_norm": 0.34735792875289917, "learning_rate": 0.0001716545117146677, "loss": 0.2289, "step": 2434 }, { "epoch": 0.49281521959117586, "grad_norm": 0.338459849357605, "learning_rate": 0.00017163232130404932, "loss": 0.257, "step": 2435 }, { "epoch": 0.4930176077717061, "grad_norm": 0.29631951451301575, "learning_rate": 0.0001716101236464243, "loss": 0.2378, "step": 2436 }, { "epoch": 0.49321999595223637, "grad_norm": 0.3412487506866455, "learning_rate": 0.0001715879187440384, "loss": 0.2671, "step": 2437 }, { "epoch": 0.4934223841327666, "grad_norm": 0.30353328585624695, "learning_rate": 0.000171565706599138, "loss": 0.2573, "step": 2438 }, { "epoch": 0.4936247723132969, "grad_norm": 0.3252297043800354, "learning_rate": 0.00017154348721397033, "loss": 0.2481, "step": 2439 }, { "epoch": 0.49382716049382713, "grad_norm": 0.28343456983566284, 
"learning_rate": 0.00017152126059078335, "loss": 0.2594, "step": 2440 }, { "epoch": 0.49402954867435744, "grad_norm": 0.32058560848236084, "learning_rate": 0.0001714990267318257, "loss": 0.2391, "step": 2441 }, { "epoch": 0.4942319368548877, "grad_norm": 0.5241413116455078, "learning_rate": 0.00017147678563934676, "loss": 0.251, "step": 2442 }, { "epoch": 0.49443432503541795, "grad_norm": 0.26299574971199036, "learning_rate": 0.00017145453731559659, "loss": 0.2592, "step": 2443 }, { "epoch": 0.4946367132159482, "grad_norm": 0.532707691192627, "learning_rate": 0.00017143228176282613, "loss": 0.2871, "step": 2444 }, { "epoch": 0.49483910139647846, "grad_norm": 0.34572750329971313, "learning_rate": 0.00017141001898328693, "loss": 0.2461, "step": 2445 }, { "epoch": 0.4950414895770087, "grad_norm": 0.28672656416893005, "learning_rate": 0.00017138774897923131, "loss": 0.2246, "step": 2446 }, { "epoch": 0.495243877757539, "grad_norm": 0.3325256407260895, "learning_rate": 0.00017136547175291233, "loss": 0.219, "step": 2447 }, { "epoch": 0.49544626593806923, "grad_norm": 0.5237107872962952, "learning_rate": 0.00017134318730658373, "loss": 0.2892, "step": 2448 }, { "epoch": 0.4956486541185995, "grad_norm": 0.31656259298324585, "learning_rate": 0.00017132089564250003, "loss": 0.2734, "step": 2449 }, { "epoch": 0.49585104229912974, "grad_norm": 0.3295278549194336, "learning_rate": 0.00017129859676291647, "loss": 0.2296, "step": 2450 }, { "epoch": 0.49585104229912974, "eval_loss": 0.2731389105319977, "eval_runtime": 0.7382, "eval_samples_per_second": 6.773, "eval_steps_per_second": 1.355, "step": 2450 }, { "epoch": 0.49605343047966, "grad_norm": 0.28645098209381104, "learning_rate": 0.000171276290670089, "loss": 0.2562, "step": 2451 }, { "epoch": 0.49625581866019025, "grad_norm": 0.3923911154270172, "learning_rate": 0.00017125397736627437, "loss": 0.2886, "step": 2452 }, { "epoch": 0.4964582068407205, "grad_norm": 0.3624133765697479, "learning_rate": 0.00017123165685372995, 
"loss": 0.2733, "step": 2453 }, { "epoch": 0.49666059502125076, "grad_norm": 0.3099536895751953, "learning_rate": 0.00017120932913471392, "loss": 0.2276, "step": 2454 }, { "epoch": 0.496862983201781, "grad_norm": 0.33509066700935364, "learning_rate": 0.00017118699421148518, "loss": 0.2628, "step": 2455 }, { "epoch": 0.49706537138231127, "grad_norm": 0.3130567669868469, "learning_rate": 0.00017116465208630327, "loss": 0.2505, "step": 2456 }, { "epoch": 0.4972677595628415, "grad_norm": 0.6522201895713806, "learning_rate": 0.00017114230276142863, "loss": 0.2666, "step": 2457 }, { "epoch": 0.4974701477433718, "grad_norm": 0.2969781160354614, "learning_rate": 0.00017111994623912228, "loss": 0.2202, "step": 2458 }, { "epoch": 0.49767253592390204, "grad_norm": 0.29119473695755005, "learning_rate": 0.000171097582521646, "loss": 0.2414, "step": 2459 }, { "epoch": 0.4978749241044323, "grad_norm": 0.3184351325035095, "learning_rate": 0.00017107521161126234, "loss": 0.2583, "step": 2460 }, { "epoch": 0.49807731228496255, "grad_norm": 0.35448184609413147, "learning_rate": 0.0001710528335102346, "loss": 0.227, "step": 2461 }, { "epoch": 0.4982797004654928, "grad_norm": 0.2825421392917633, "learning_rate": 0.00017103044822082666, "loss": 0.1929, "step": 2462 }, { "epoch": 0.49848208864602306, "grad_norm": 0.3474180996417999, "learning_rate": 0.00017100805574530328, "loss": 0.2645, "step": 2463 }, { "epoch": 0.4986844768265533, "grad_norm": 0.30311545729637146, "learning_rate": 0.00017098565608592993, "loss": 0.2527, "step": 2464 }, { "epoch": 0.49888686500708357, "grad_norm": 0.3159215748310089, "learning_rate": 0.00017096324924497275, "loss": 0.2153, "step": 2465 }, { "epoch": 0.4990892531876138, "grad_norm": 0.2998165190219879, "learning_rate": 0.00017094083522469858, "loss": 0.2315, "step": 2466 }, { "epoch": 0.4992916413681441, "grad_norm": 0.40242812037467957, "learning_rate": 0.0001709184140273751, "loss": 0.2216, "step": 2467 }, { "epoch": 0.49949402954867433, "grad_norm": 
0.39427369832992554, "learning_rate": 0.00017089598565527063, "loss": 0.2425, "step": 2468 }, { "epoch": 0.4996964177292046, "grad_norm": 0.29181742668151855, "learning_rate": 0.00017087355011065423, "loss": 0.2357, "step": 2469 }, { "epoch": 0.49989880590973484, "grad_norm": 0.2653137743473053, "learning_rate": 0.00017085110739579567, "loss": 0.2186, "step": 2470 }, { "epoch": 0.5001011940902651, "grad_norm": 0.4047374725341797, "learning_rate": 0.00017082865751296553, "loss": 0.2876, "step": 2471 }, { "epoch": 0.5003035822707954, "grad_norm": 0.2697608768939972, "learning_rate": 0.00017080620046443503, "loss": 0.224, "step": 2472 }, { "epoch": 0.5005059704513256, "grad_norm": 0.3421246409416199, "learning_rate": 0.0001707837362524761, "loss": 0.2753, "step": 2473 }, { "epoch": 0.5007083586318559, "grad_norm": 0.378449410200119, "learning_rate": 0.00017076126487936146, "loss": 0.2409, "step": 2474 }, { "epoch": 0.5009107468123861, "grad_norm": 0.3779212534427643, "learning_rate": 0.00017073878634736456, "loss": 0.2712, "step": 2475 }, { "epoch": 0.5011131349929164, "grad_norm": 0.3363097012042999, "learning_rate": 0.0001707163006587595, "loss": 0.2327, "step": 2476 }, { "epoch": 0.5013155231734466, "grad_norm": 0.30967584252357483, "learning_rate": 0.00017069380781582113, "loss": 0.2203, "step": 2477 }, { "epoch": 0.5015179113539769, "grad_norm": 0.32559165358543396, "learning_rate": 0.00017067130782082507, "loss": 0.2088, "step": 2478 }, { "epoch": 0.5017202995345071, "grad_norm": 0.26994726061820984, "learning_rate": 0.00017064880067604765, "loss": 0.2497, "step": 2479 }, { "epoch": 0.5019226877150375, "grad_norm": 0.38163644075393677, "learning_rate": 0.0001706262863837659, "loss": 0.2536, "step": 2480 }, { "epoch": 0.5021250758955677, "grad_norm": 0.28695085644721985, "learning_rate": 0.00017060376494625753, "loss": 0.2414, "step": 2481 }, { "epoch": 0.502327464076098, "grad_norm": 0.2939639389514923, "learning_rate": 0.0001705812363658011, "loss": 0.2208, 
"step": 2482 }, { "epoch": 0.5025298522566282, "grad_norm": 0.29969316720962524, "learning_rate": 0.00017055870064467573, "loss": 0.2136, "step": 2483 }, { "epoch": 0.5027322404371585, "grad_norm": 0.48427799344062805, "learning_rate": 0.00017053615778516142, "loss": 0.2368, "step": 2484 }, { "epoch": 0.5029346286176887, "grad_norm": 0.37288084626197815, "learning_rate": 0.0001705136077895388, "loss": 0.2762, "step": 2485 }, { "epoch": 0.503137016798219, "grad_norm": 0.3011093735694885, "learning_rate": 0.00017049105066008923, "loss": 0.2111, "step": 2486 }, { "epoch": 0.5033394049787493, "grad_norm": 0.27865341305732727, "learning_rate": 0.0001704684863990948, "loss": 0.2527, "step": 2487 }, { "epoch": 0.5035417931592795, "grad_norm": 0.3852303922176361, "learning_rate": 0.00017044591500883834, "loss": 0.2248, "step": 2488 }, { "epoch": 0.5037441813398098, "grad_norm": 0.31749090552330017, "learning_rate": 0.00017042333649160336, "loss": 0.2468, "step": 2489 }, { "epoch": 0.50394656952034, "grad_norm": 0.3167575001716614, "learning_rate": 0.00017040075084967415, "loss": 0.2337, "step": 2490 }, { "epoch": 0.5041489577008703, "grad_norm": 0.3536628782749176, "learning_rate": 0.00017037815808533568, "loss": 0.2673, "step": 2491 }, { "epoch": 0.5043513458814005, "grad_norm": 0.4146457314491272, "learning_rate": 0.00017035555820087364, "loss": 0.2238, "step": 2492 }, { "epoch": 0.5045537340619308, "grad_norm": 0.32466617226600647, "learning_rate": 0.00017033295119857448, "loss": 0.2538, "step": 2493 }, { "epoch": 0.504756122242461, "grad_norm": 0.2737172544002533, "learning_rate": 0.00017031033708072527, "loss": 0.2318, "step": 2494 }, { "epoch": 0.5049585104229913, "grad_norm": 0.33758556842803955, "learning_rate": 0.00017028771584961394, "loss": 0.2248, "step": 2495 }, { "epoch": 0.5051608986035215, "grad_norm": 0.2949804365634918, "learning_rate": 0.00017026508750752904, "loss": 0.2426, "step": 2496 }, { "epoch": 0.5053632867840518, "grad_norm": 0.30609846115112305, 
"learning_rate": 0.00017024245205675986, "loss": 0.2452, "step": 2497 }, { "epoch": 0.505565674964582, "grad_norm": 0.3072340488433838, "learning_rate": 0.00017021980949959641, "loss": 0.2238, "step": 2498 }, { "epoch": 0.5057680631451124, "grad_norm": 0.3006766140460968, "learning_rate": 0.0001701971598383295, "loss": 0.2277, "step": 2499 }, { "epoch": 0.5059704513256426, "grad_norm": 0.33413830399513245, "learning_rate": 0.00017017450307525047, "loss": 0.2794, "step": 2500 }, { "epoch": 0.5059704513256426, "eval_loss": 0.2693020701408386, "eval_runtime": 0.7363, "eval_samples_per_second": 6.791, "eval_steps_per_second": 1.358, "step": 2500 }, { "epoch": 0.5061728395061729, "grad_norm": 0.35662609338760376, "learning_rate": 0.00017015183921265158, "loss": 0.2505, "step": 2501 }, { "epoch": 0.5063752276867031, "grad_norm": 0.3591224253177643, "learning_rate": 0.00017012916825282566, "loss": 0.2682, "step": 2502 }, { "epoch": 0.5065776158672334, "grad_norm": 0.5121231079101562, "learning_rate": 0.00017010649019806638, "loss": 0.2245, "step": 2503 }, { "epoch": 0.5067800040477636, "grad_norm": 0.3819142282009125, "learning_rate": 0.00017008380505066802, "loss": 0.2537, "step": 2504 }, { "epoch": 0.5069823922282939, "grad_norm": 0.29256799817085266, "learning_rate": 0.0001700611128129257, "loss": 0.2633, "step": 2505 }, { "epoch": 0.5071847804088241, "grad_norm": 0.3312610685825348, "learning_rate": 0.0001700384134871351, "loss": 0.2593, "step": 2506 }, { "epoch": 0.5073871685893544, "grad_norm": 0.303207129240036, "learning_rate": 0.00017001570707559274, "loss": 0.2476, "step": 2507 }, { "epoch": 0.5075895567698846, "grad_norm": 0.2712869644165039, "learning_rate": 0.00016999299358059575, "loss": 0.2118, "step": 2508 }, { "epoch": 0.5077919449504149, "grad_norm": 0.29760581254959106, "learning_rate": 0.00016997027300444213, "loss": 0.2539, "step": 2509 }, { "epoch": 0.5079943331309451, "grad_norm": 0.2707705795764923, "learning_rate": 0.00016994754534943048, "loss": 
0.2653, "step": 2510 }, { "epoch": 0.5081967213114754, "grad_norm": 0.24326786398887634, "learning_rate": 0.00016992481061786014, "loss": 0.225, "step": 2511 }, { "epoch": 0.5083991094920056, "grad_norm": 0.24204504489898682, "learning_rate": 0.0001699020688120312, "loss": 0.2092, "step": 2512 }, { "epoch": 0.5086014976725359, "grad_norm": 0.27428555488586426, "learning_rate": 0.00016987931993424438, "loss": 0.2461, "step": 2513 }, { "epoch": 0.5088038858530661, "grad_norm": 0.28147372603416443, "learning_rate": 0.0001698565639868012, "loss": 0.2386, "step": 2514 }, { "epoch": 0.5090062740335964, "grad_norm": 0.2533692419528961, "learning_rate": 0.0001698338009720039, "loss": 0.2271, "step": 2515 }, { "epoch": 0.5092086622141268, "grad_norm": 0.3866344094276428, "learning_rate": 0.0001698110308921554, "loss": 0.2464, "step": 2516 }, { "epoch": 0.509411050394657, "grad_norm": 0.21789251267910004, "learning_rate": 0.00016978825374955924, "loss": 0.1852, "step": 2517 }, { "epoch": 0.5096134385751873, "grad_norm": 0.26758989691734314, "learning_rate": 0.00016976546954651988, "loss": 0.2488, "step": 2518 }, { "epoch": 0.5098158267557175, "grad_norm": 0.33704081177711487, "learning_rate": 0.00016974267828534235, "loss": 0.2584, "step": 2519 }, { "epoch": 0.5100182149362478, "grad_norm": 0.2544485330581665, "learning_rate": 0.00016971987996833242, "loss": 0.2403, "step": 2520 }, { "epoch": 0.510220603116778, "grad_norm": 0.2556194067001343, "learning_rate": 0.00016969707459779665, "loss": 0.1753, "step": 2521 }, { "epoch": 0.5104229912973083, "grad_norm": 0.2857758402824402, "learning_rate": 0.00016967426217604214, "loss": 0.2157, "step": 2522 }, { "epoch": 0.5106253794778385, "grad_norm": 0.31661200523376465, "learning_rate": 0.00016965144270537688, "loss": 0.2374, "step": 2523 }, { "epoch": 0.5108277676583688, "grad_norm": 0.28023761510849, "learning_rate": 0.0001696286161881095, "loss": 0.2392, "step": 2524 }, { "epoch": 0.511030155838899, "grad_norm": 
0.252261221408844, "learning_rate": 0.00016960578262654931, "loss": 0.1939, "step": 2525 }, { "epoch": 0.5112325440194293, "grad_norm": 0.33003121614456177, "learning_rate": 0.00016958294202300644, "loss": 0.2063, "step": 2526 }, { "epoch": 0.5114349321999595, "grad_norm": 0.24865320324897766, "learning_rate": 0.0001695600943797916, "loss": 0.2183, "step": 2527 }, { "epoch": 0.5116373203804898, "grad_norm": 0.421440064907074, "learning_rate": 0.0001695372396992163, "loss": 0.292, "step": 2528 }, { "epoch": 0.51183970856102, "grad_norm": 0.2991817593574524, "learning_rate": 0.00016951437798359275, "loss": 0.219, "step": 2529 }, { "epoch": 0.5120420967415503, "grad_norm": 0.3103179931640625, "learning_rate": 0.00016949150923523384, "loss": 0.2484, "step": 2530 }, { "epoch": 0.5122444849220805, "grad_norm": 0.28612539172172546, "learning_rate": 0.00016946863345645316, "loss": 0.239, "step": 2531 }, { "epoch": 0.5124468731026108, "grad_norm": 0.4773065447807312, "learning_rate": 0.0001694457506495651, "loss": 0.2489, "step": 2532 }, { "epoch": 0.512649261283141, "grad_norm": 0.4546932578086853, "learning_rate": 0.00016942286081688467, "loss": 0.2987, "step": 2533 }, { "epoch": 0.5128516494636713, "grad_norm": 0.3810085654258728, "learning_rate": 0.0001693999639607276, "loss": 0.2857, "step": 2534 }, { "epoch": 0.5130540376442015, "grad_norm": 0.4003126621246338, "learning_rate": 0.0001693770600834104, "loss": 0.258, "step": 2535 }, { "epoch": 0.5132564258247319, "grad_norm": 0.3469572365283966, "learning_rate": 0.00016935414918725026, "loss": 0.2709, "step": 2536 }, { "epoch": 0.513458814005262, "grad_norm": 0.2894114851951599, "learning_rate": 0.000169331231274565, "loss": 0.2282, "step": 2537 }, { "epoch": 0.5136612021857924, "grad_norm": 0.3092636168003082, "learning_rate": 0.00016930830634767326, "loss": 0.263, "step": 2538 }, { "epoch": 0.5138635903663226, "grad_norm": 0.27841058373451233, "learning_rate": 0.0001692853744088943, "loss": 0.2381, "step": 2539 }, { 
"epoch": 0.5140659785468529, "grad_norm": 0.2825442850589752, "learning_rate": 0.00016926243546054817, "loss": 0.223, "step": 2540 }, { "epoch": 0.5142683667273831, "grad_norm": 0.34579232335090637, "learning_rate": 0.0001692394895049556, "loss": 0.2571, "step": 2541 }, { "epoch": 0.5144707549079134, "grad_norm": 0.3164571523666382, "learning_rate": 0.00016921653654443798, "loss": 0.25, "step": 2542 }, { "epoch": 0.5146731430884436, "grad_norm": 0.2637081742286682, "learning_rate": 0.00016919357658131749, "loss": 0.2312, "step": 2543 }, { "epoch": 0.5148755312689739, "grad_norm": 0.31452441215515137, "learning_rate": 0.00016917060961791695, "loss": 0.2233, "step": 2544 }, { "epoch": 0.5150779194495041, "grad_norm": 0.35815557837486267, "learning_rate": 0.00016914763565655997, "loss": 0.3006, "step": 2545 }, { "epoch": 0.5152803076300344, "grad_norm": 0.28938955068588257, "learning_rate": 0.0001691246546995707, "loss": 0.2273, "step": 2546 }, { "epoch": 0.5154826958105647, "grad_norm": 0.2862246632575989, "learning_rate": 0.00016910166674927423, "loss": 0.2668, "step": 2547 }, { "epoch": 0.5156850839910949, "grad_norm": 0.33494991064071655, "learning_rate": 0.0001690786718079962, "loss": 0.2825, "step": 2548 }, { "epoch": 0.5158874721716252, "grad_norm": 0.38121524453163147, "learning_rate": 0.00016905566987806297, "loss": 0.2888, "step": 2549 }, { "epoch": 0.5160898603521554, "grad_norm": 0.47657451033592224, "learning_rate": 0.00016903266096180162, "loss": 0.1814, "step": 2550 }, { "epoch": 0.5160898603521554, "eval_loss": 0.2675860524177551, "eval_runtime": 0.7411, "eval_samples_per_second": 6.747, "eval_steps_per_second": 1.349, "step": 2550 }, { "epoch": 0.5162922485326857, "grad_norm": 0.352152019739151, "learning_rate": 0.00016900964506154007, "loss": 0.2574, "step": 2551 }, { "epoch": 0.5164946367132159, "grad_norm": 0.37841930985450745, "learning_rate": 0.00016898662217960667, "loss": 0.2526, "step": 2552 }, { "epoch": 0.5166970248937462, "grad_norm": 
0.3507513999938965, "learning_rate": 0.00016896359231833075, "loss": 0.2916, "step": 2553 }, { "epoch": 0.5168994130742764, "grad_norm": 0.2841602861881256, "learning_rate": 0.00016894055548004216, "loss": 0.2546, "step": 2554 }, { "epoch": 0.5171018012548068, "grad_norm": 0.3018854856491089, "learning_rate": 0.00016891751166707154, "loss": 0.2364, "step": 2555 }, { "epoch": 0.517304189435337, "grad_norm": 0.30066660046577454, "learning_rate": 0.00016889446088175027, "loss": 0.235, "step": 2556 }, { "epoch": 0.5175065776158673, "grad_norm": 0.3245569169521332, "learning_rate": 0.00016887140312641034, "loss": 0.2193, "step": 2557 }, { "epoch": 0.5177089657963975, "grad_norm": 0.42227664589881897, "learning_rate": 0.0001688483384033845, "loss": 0.2189, "step": 2558 }, { "epoch": 0.5179113539769278, "grad_norm": 0.2770608961582184, "learning_rate": 0.00016882526671500617, "loss": 0.2144, "step": 2559 }, { "epoch": 0.518113742157458, "grad_norm": 0.38048413395881653, "learning_rate": 0.00016880218806360957, "loss": 0.2379, "step": 2560 }, { "epoch": 0.5183161303379883, "grad_norm": 0.4153047502040863, "learning_rate": 0.0001687791024515295, "loss": 0.2446, "step": 2561 }, { "epoch": 0.5185185185185185, "grad_norm": 0.3124135434627533, "learning_rate": 0.00016875600988110155, "loss": 0.2609, "step": 2562 }, { "epoch": 0.5187209066990488, "grad_norm": 0.3253217041492462, "learning_rate": 0.00016873291035466193, "loss": 0.2902, "step": 2563 }, { "epoch": 0.518923294879579, "grad_norm": 0.32971933484077454, "learning_rate": 0.00016870980387454764, "loss": 0.2548, "step": 2564 }, { "epoch": 0.5191256830601093, "grad_norm": 0.3016485869884491, "learning_rate": 0.00016868669044309642, "loss": 0.2022, "step": 2565 }, { "epoch": 0.5193280712406395, "grad_norm": 0.299196720123291, "learning_rate": 0.00016866357006264652, "loss": 0.2464, "step": 2566 }, { "epoch": 0.5195304594211698, "grad_norm": 0.30322641134262085, "learning_rate": 0.00016864044273553713, "loss": 0.2517, 
"step": 2567 }, { "epoch": 0.5197328476017, "grad_norm": 0.32855749130249023, "learning_rate": 0.00016861730846410794, "loss": 0.2456, "step": 2568 }, { "epoch": 0.5199352357822303, "grad_norm": 0.3540678322315216, "learning_rate": 0.00016859416725069947, "loss": 0.2404, "step": 2569 }, { "epoch": 0.5201376239627605, "grad_norm": 0.28167209029197693, "learning_rate": 0.00016857101909765294, "loss": 0.2428, "step": 2570 }, { "epoch": 0.5203400121432908, "grad_norm": 0.36069634556770325, "learning_rate": 0.0001685478640073102, "loss": 0.2775, "step": 2571 }, { "epoch": 0.520542400323821, "grad_norm": 0.29065969586372375, "learning_rate": 0.00016852470198201383, "loss": 0.2611, "step": 2572 }, { "epoch": 0.5207447885043514, "grad_norm": 0.23961427807807922, "learning_rate": 0.00016850153302410713, "loss": 0.2346, "step": 2573 }, { "epoch": 0.5209471766848816, "grad_norm": 0.30201297998428345, "learning_rate": 0.00016847835713593412, "loss": 0.206, "step": 2574 }, { "epoch": 0.5211495648654119, "grad_norm": 0.30087214708328247, "learning_rate": 0.00016845517431983946, "loss": 0.1782, "step": 2575 }, { "epoch": 0.5213519530459421, "grad_norm": 0.43761369585990906, "learning_rate": 0.00016843198457816856, "loss": 0.2583, "step": 2576 }, { "epoch": 0.5215543412264724, "grad_norm": 0.2883847653865814, "learning_rate": 0.0001684087879132675, "loss": 0.2656, "step": 2577 }, { "epoch": 0.5217567294070027, "grad_norm": 0.24638386070728302, "learning_rate": 0.00016838558432748308, "loss": 0.2279, "step": 2578 }, { "epoch": 0.5219591175875329, "grad_norm": 0.27895137667655945, "learning_rate": 0.00016836237382316283, "loss": 0.2412, "step": 2579 }, { "epoch": 0.5221615057680632, "grad_norm": 0.34711912274360657, "learning_rate": 0.00016833915640265484, "loss": 0.2468, "step": 2580 }, { "epoch": 0.5223638939485934, "grad_norm": 0.3112831711769104, "learning_rate": 0.00016831593206830816, "loss": 0.2487, "step": 2581 }, { "epoch": 0.5225662821291237, "grad_norm": 
0.3572124242782593, "learning_rate": 0.00016829270082247227, "loss": 0.2768, "step": 2582 }, { "epoch": 0.5227686703096539, "grad_norm": 0.2616262137889862, "learning_rate": 0.00016826946266749752, "loss": 0.2074, "step": 2583 }, { "epoch": 0.5229710584901842, "grad_norm": 0.3566007614135742, "learning_rate": 0.00016824621760573485, "loss": 0.228, "step": 2584 }, { "epoch": 0.5231734466707144, "grad_norm": 0.2676408886909485, "learning_rate": 0.000168222965639536, "loss": 0.237, "step": 2585 }, { "epoch": 0.5233758348512447, "grad_norm": 0.40267249941825867, "learning_rate": 0.00016819970677125335, "loss": 0.2302, "step": 2586 }, { "epoch": 0.5235782230317749, "grad_norm": 0.2821453809738159, "learning_rate": 0.00016817644100323995, "loss": 0.24, "step": 2587 }, { "epoch": 0.5237806112123052, "grad_norm": 0.36152565479278564, "learning_rate": 0.00016815316833784962, "loss": 0.2853, "step": 2588 }, { "epoch": 0.5239829993928354, "grad_norm": 0.324090838432312, "learning_rate": 0.00016812988877743686, "loss": 0.2615, "step": 2589 }, { "epoch": 0.5241853875733657, "grad_norm": 0.2735280692577362, "learning_rate": 0.00016810660232435685, "loss": 0.2296, "step": 2590 }, { "epoch": 0.524387775753896, "grad_norm": 0.29336437582969666, "learning_rate": 0.00016808330898096543, "loss": 0.2259, "step": 2591 }, { "epoch": 0.5245901639344263, "grad_norm": 0.3331908583641052, "learning_rate": 0.00016806000874961918, "loss": 0.2717, "step": 2592 }, { "epoch": 0.5247925521149565, "grad_norm": 0.30863717198371887, "learning_rate": 0.00016803670163267542, "loss": 0.2502, "step": 2593 }, { "epoch": 0.5249949402954868, "grad_norm": 0.42840126156806946, "learning_rate": 0.00016801338763249208, "loss": 0.2422, "step": 2594 }, { "epoch": 0.525197328476017, "grad_norm": 0.28465163707733154, "learning_rate": 0.0001679900667514278, "loss": 0.231, "step": 2595 }, { "epoch": 0.5253997166565473, "grad_norm": 0.46548783779144287, "learning_rate": 0.00016796673899184203, "loss": 0.2115, "step": 
2596 }, { "epoch": 0.5256021048370775, "grad_norm": 0.32914069294929504, "learning_rate": 0.00016794340435609474, "loss": 0.2153, "step": 2597 }, { "epoch": 0.5258044930176078, "grad_norm": 0.34944626688957214, "learning_rate": 0.00016792006284654677, "loss": 0.2272, "step": 2598 }, { "epoch": 0.526006881198138, "grad_norm": 0.28967317938804626, "learning_rate": 0.00016789671446555945, "loss": 0.2393, "step": 2599 }, { "epoch": 0.5262092693786683, "grad_norm": 0.2868635952472687, "learning_rate": 0.00016787335921549502, "loss": 0.2639, "step": 2600 }, { "epoch": 0.5262092693786683, "eval_loss": 0.2700594961643219, "eval_runtime": 0.7374, "eval_samples_per_second": 6.78, "eval_steps_per_second": 1.356, "step": 2600 }, { "epoch": 0.5264116575591985, "grad_norm": 0.3494894504547119, "learning_rate": 0.0001678499970987163, "loss": 0.2236, "step": 2601 }, { "epoch": 0.5266140457397288, "grad_norm": 0.2993201017379761, "learning_rate": 0.00016782662811758682, "loss": 0.2574, "step": 2602 }, { "epoch": 0.526816433920259, "grad_norm": 0.36104443669319153, "learning_rate": 0.0001678032522744708, "loss": 0.2583, "step": 2603 }, { "epoch": 0.5270188221007893, "grad_norm": 0.355884313583374, "learning_rate": 0.00016777986957173315, "loss": 0.2564, "step": 2604 }, { "epoch": 0.5272212102813195, "grad_norm": 0.2789347767829895, "learning_rate": 0.00016775648001173953, "loss": 0.2451, "step": 2605 }, { "epoch": 0.5274235984618498, "grad_norm": 0.4173826277256012, "learning_rate": 0.0001677330835968562, "loss": 0.2581, "step": 2606 }, { "epoch": 0.5276259866423801, "grad_norm": 0.32102257013320923, "learning_rate": 0.0001677096803294502, "loss": 0.2328, "step": 2607 }, { "epoch": 0.5278283748229103, "grad_norm": 0.3866223394870758, "learning_rate": 0.00016768627021188922, "loss": 0.2324, "step": 2608 }, { "epoch": 0.5280307630034407, "grad_norm": 0.24886023998260498, "learning_rate": 0.00016766285324654163, "loss": 0.2293, "step": 2609 }, { "epoch": 0.5282331511839709, 
"grad_norm": 0.4187617897987366, "learning_rate": 0.00016763942943577654, "loss": 0.2507, "step": 2610 }, { "epoch": 0.5284355393645012, "grad_norm": 0.323421835899353, "learning_rate": 0.0001676159987819637, "loss": 0.2571, "step": 2611 }, { "epoch": 0.5286379275450314, "grad_norm": 0.2955639958381653, "learning_rate": 0.00016759256128747358, "loss": 0.2485, "step": 2612 }, { "epoch": 0.5288403157255617, "grad_norm": 0.3304389417171478, "learning_rate": 0.00016756911695467737, "loss": 0.2401, "step": 2613 }, { "epoch": 0.5290427039060919, "grad_norm": 0.27834227681159973, "learning_rate": 0.0001675456657859469, "loss": 0.1978, "step": 2614 }, { "epoch": 0.5292450920866222, "grad_norm": 0.29491370916366577, "learning_rate": 0.00016752220778365468, "loss": 0.2288, "step": 2615 }, { "epoch": 0.5294474802671524, "grad_norm": 0.39043062925338745, "learning_rate": 0.000167498742950174, "loss": 0.2469, "step": 2616 }, { "epoch": 0.5296498684476827, "grad_norm": 0.28894317150115967, "learning_rate": 0.0001674752712878788, "loss": 0.2242, "step": 2617 }, { "epoch": 0.5298522566282129, "grad_norm": 0.2729368805885315, "learning_rate": 0.0001674517927991436, "loss": 0.2229, "step": 2618 }, { "epoch": 0.5300546448087432, "grad_norm": 0.3531043231487274, "learning_rate": 0.00016742830748634382, "loss": 0.2392, "step": 2619 }, { "epoch": 0.5302570329892734, "grad_norm": 0.31617650389671326, "learning_rate": 0.00016740481535185538, "loss": 0.2367, "step": 2620 }, { "epoch": 0.5304594211698037, "grad_norm": 0.28258374333381653, "learning_rate": 0.000167381316398055, "loss": 0.2556, "step": 2621 }, { "epoch": 0.5306618093503339, "grad_norm": 0.2794475853443146, "learning_rate": 0.00016735781062732005, "loss": 0.2592, "step": 2622 }, { "epoch": 0.5308641975308642, "grad_norm": 0.25831735134124756, "learning_rate": 0.00016733429804202858, "loss": 0.2083, "step": 2623 }, { "epoch": 0.5310665857113944, "grad_norm": 0.35561296343803406, "learning_rate": 0.0001673107786445594, "loss": 
0.2555, "step": 2624 }, { "epoch": 0.5312689738919247, "grad_norm": 0.2760870158672333, "learning_rate": 0.0001672872524372919, "loss": 0.2023, "step": 2625 }, { "epoch": 0.5314713620724549, "grad_norm": 0.33602291345596313, "learning_rate": 0.00016726371942260625, "loss": 0.2226, "step": 2626 }, { "epoch": 0.5316737502529852, "grad_norm": 0.302262544631958, "learning_rate": 0.00016724017960288324, "loss": 0.2788, "step": 2627 }, { "epoch": 0.5318761384335154, "grad_norm": 0.26179543137550354, "learning_rate": 0.00016721663298050442, "loss": 0.242, "step": 2628 }, { "epoch": 0.5320785266140458, "grad_norm": 0.27100905776023865, "learning_rate": 0.00016719307955785195, "loss": 0.2288, "step": 2629 }, { "epoch": 0.532280914794576, "grad_norm": 0.3250473141670227, "learning_rate": 0.00016716951933730875, "loss": 0.2845, "step": 2630 }, { "epoch": 0.5324833029751063, "grad_norm": 0.2948412001132965, "learning_rate": 0.0001671459523212584, "loss": 0.2337, "step": 2631 }, { "epoch": 0.5326856911556365, "grad_norm": 0.3160333037376404, "learning_rate": 0.0001671223785120851, "loss": 0.2939, "step": 2632 }, { "epoch": 0.5328880793361668, "grad_norm": 0.2872770428657532, "learning_rate": 0.0001670987979121739, "loss": 0.2317, "step": 2633 }, { "epoch": 0.533090467516697, "grad_norm": 0.2879941761493683, "learning_rate": 0.00016707521052391035, "loss": 0.2457, "step": 2634 }, { "epoch": 0.5332928556972273, "grad_norm": 0.2784724235534668, "learning_rate": 0.00016705161634968086, "loss": 0.2411, "step": 2635 }, { "epoch": 0.5334952438777575, "grad_norm": 0.2538287043571472, "learning_rate": 0.00016702801539187235, "loss": 0.2063, "step": 2636 }, { "epoch": 0.5336976320582878, "grad_norm": 0.30907729268074036, "learning_rate": 0.00016700440765287256, "loss": 0.2445, "step": 2637 }, { "epoch": 0.5339000202388181, "grad_norm": 0.28510886430740356, "learning_rate": 0.00016698079313506989, "loss": 0.2463, "step": 2638 }, { "epoch": 0.5341024084193483, "grad_norm": 
0.32631993293762207, "learning_rate": 0.0001669571718408534, "loss": 0.2277, "step": 2639 }, { "epoch": 0.5343047965998786, "grad_norm": 0.298195481300354, "learning_rate": 0.00016693354377261284, "loss": 0.2531, "step": 2640 }, { "epoch": 0.5345071847804088, "grad_norm": 0.32875820994377136, "learning_rate": 0.00016690990893273862, "loss": 0.2492, "step": 2641 }, { "epoch": 0.5347095729609391, "grad_norm": 0.27733179926872253, "learning_rate": 0.0001668862673236219, "loss": 0.2143, "step": 2642 }, { "epoch": 0.5349119611414693, "grad_norm": 0.30193451046943665, "learning_rate": 0.00016686261894765448, "loss": 0.2301, "step": 2643 }, { "epoch": 0.5351143493219996, "grad_norm": 0.3172464072704315, "learning_rate": 0.00016683896380722887, "loss": 0.2468, "step": 2644 }, { "epoch": 0.5353167375025298, "grad_norm": 1.5048164129257202, "learning_rate": 0.00016681530190473822, "loss": 0.2744, "step": 2645 }, { "epoch": 0.5355191256830601, "grad_norm": 0.3033134937286377, "learning_rate": 0.00016679163324257644, "loss": 0.253, "step": 2646 }, { "epoch": 0.5357215138635903, "grad_norm": 0.36687731742858887, "learning_rate": 0.00016676795782313798, "loss": 0.2557, "step": 2647 }, { "epoch": 0.5359239020441207, "grad_norm": 0.38913866877555847, "learning_rate": 0.00016674427564881817, "loss": 0.2596, "step": 2648 }, { "epoch": 0.5361262902246509, "grad_norm": 0.3134247064590454, "learning_rate": 0.0001667205867220129, "loss": 0.2355, "step": 2649 }, { "epoch": 0.5363286784051812, "grad_norm": 0.30010947585105896, "learning_rate": 0.00016669689104511877, "loss": 0.2351, "step": 2650 }, { "epoch": 0.5363286784051812, "eval_loss": 0.2756694257259369, "eval_runtime": 0.7371, "eval_samples_per_second": 6.783, "eval_steps_per_second": 1.357, "step": 2650 }, { "epoch": 0.5365310665857114, "grad_norm": 0.3701987564563751, "learning_rate": 0.00016667318862053303, "loss": 0.2625, "step": 2651 }, { "epoch": 0.5367334547662417, "grad_norm": 0.2762279510498047, "learning_rate": 
0.00016664947945065365, "loss": 0.2387, "step": 2652 }, { "epoch": 0.5369358429467719, "grad_norm": 0.5423218011856079, "learning_rate": 0.0001666257635378793, "loss": 0.2536, "step": 2653 }, { "epoch": 0.5371382311273022, "grad_norm": 0.3626985549926758, "learning_rate": 0.00016660204088460927, "loss": 0.2439, "step": 2654 }, { "epoch": 0.5373406193078324, "grad_norm": 0.4017620384693146, "learning_rate": 0.00016657831149324362, "loss": 0.2285, "step": 2655 }, { "epoch": 0.5375430074883627, "grad_norm": 0.29086053371429443, "learning_rate": 0.000166554575366183, "loss": 0.2206, "step": 2656 }, { "epoch": 0.5377453956688929, "grad_norm": 0.2927820682525635, "learning_rate": 0.0001665308325058288, "loss": 0.2295, "step": 2657 }, { "epoch": 0.5379477838494232, "grad_norm": 0.4150175154209137, "learning_rate": 0.00016650708291458303, "loss": 0.2561, "step": 2658 }, { "epoch": 0.5381501720299534, "grad_norm": 0.33315229415893555, "learning_rate": 0.00016648332659484848, "loss": 0.2453, "step": 2659 }, { "epoch": 0.5383525602104837, "grad_norm": 0.39167946577072144, "learning_rate": 0.00016645956354902858, "loss": 0.274, "step": 2660 }, { "epoch": 0.5385549483910139, "grad_norm": 0.2934190332889557, "learning_rate": 0.00016643579377952737, "loss": 0.2584, "step": 2661 }, { "epoch": 0.5387573365715442, "grad_norm": 0.3536292314529419, "learning_rate": 0.00016641201728874965, "loss": 0.2535, "step": 2662 }, { "epoch": 0.5389597247520744, "grad_norm": 0.29246678948402405, "learning_rate": 0.00016638823407910084, "loss": 0.2585, "step": 2663 }, { "epoch": 0.5391621129326047, "grad_norm": 0.43456900119781494, "learning_rate": 0.00016636444415298716, "loss": 0.267, "step": 2664 }, { "epoch": 0.5393645011131349, "grad_norm": 0.2911350131034851, "learning_rate": 0.00016634064751281535, "loss": 0.217, "step": 2665 }, { "epoch": 0.5395668892936653, "grad_norm": 0.31089237332344055, "learning_rate": 0.00016631684416099294, "loss": 0.2452, "step": 2666 }, { "epoch": 
0.5397692774741955, "grad_norm": 0.34671053290367126, "learning_rate": 0.00016629303409992807, "loss": 0.2875, "step": 2667 }, { "epoch": 0.5399716656547258, "grad_norm": 0.2837030291557312, "learning_rate": 0.00016626921733202963, "loss": 0.2345, "step": 2668 }, { "epoch": 0.5401740538352561, "grad_norm": 0.48084816336631775, "learning_rate": 0.00016624539385970714, "loss": 0.2603, "step": 2669 }, { "epoch": 0.5403764420157863, "grad_norm": 0.33831149339675903, "learning_rate": 0.0001662215636853708, "loss": 0.2847, "step": 2670 }, { "epoch": 0.5405788301963166, "grad_norm": 0.27991217374801636, "learning_rate": 0.0001661977268114315, "loss": 0.2168, "step": 2671 }, { "epoch": 0.5407812183768468, "grad_norm": 0.2763236463069916, "learning_rate": 0.0001661738832403008, "loss": 0.2532, "step": 2672 }, { "epoch": 0.5409836065573771, "grad_norm": 0.31166934967041016, "learning_rate": 0.00016615003297439099, "loss": 0.2738, "step": 2673 }, { "epoch": 0.5411859947379073, "grad_norm": 0.30180248618125916, "learning_rate": 0.00016612617601611488, "loss": 0.2273, "step": 2674 }, { "epoch": 0.5413883829184376, "grad_norm": 0.25693386793136597, "learning_rate": 0.00016610231236788617, "loss": 0.2059, "step": 2675 }, { "epoch": 0.5415907710989678, "grad_norm": 0.2835194170475006, "learning_rate": 0.0001660784420321191, "loss": 0.2069, "step": 2676 }, { "epoch": 0.5417931592794981, "grad_norm": 0.4018629491329193, "learning_rate": 0.00016605456501122862, "loss": 0.2489, "step": 2677 }, { "epoch": 0.5419955474600283, "grad_norm": 0.3063729405403137, "learning_rate": 0.00016603068130763036, "loss": 0.2791, "step": 2678 }, { "epoch": 0.5421979356405586, "grad_norm": 0.2935909330844879, "learning_rate": 0.0001660067909237406, "loss": 0.2437, "step": 2679 }, { "epoch": 0.5424003238210888, "grad_norm": 0.27330735325813293, "learning_rate": 0.00016598289386197634, "loss": 0.1945, "step": 2680 }, { "epoch": 0.5426027120016191, "grad_norm": 0.3396134078502655, "learning_rate": 
0.00016595899012475523, "loss": 0.2554, "step": 2681 }, { "epoch": 0.5428051001821493, "grad_norm": 0.26023051142692566, "learning_rate": 0.00016593507971449561, "loss": 0.2245, "step": 2682 }, { "epoch": 0.5430074883626796, "grad_norm": 0.3104912042617798, "learning_rate": 0.00016591116263361646, "loss": 0.2393, "step": 2683 }, { "epoch": 0.5432098765432098, "grad_norm": 0.26407697796821594, "learning_rate": 0.00016588723888453748, "loss": 0.2586, "step": 2684 }, { "epoch": 0.5434122647237402, "grad_norm": 0.30238598585128784, "learning_rate": 0.000165863308469679, "loss": 0.253, "step": 2685 }, { "epoch": 0.5436146529042704, "grad_norm": 0.33824655413627625, "learning_rate": 0.0001658393713914621, "loss": 0.2808, "step": 2686 }, { "epoch": 0.5438170410848007, "grad_norm": 0.725450873374939, "learning_rate": 0.0001658154276523084, "loss": 0.2531, "step": 2687 }, { "epoch": 0.5440194292653309, "grad_norm": 0.28861457109451294, "learning_rate": 0.00016579147725464036, "loss": 0.2393, "step": 2688 }, { "epoch": 0.5442218174458612, "grad_norm": 0.40884849429130554, "learning_rate": 0.000165767520200881, "loss": 0.2718, "step": 2689 }, { "epoch": 0.5444242056263914, "grad_norm": 0.5971659421920776, "learning_rate": 0.000165743556493454, "loss": 0.2587, "step": 2690 }, { "epoch": 0.5446265938069217, "grad_norm": 0.26276418566703796, "learning_rate": 0.00016571958613478382, "loss": 0.2386, "step": 2691 }, { "epoch": 0.5448289819874519, "grad_norm": 0.2730572819709778, "learning_rate": 0.00016569560912729552, "loss": 0.2305, "step": 2692 }, { "epoch": 0.5450313701679822, "grad_norm": 0.3255332410335541, "learning_rate": 0.00016567162547341478, "loss": 0.2445, "step": 2693 }, { "epoch": 0.5452337583485124, "grad_norm": 0.32056882977485657, "learning_rate": 0.0001656476351755681, "loss": 0.2668, "step": 2694 }, { "epoch": 0.5454361465290427, "grad_norm": 0.3126126229763031, "learning_rate": 0.0001656236382361825, "loss": 0.2294, "step": 2695 }, { "epoch": 
0.5456385347095729, "grad_norm": 0.38915979862213135, "learning_rate": 0.00016559963465768575, "loss": 0.229, "step": 2696 }, { "epoch": 0.5458409228901032, "grad_norm": 0.3153875172138214, "learning_rate": 0.00016557562444250633, "loss": 0.233, "step": 2697 }, { "epoch": 0.5460433110706335, "grad_norm": 0.3637772798538208, "learning_rate": 0.0001655516075930733, "loss": 0.2572, "step": 2698 }, { "epoch": 0.5462456992511637, "grad_norm": 0.28701117634773254, "learning_rate": 0.00016552758411181643, "loss": 0.2588, "step": 2699 }, { "epoch": 0.546448087431694, "grad_norm": 0.3291356563568115, "learning_rate": 0.00016550355400116615, "loss": 0.2454, "step": 2700 }, { "epoch": 0.546448087431694, "eval_loss": 0.2727106213569641, "eval_runtime": 0.7413, "eval_samples_per_second": 6.745, "eval_steps_per_second": 1.349, "step": 2700 }, { "epoch": 0.5466504756122242, "grad_norm": 0.3378230035305023, "learning_rate": 0.0001654795172635536, "loss": 0.2608, "step": 2701 }, { "epoch": 0.5468528637927546, "grad_norm": 0.28990438580513, "learning_rate": 0.0001654554739014106, "loss": 0.2299, "step": 2702 }, { "epoch": 0.5470552519732848, "grad_norm": 0.2976401150226593, "learning_rate": 0.00016543142391716958, "loss": 0.2179, "step": 2703 }, { "epoch": 0.5472576401538151, "grad_norm": 0.3290218412876129, "learning_rate": 0.00016540736731326358, "loss": 0.2793, "step": 2704 }, { "epoch": 0.5474600283343453, "grad_norm": 0.3076421618461609, "learning_rate": 0.00016538330409212655, "loss": 0.2466, "step": 2705 }, { "epoch": 0.5476624165148756, "grad_norm": 0.2529980540275574, "learning_rate": 0.00016535923425619283, "loss": 0.2289, "step": 2706 }, { "epoch": 0.5478648046954058, "grad_norm": 0.3483887314796448, "learning_rate": 0.00016533515780789757, "loss": 0.2344, "step": 2707 }, { "epoch": 0.5480671928759361, "grad_norm": 0.46474558115005493, "learning_rate": 0.00016531107474967663, "loss": 0.2104, "step": 2708 }, { "epoch": 0.5482695810564663, "grad_norm": 0.3963288962841034, 
"learning_rate": 0.00016528698508396644, "loss": 0.2273, "step": 2709 }, { "epoch": 0.5484719692369966, "grad_norm": 0.32386282086372375, "learning_rate": 0.00016526288881320414, "loss": 0.213, "step": 2710 }, { "epoch": 0.5486743574175268, "grad_norm": 0.3164507746696472, "learning_rate": 0.00016523878593982755, "loss": 0.2496, "step": 2711 }, { "epoch": 0.5488767455980571, "grad_norm": 0.316854864358902, "learning_rate": 0.00016521467646627515, "loss": 0.2555, "step": 2712 }, { "epoch": 0.5490791337785873, "grad_norm": 0.38298240303993225, "learning_rate": 0.00016519056039498607, "loss": 0.2372, "step": 2713 }, { "epoch": 0.5492815219591176, "grad_norm": 0.4422507584095001, "learning_rate": 0.00016516643772840011, "loss": 0.2916, "step": 2714 }, { "epoch": 0.5494839101396478, "grad_norm": 0.3752078711986542, "learning_rate": 0.0001651423084689578, "loss": 0.2451, "step": 2715 }, { "epoch": 0.5496862983201781, "grad_norm": 0.25422441959381104, "learning_rate": 0.00016511817261910023, "loss": 0.1938, "step": 2716 }, { "epoch": 0.5498886865007083, "grad_norm": 0.2828325927257538, "learning_rate": 0.00016509403018126923, "loss": 0.2518, "step": 2717 }, { "epoch": 0.5500910746812386, "grad_norm": 0.31741753220558167, "learning_rate": 0.00016506988115790727, "loss": 0.2348, "step": 2718 }, { "epoch": 0.5502934628617688, "grad_norm": 0.2640233337879181, "learning_rate": 0.0001650457255514575, "loss": 0.2212, "step": 2719 }, { "epoch": 0.5504958510422991, "grad_norm": 0.29826608300209045, "learning_rate": 0.0001650215633643638, "loss": 0.2049, "step": 2720 }, { "epoch": 0.5506982392228293, "grad_norm": 0.3221287429332733, "learning_rate": 0.00016499739459907052, "loss": 0.246, "step": 2721 }, { "epoch": 0.5509006274033597, "grad_norm": 0.2922976016998291, "learning_rate": 0.00016497321925802285, "loss": 0.1945, "step": 2722 }, { "epoch": 0.5511030155838899, "grad_norm": 0.4366983473300934, "learning_rate": 0.00016494903734366663, "loss": 0.2492, "step": 2723 }, { 
"epoch": 0.5513054037644202, "grad_norm": 0.4032882750034332, "learning_rate": 0.00016492484885844834, "loss": 0.2792, "step": 2724 }, { "epoch": 0.5515077919449504, "grad_norm": 0.4576028287410736, "learning_rate": 0.00016490065380481508, "loss": 0.2362, "step": 2725 }, { "epoch": 0.5517101801254807, "grad_norm": 0.29097214341163635, "learning_rate": 0.00016487645218521464, "loss": 0.1913, "step": 2726 }, { "epoch": 0.5519125683060109, "grad_norm": 0.5986493229866028, "learning_rate": 0.00016485224400209555, "loss": 0.2713, "step": 2727 }, { "epoch": 0.5521149564865412, "grad_norm": 0.33854955434799194, "learning_rate": 0.0001648280292579069, "loss": 0.2676, "step": 2728 }, { "epoch": 0.5523173446670715, "grad_norm": 0.28433772921562195, "learning_rate": 0.00016480380795509843, "loss": 0.2161, "step": 2729 }, { "epoch": 0.5525197328476017, "grad_norm": 0.34827956557273865, "learning_rate": 0.00016477958009612068, "loss": 0.2212, "step": 2730 }, { "epoch": 0.552722121028132, "grad_norm": 0.3341020941734314, "learning_rate": 0.00016475534568342472, "loss": 0.2367, "step": 2731 }, { "epoch": 0.5529245092086622, "grad_norm": 0.29117056727409363, "learning_rate": 0.00016473110471946237, "loss": 0.2335, "step": 2732 }, { "epoch": 0.5531268973891925, "grad_norm": 0.3291405141353607, "learning_rate": 0.00016470685720668606, "loss": 0.247, "step": 2733 }, { "epoch": 0.5533292855697227, "grad_norm": 0.378342866897583, "learning_rate": 0.00016468260314754892, "loss": 0.2516, "step": 2734 }, { "epoch": 0.553531673750253, "grad_norm": 0.498351126909256, "learning_rate": 0.00016465834254450468, "loss": 0.2192, "step": 2735 }, { "epoch": 0.5537340619307832, "grad_norm": 0.3393566906452179, "learning_rate": 0.00016463407540000778, "loss": 0.2314, "step": 2736 }, { "epoch": 0.5539364501113135, "grad_norm": 0.5438311696052551, "learning_rate": 0.00016460980171651338, "loss": 0.2736, "step": 2737 }, { "epoch": 0.5541388382918437, "grad_norm": 0.3348614573478699, "learning_rate": 
0.00016458552149647716, "loss": 0.2654, "step": 2738 }, { "epoch": 0.554341226472374, "grad_norm": 0.312345951795578, "learning_rate": 0.00016456123474235552, "loss": 0.2666, "step": 2739 }, { "epoch": 0.5545436146529042, "grad_norm": 0.2773478627204895, "learning_rate": 0.00016453694145660564, "loss": 0.2572, "step": 2740 }, { "epoch": 0.5547460028334346, "grad_norm": 0.3372350037097931, "learning_rate": 0.00016451264164168516, "loss": 0.2576, "step": 2741 }, { "epoch": 0.5549483910139648, "grad_norm": 0.34194353222846985, "learning_rate": 0.00016448833530005255, "loss": 0.2337, "step": 2742 }, { "epoch": 0.5551507791944951, "grad_norm": 0.29681119322776794, "learning_rate": 0.00016446402243416682, "loss": 0.2518, "step": 2743 }, { "epoch": 0.5553531673750253, "grad_norm": 0.31567174196243286, "learning_rate": 0.0001644397030464877, "loss": 0.2543, "step": 2744 }, { "epoch": 0.5555555555555556, "grad_norm": 0.3233564794063568, "learning_rate": 0.00016441537713947563, "loss": 0.251, "step": 2745 }, { "epoch": 0.5557579437360858, "grad_norm": 0.25843173265457153, "learning_rate": 0.00016439104471559156, "loss": 0.2231, "step": 2746 }, { "epoch": 0.5559603319166161, "grad_norm": 0.38345223665237427, "learning_rate": 0.00016436670577729727, "loss": 0.2568, "step": 2747 }, { "epoch": 0.5561627200971463, "grad_norm": 0.361279159784317, "learning_rate": 0.00016434236032705508, "loss": 0.2421, "step": 2748 }, { "epoch": 0.5563651082776766, "grad_norm": 0.29712972044944763, "learning_rate": 0.000164318008367328, "loss": 0.2375, "step": 2749 }, { "epoch": 0.5565674964582068, "grad_norm": 0.3402288556098938, "learning_rate": 0.0001642936499005797, "loss": 0.2556, "step": 2750 }, { "epoch": 0.5565674964582068, "eval_loss": 0.271230548620224, "eval_runtime": 0.7399, "eval_samples_per_second": 6.758, "eval_steps_per_second": 1.352, "step": 2750 }, { "epoch": 0.5567698846387371, "grad_norm": 0.31470251083374023, "learning_rate": 0.00016426928492927455, "loss": 0.272, "step": 
2751 }, { "epoch": 0.5569722728192673, "grad_norm": 0.40976059436798096, "learning_rate": 0.0001642449134558775, "loss": 0.2206, "step": 2752 }, { "epoch": 0.5571746609997976, "grad_norm": 0.3531858026981354, "learning_rate": 0.00016422053548285424, "loss": 0.2526, "step": 2753 }, { "epoch": 0.5573770491803278, "grad_norm": 0.33502495288848877, "learning_rate": 0.00016419615101267103, "loss": 0.2359, "step": 2754 }, { "epoch": 0.5575794373608581, "grad_norm": 0.24423053860664368, "learning_rate": 0.0001641717600477949, "loss": 0.2205, "step": 2755 }, { "epoch": 0.5577818255413883, "grad_norm": 0.3108390271663666, "learning_rate": 0.00016414736259069338, "loss": 0.2626, "step": 2756 }, { "epoch": 0.5579842137219186, "grad_norm": 0.3606570363044739, "learning_rate": 0.00016412295864383486, "loss": 0.2688, "step": 2757 }, { "epoch": 0.5581866019024488, "grad_norm": 0.42568790912628174, "learning_rate": 0.0001640985482096882, "loss": 0.2456, "step": 2758 }, { "epoch": 0.5583889900829792, "grad_norm": 0.35818469524383545, "learning_rate": 0.00016407413129072303, "loss": 0.2762, "step": 2759 }, { "epoch": 0.5585913782635095, "grad_norm": 0.27855581045150757, "learning_rate": 0.00016404970788940957, "loss": 0.2409, "step": 2760 }, { "epoch": 0.5587937664440397, "grad_norm": 0.3034149706363678, "learning_rate": 0.0001640252780082187, "loss": 0.2416, "step": 2761 }, { "epoch": 0.55899615462457, "grad_norm": 0.2733892798423767, "learning_rate": 0.00016400084164962201, "loss": 0.256, "step": 2762 }, { "epoch": 0.5591985428051002, "grad_norm": 0.3733433485031128, "learning_rate": 0.00016397639881609175, "loss": 0.266, "step": 2763 }, { "epoch": 0.5594009309856305, "grad_norm": 0.29257914423942566, "learning_rate": 0.00016395194951010074, "loss": 0.2496, "step": 2764 }, { "epoch": 0.5596033191661607, "grad_norm": 0.2640772759914398, "learning_rate": 0.00016392749373412253, "loss": 0.2297, "step": 2765 }, { "epoch": 0.559805707346691, "grad_norm": 0.3010108470916748, 
"learning_rate": 0.00016390303149063128, "loss": 0.236, "step": 2766 }, { "epoch": 0.5600080955272212, "grad_norm": 0.3347938358783722, "learning_rate": 0.0001638785627821018, "loss": 0.2321, "step": 2767 }, { "epoch": 0.5602104837077515, "grad_norm": 0.2746746838092804, "learning_rate": 0.00016385408761100965, "loss": 0.2623, "step": 2768 }, { "epoch": 0.5604128718882817, "grad_norm": 0.31118178367614746, "learning_rate": 0.0001638296059798309, "loss": 0.2429, "step": 2769 }, { "epoch": 0.560615260068812, "grad_norm": 0.3054017126560211, "learning_rate": 0.00016380511789104238, "loss": 0.2455, "step": 2770 }, { "epoch": 0.5608176482493422, "grad_norm": 0.28247079253196716, "learning_rate": 0.00016378062334712157, "loss": 0.2251, "step": 2771 }, { "epoch": 0.5610200364298725, "grad_norm": 0.2857528030872345, "learning_rate": 0.0001637561223505465, "loss": 0.2441, "step": 2772 }, { "epoch": 0.5612224246104027, "grad_norm": 0.31612929701805115, "learning_rate": 0.00016373161490379595, "loss": 0.266, "step": 2773 }, { "epoch": 0.561424812790933, "grad_norm": 0.2727610766887665, "learning_rate": 0.00016370710100934935, "loss": 0.2511, "step": 2774 }, { "epoch": 0.5616272009714632, "grad_norm": 0.34806764125823975, "learning_rate": 0.00016368258066968671, "loss": 0.2678, "step": 2775 }, { "epoch": 0.5618295891519935, "grad_norm": 0.4355735778808594, "learning_rate": 0.00016365805388728882, "loss": 0.2785, "step": 2776 }, { "epoch": 0.5620319773325237, "grad_norm": 0.2966254651546478, "learning_rate": 0.00016363352066463694, "loss": 0.2023, "step": 2777 }, { "epoch": 0.5622343655130541, "grad_norm": 0.3531200587749481, "learning_rate": 0.00016360898100421318, "loss": 0.2288, "step": 2778 }, { "epoch": 0.5624367536935843, "grad_norm": 0.24990540742874146, "learning_rate": 0.00016358443490850015, "loss": 0.2125, "step": 2779 }, { "epoch": 0.5626391418741146, "grad_norm": 0.31871917843818665, "learning_rate": 0.00016355988237998117, "loss": 0.2483, "step": 2780 }, { 
"epoch": 0.5628415300546448, "grad_norm": 0.3779838979244232, "learning_rate": 0.0001635353234211402, "loss": 0.2513, "step": 2781 }, { "epoch": 0.5630439182351751, "grad_norm": 0.31013843417167664, "learning_rate": 0.00016351075803446193, "loss": 0.2343, "step": 2782 }, { "epoch": 0.5632463064157053, "grad_norm": 0.2897894084453583, "learning_rate": 0.00016348618622243156, "loss": 0.2415, "step": 2783 }, { "epoch": 0.5634486945962356, "grad_norm": 0.29897069931030273, "learning_rate": 0.000163461607987535, "loss": 0.237, "step": 2784 }, { "epoch": 0.5636510827767658, "grad_norm": 0.31116607785224915, "learning_rate": 0.00016343702333225881, "loss": 0.2673, "step": 2785 }, { "epoch": 0.5638534709572961, "grad_norm": 0.2632872760295868, "learning_rate": 0.0001634124322590903, "loss": 0.206, "step": 2786 }, { "epoch": 0.5640558591378263, "grad_norm": 0.4221595525741577, "learning_rate": 0.00016338783477051722, "loss": 0.2548, "step": 2787 }, { "epoch": 0.5642582473183566, "grad_norm": 0.41988351941108704, "learning_rate": 0.00016336323086902816, "loss": 0.3041, "step": 2788 }, { "epoch": 0.5644606354988869, "grad_norm": 0.3068259060382843, "learning_rate": 0.00016333862055711228, "loss": 0.2578, "step": 2789 }, { "epoch": 0.5646630236794171, "grad_norm": 0.41189467906951904, "learning_rate": 0.00016331400383725936, "loss": 0.2482, "step": 2790 }, { "epoch": 0.5648654118599474, "grad_norm": 0.2784172594547272, "learning_rate": 0.00016328938071195984, "loss": 0.2306, "step": 2791 }, { "epoch": 0.5650678000404776, "grad_norm": 0.3234933614730835, "learning_rate": 0.00016326475118370493, "loss": 0.2457, "step": 2792 }, { "epoch": 0.5652701882210079, "grad_norm": 0.3366439938545227, "learning_rate": 0.0001632401152549863, "loss": 0.2598, "step": 2793 }, { "epoch": 0.5654725764015381, "grad_norm": 1.26985502243042, "learning_rate": 0.00016321547292829638, "loss": 0.2393, "step": 2794 }, { "epoch": 0.5656749645820685, "grad_norm": 0.28081372380256653, "learning_rate": 
0.00016319082420612825, "loss": 0.256, "step": 2795 }, { "epoch": 0.5658773527625987, "grad_norm": 0.31037789583206177, "learning_rate": 0.00016316616909097553, "loss": 0.2431, "step": 2796 }, { "epoch": 0.566079740943129, "grad_norm": 0.2816120684146881, "learning_rate": 0.00016314150758533265, "loss": 0.2261, "step": 2797 }, { "epoch": 0.5662821291236592, "grad_norm": 0.4624975621700287, "learning_rate": 0.00016311683969169453, "loss": 0.2336, "step": 2798 }, { "epoch": 0.5664845173041895, "grad_norm": 0.28248104453086853, "learning_rate": 0.00016309216541255688, "loss": 0.2206, "step": 2799 }, { "epoch": 0.5666869054847197, "grad_norm": 0.36865583062171936, "learning_rate": 0.00016306748475041594, "loss": 0.2809, "step": 2800 }, { "epoch": 0.5666869054847197, "eval_loss": 0.269897997379303, "eval_runtime": 0.7397, "eval_samples_per_second": 6.759, "eval_steps_per_second": 1.352, "step": 2800 }, { "epoch": 0.56688929366525, "grad_norm": 0.339393675327301, "learning_rate": 0.00016304279770776867, "loss": 0.2045, "step": 2801 }, { "epoch": 0.5670916818457802, "grad_norm": 0.38487958908081055, "learning_rate": 0.00016301810428711263, "loss": 0.2546, "step": 2802 }, { "epoch": 0.5672940700263105, "grad_norm": 0.39645740389823914, "learning_rate": 0.00016299340449094603, "loss": 0.2231, "step": 2803 }, { "epoch": 0.5674964582068407, "grad_norm": 0.26686328649520874, "learning_rate": 0.00016296869832176772, "loss": 0.2263, "step": 2804 }, { "epoch": 0.567698846387371, "grad_norm": 0.3659006655216217, "learning_rate": 0.00016294398578207728, "loss": 0.2385, "step": 2805 }, { "epoch": 0.5679012345679012, "grad_norm": 0.36053726077079773, "learning_rate": 0.0001629192668743748, "loss": 0.2548, "step": 2806 }, { "epoch": 0.5681036227484315, "grad_norm": 0.29510796070098877, "learning_rate": 0.0001628945416011611, "loss": 0.2512, "step": 2807 }, { "epoch": 0.5683060109289617, "grad_norm": 0.3900874853134155, "learning_rate": 0.00016286980996493762, "loss": 0.3024, "step": 
2808 }, { "epoch": 0.568508399109492, "grad_norm": 0.2818908393383026, "learning_rate": 0.00016284507196820646, "loss": 0.1875, "step": 2809 }, { "epoch": 0.5687107872900222, "grad_norm": 0.25959712266921997, "learning_rate": 0.00016282032761347036, "loss": 0.2116, "step": 2810 }, { "epoch": 0.5689131754705525, "grad_norm": 0.47624823451042175, "learning_rate": 0.00016279557690323268, "loss": 0.2554, "step": 2811 }, { "epoch": 0.5691155636510827, "grad_norm": 0.27003055810928345, "learning_rate": 0.0001627708198399974, "loss": 0.2458, "step": 2812 }, { "epoch": 0.569317951831613, "grad_norm": 0.34233883023262024, "learning_rate": 0.00016274605642626925, "loss": 0.2444, "step": 2813 }, { "epoch": 0.5695203400121432, "grad_norm": 0.31025639176368713, "learning_rate": 0.00016272128666455348, "loss": 0.2504, "step": 2814 }, { "epoch": 0.5697227281926736, "grad_norm": 0.3257082402706146, "learning_rate": 0.00016269651055735604, "loss": 0.2437, "step": 2815 }, { "epoch": 0.5699251163732038, "grad_norm": 0.4323844313621521, "learning_rate": 0.0001626717281071836, "loss": 0.2304, "step": 2816 }, { "epoch": 0.5701275045537341, "grad_norm": 0.3516029119491577, "learning_rate": 0.00016264693931654324, "loss": 0.2285, "step": 2817 }, { "epoch": 0.5703298927342643, "grad_norm": 0.3017370104789734, "learning_rate": 0.00016262214418794293, "loss": 0.2512, "step": 2818 }, { "epoch": 0.5705322809147946, "grad_norm": 0.33841025829315186, "learning_rate": 0.00016259734272389115, "loss": 0.271, "step": 2819 }, { "epoch": 0.5707346690953249, "grad_norm": 0.43659186363220215, "learning_rate": 0.0001625725349268971, "loss": 0.2231, "step": 2820 }, { "epoch": 0.5709370572758551, "grad_norm": 0.28855910897254944, "learning_rate": 0.00016254772079947046, "loss": 0.2396, "step": 2821 }, { "epoch": 0.5711394454563854, "grad_norm": 0.25596854090690613, "learning_rate": 0.00016252290034412175, "loss": 0.2379, "step": 2822 }, { "epoch": 0.5713418336369156, "grad_norm": 0.31578749418258667, 
"learning_rate": 0.0001624980735633621, "loss": 0.2174, "step": 2823 }, { "epoch": 0.5715442218174459, "grad_norm": 0.35793131589889526, "learning_rate": 0.0001624732404597031, "loss": 0.2461, "step": 2824 }, { "epoch": 0.5717466099979761, "grad_norm": 0.2687852084636688, "learning_rate": 0.00016244840103565714, "loss": 0.2315, "step": 2825 }, { "epoch": 0.5719489981785064, "grad_norm": 0.5443539023399353, "learning_rate": 0.00016242355529373726, "loss": 0.2681, "step": 2826 }, { "epoch": 0.5721513863590366, "grad_norm": 0.3889239430427551, "learning_rate": 0.00016239870323645706, "loss": 0.2648, "step": 2827 }, { "epoch": 0.5723537745395669, "grad_norm": 0.2995070815086365, "learning_rate": 0.00016237384486633078, "loss": 0.2608, "step": 2828 }, { "epoch": 0.5725561627200971, "grad_norm": 0.31988614797592163, "learning_rate": 0.00016234898018587337, "loss": 0.2706, "step": 2829 }, { "epoch": 0.5727585509006274, "grad_norm": 0.3469448983669281, "learning_rate": 0.00016232410919760036, "loss": 0.2828, "step": 2830 }, { "epoch": 0.5729609390811576, "grad_norm": 0.438338965177536, "learning_rate": 0.00016229923190402796, "loss": 0.2606, "step": 2831 }, { "epoch": 0.573163327261688, "grad_norm": 0.25825679302215576, "learning_rate": 0.00016227434830767294, "loss": 0.2145, "step": 2832 }, { "epoch": 0.5733657154422181, "grad_norm": 0.2764333486557007, "learning_rate": 0.00016224945841105282, "loss": 0.2257, "step": 2833 }, { "epoch": 0.5735681036227485, "grad_norm": 0.3371870219707489, "learning_rate": 0.00016222456221668568, "loss": 0.2526, "step": 2834 }, { "epoch": 0.5737704918032787, "grad_norm": 0.3069702088832855, "learning_rate": 0.00016219965972709023, "loss": 0.2591, "step": 2835 }, { "epoch": 0.573972879983809, "grad_norm": 0.33921316266059875, "learning_rate": 0.00016217475094478586, "loss": 0.2205, "step": 2836 }, { "epoch": 0.5741752681643392, "grad_norm": 0.3128065764904022, "learning_rate": 0.0001621498358722926, "loss": 0.2617, "step": 2837 }, { "epoch": 
0.5743776563448695, "grad_norm": 0.32519394159317017, "learning_rate": 0.0001621249145121311, "loss": 0.2146, "step": 2838 }, { "epoch": 0.5745800445253997, "grad_norm": 0.32722973823547363, "learning_rate": 0.00016209998686682258, "loss": 0.2562, "step": 2839 }, { "epoch": 0.57478243270593, "grad_norm": 0.2931281328201294, "learning_rate": 0.00016207505293888903, "loss": 0.204, "step": 2840 }, { "epoch": 0.5749848208864602, "grad_norm": 0.4192622900009155, "learning_rate": 0.00016205011273085293, "loss": 0.2562, "step": 2841 }, { "epoch": 0.5751872090669905, "grad_norm": 0.37887004017829895, "learning_rate": 0.00016202516624523754, "loss": 0.2936, "step": 2842 }, { "epoch": 0.5753895972475207, "grad_norm": 0.3006739616394043, "learning_rate": 0.00016200021348456662, "loss": 0.2494, "step": 2843 }, { "epoch": 0.575591985428051, "grad_norm": 0.3014276623725891, "learning_rate": 0.00016197525445136468, "loss": 0.2633, "step": 2844 }, { "epoch": 0.5757943736085812, "grad_norm": 0.29328301548957825, "learning_rate": 0.00016195028914815679, "loss": 0.2494, "step": 2845 }, { "epoch": 0.5759967617891115, "grad_norm": 0.30112847685813904, "learning_rate": 0.00016192531757746868, "loss": 0.2129, "step": 2846 }, { "epoch": 0.5761991499696417, "grad_norm": 0.3580033779144287, "learning_rate": 0.0001619003397418267, "loss": 0.2478, "step": 2847 }, { "epoch": 0.576401538150172, "grad_norm": 0.34292858839035034, "learning_rate": 0.0001618753556437579, "loss": 0.2374, "step": 2848 }, { "epoch": 0.5766039263307022, "grad_norm": 0.4073190689086914, "learning_rate": 0.00016185036528578985, "loss": 0.2558, "step": 2849 }, { "epoch": 0.5768063145112325, "grad_norm": 0.355729341506958, "learning_rate": 0.00016182536867045082, "loss": 0.2413, "step": 2850 }, { "epoch": 0.5768063145112325, "eval_loss": 0.27183297276496887, "eval_runtime": 0.7381, "eval_samples_per_second": 6.775, "eval_steps_per_second": 1.355, "step": 2850 }, { "epoch": 0.5770087026917629, "grad_norm": 
0.4269033670425415, "learning_rate": 0.00016180036580026972, "loss": 0.2233, "step": 2851 }, { "epoch": 0.577211090872293, "grad_norm": 0.284562885761261, "learning_rate": 0.00016177535667777603, "loss": 0.2212, "step": 2852 }, { "epoch": 0.5774134790528234, "grad_norm": 0.3404622972011566, "learning_rate": 0.00016175034130550003, "loss": 0.2369, "step": 2853 }, { "epoch": 0.5776158672333536, "grad_norm": 0.37222614884376526, "learning_rate": 0.00016172531968597234, "loss": 0.2291, "step": 2854 }, { "epoch": 0.5778182554138839, "grad_norm": 0.37149935960769653, "learning_rate": 0.00016170029182172458, "loss": 0.2321, "step": 2855 }, { "epoch": 0.5780206435944141, "grad_norm": 0.2935652732849121, "learning_rate": 0.00016167525771528863, "loss": 0.2377, "step": 2856 }, { "epoch": 0.5782230317749444, "grad_norm": 0.3011722266674042, "learning_rate": 0.0001616502173691973, "loss": 0.238, "step": 2857 }, { "epoch": 0.5784254199554746, "grad_norm": 0.24494045972824097, "learning_rate": 0.00016162517078598384, "loss": 0.2217, "step": 2858 }, { "epoch": 0.5786278081360049, "grad_norm": 0.3266785740852356, "learning_rate": 0.00016160011796818223, "loss": 0.2459, "step": 2859 }, { "epoch": 0.5788301963165351, "grad_norm": 0.42591631412506104, "learning_rate": 0.00016157505891832707, "loss": 0.2655, "step": 2860 }, { "epoch": 0.5790325844970654, "grad_norm": 0.33396992087364197, "learning_rate": 0.00016154999363895354, "loss": 0.2523, "step": 2861 }, { "epoch": 0.5792349726775956, "grad_norm": 0.3604094088077545, "learning_rate": 0.0001615249221325975, "loss": 0.2697, "step": 2862 }, { "epoch": 0.5794373608581259, "grad_norm": 0.3391878306865692, "learning_rate": 0.00016149984440179537, "loss": 0.2264, "step": 2863 }, { "epoch": 0.5796397490386561, "grad_norm": 0.2555520832538605, "learning_rate": 0.0001614747604490843, "loss": 0.223, "step": 2864 }, { "epoch": 0.5798421372191864, "grad_norm": 0.30228936672210693, "learning_rate": 0.00016144967027700204, "loss": 0.2645, 
"step": 2865 }, { "epoch": 0.5800445253997166, "grad_norm": 0.5504426956176758, "learning_rate": 0.00016142457388808695, "loss": 0.2548, "step": 2866 }, { "epoch": 0.5802469135802469, "grad_norm": 0.324349582195282, "learning_rate": 0.00016139947128487796, "loss": 0.2272, "step": 2867 }, { "epoch": 0.5804493017607771, "grad_norm": 0.29885464906692505, "learning_rate": 0.0001613743624699147, "loss": 0.2381, "step": 2868 }, { "epoch": 0.5806516899413074, "grad_norm": 0.2752251625061035, "learning_rate": 0.00016134924744573746, "loss": 0.2473, "step": 2869 }, { "epoch": 0.5808540781218376, "grad_norm": 0.40981918573379517, "learning_rate": 0.0001613241262148871, "loss": 0.2638, "step": 2870 }, { "epoch": 0.581056466302368, "grad_norm": 0.367501437664032, "learning_rate": 0.00016129899877990512, "loss": 0.254, "step": 2871 }, { "epoch": 0.5812588544828982, "grad_norm": 0.36186274886131287, "learning_rate": 0.00016127386514333368, "loss": 0.291, "step": 2872 }, { "epoch": 0.5814612426634285, "grad_norm": 0.30635300278663635, "learning_rate": 0.00016124872530771546, "loss": 0.24, "step": 2873 }, { "epoch": 0.5816636308439587, "grad_norm": 0.3640815019607544, "learning_rate": 0.00016122357927559388, "loss": 0.2207, "step": 2874 }, { "epoch": 0.581866019024489, "grad_norm": 0.37263694405555725, "learning_rate": 0.00016119842704951298, "loss": 0.2656, "step": 2875 }, { "epoch": 0.5820684072050192, "grad_norm": 0.34229201078414917, "learning_rate": 0.00016117326863201737, "loss": 0.27, "step": 2876 }, { "epoch": 0.5822707953855495, "grad_norm": 0.3659871518611908, "learning_rate": 0.00016114810402565236, "loss": 0.2576, "step": 2877 }, { "epoch": 0.5824731835660797, "grad_norm": 0.2683655619621277, "learning_rate": 0.00016112293323296376, "loss": 0.2232, "step": 2878 }, { "epoch": 0.58267557174661, "grad_norm": 0.2627432644367218, "learning_rate": 0.00016109775625649815, "loss": 0.2384, "step": 2879 }, { "epoch": 0.5828779599271403, "grad_norm": 0.5119886994361877, 
"learning_rate": 0.0001610725730988026, "loss": 0.2156, "step": 2880 }, { "epoch": 0.5830803481076705, "grad_norm": 0.3302849531173706, "learning_rate": 0.00016104738376242501, "loss": 0.211, "step": 2881 }, { "epoch": 0.5832827362882008, "grad_norm": 0.34468814730644226, "learning_rate": 0.00016102218824991365, "loss": 0.2415, "step": 2882 }, { "epoch": 0.583485124468731, "grad_norm": 0.2802000045776367, "learning_rate": 0.00016099698656381762, "loss": 0.2105, "step": 2883 }, { "epoch": 0.5836875126492613, "grad_norm": 0.3089223802089691, "learning_rate": 0.0001609717787066865, "loss": 0.2407, "step": 2884 }, { "epoch": 0.5838899008297915, "grad_norm": 0.3115883767604828, "learning_rate": 0.00016094656468107057, "loss": 0.2526, "step": 2885 }, { "epoch": 0.5840922890103218, "grad_norm": 0.35136011242866516, "learning_rate": 0.00016092134448952074, "loss": 0.2089, "step": 2886 }, { "epoch": 0.584294677190852, "grad_norm": 0.5641045570373535, "learning_rate": 0.0001608961181345885, "loss": 0.2535, "step": 2887 }, { "epoch": 0.5844970653713824, "grad_norm": 0.44940614700317383, "learning_rate": 0.00016087088561882605, "loss": 0.2245, "step": 2888 }, { "epoch": 0.5846994535519126, "grad_norm": 0.26401445269584656, "learning_rate": 0.00016084564694478605, "loss": 0.2194, "step": 2889 }, { "epoch": 0.5849018417324429, "grad_norm": 0.28557443618774414, "learning_rate": 0.00016082040211502197, "loss": 0.2326, "step": 2890 }, { "epoch": 0.5851042299129731, "grad_norm": 0.3739382028579712, "learning_rate": 0.00016079515113208776, "loss": 0.2574, "step": 2891 }, { "epoch": 0.5853066180935034, "grad_norm": 0.40503665804862976, "learning_rate": 0.00016076989399853807, "loss": 0.2534, "step": 2892 }, { "epoch": 0.5855090062740336, "grad_norm": 0.34489211440086365, "learning_rate": 0.0001607446307169282, "loss": 0.261, "step": 2893 }, { "epoch": 0.5857113944545639, "grad_norm": 0.4603923261165619, "learning_rate": 0.00016071936128981396, "loss": 0.248, "step": 2894 }, { "epoch": 
0.5859137826350941, "grad_norm": 0.45358771085739136, "learning_rate": 0.00016069408571975187, "loss": 0.2285, "step": 2895 }, { "epoch": 0.5861161708156244, "grad_norm": 0.34958937764167786, "learning_rate": 0.000160668804009299, "loss": 0.2142, "step": 2896 }, { "epoch": 0.5863185589961546, "grad_norm": 0.40218639373779297, "learning_rate": 0.00016064351616101318, "loss": 0.2736, "step": 2897 }, { "epoch": 0.5865209471766849, "grad_norm": 0.2853103280067444, "learning_rate": 0.0001606182221774527, "loss": 0.2249, "step": 2898 }, { "epoch": 0.5867233353572151, "grad_norm": 0.35319843888282776, "learning_rate": 0.00016059292206117655, "loss": 0.2332, "step": 2899 }, { "epoch": 0.5869257235377454, "grad_norm": 0.3640676736831665, "learning_rate": 0.00016056761581474438, "loss": 0.2451, "step": 2900 }, { "epoch": 0.5869257235377454, "eval_loss": 0.2811625003814697, "eval_runtime": 0.7382, "eval_samples_per_second": 6.774, "eval_steps_per_second": 1.355, "step": 2900 }, { "epoch": 0.5871281117182756, "grad_norm": 0.426506370306015, "learning_rate": 0.00016054230344071636, "loss": 0.2452, "step": 2901 }, { "epoch": 0.5873304998988059, "grad_norm": 0.4590941071510315, "learning_rate": 0.0001605169849416533, "loss": 0.2189, "step": 2902 }, { "epoch": 0.5875328880793361, "grad_norm": 0.2869848310947418, "learning_rate": 0.00016049166032011672, "loss": 0.2412, "step": 2903 }, { "epoch": 0.5877352762598664, "grad_norm": 0.3106136918067932, "learning_rate": 0.0001604663295786687, "loss": 0.2225, "step": 2904 }, { "epoch": 0.5879376644403966, "grad_norm": 0.3410221040248871, "learning_rate": 0.0001604409927198719, "loss": 0.2274, "step": 2905 }, { "epoch": 0.588140052620927, "grad_norm": 0.3309195935726166, "learning_rate": 0.0001604156497462897, "loss": 0.2532, "step": 2906 }, { "epoch": 0.5883424408014571, "grad_norm": 0.31511515378952026, "learning_rate": 0.00016039030066048593, "loss": 0.2601, "step": 2907 }, { "epoch": 0.5885448289819875, "grad_norm": 0.3180387318134308, 
"learning_rate": 0.00016036494546502525, "loss": 0.2318, "step": 2908 }, { "epoch": 0.5887472171625177, "grad_norm": 0.3602931797504425, "learning_rate": 0.00016033958416247277, "loss": 0.3103, "step": 2909 }, { "epoch": 0.588949605343048, "grad_norm": 0.3041614294052124, "learning_rate": 0.00016031421675539428, "loss": 0.2982, "step": 2910 }, { "epoch": 0.5891519935235783, "grad_norm": 0.35607126355171204, "learning_rate": 0.00016028884324635625, "loss": 0.2401, "step": 2911 }, { "epoch": 0.5893543817041085, "grad_norm": 0.37263765931129456, "learning_rate": 0.00016026346363792567, "loss": 0.2691, "step": 2912 }, { "epoch": 0.5895567698846388, "grad_norm": 0.315032422542572, "learning_rate": 0.00016023807793267013, "loss": 0.2549, "step": 2913 }, { "epoch": 0.589759158065169, "grad_norm": 0.35683658719062805, "learning_rate": 0.00016021268613315796, "loss": 0.25, "step": 2914 }, { "epoch": 0.5899615462456993, "grad_norm": 0.3754711449146271, "learning_rate": 0.000160187288241958, "loss": 0.2467, "step": 2915 }, { "epoch": 0.5901639344262295, "grad_norm": 0.31732767820358276, "learning_rate": 0.0001601618842616398, "loss": 0.2295, "step": 2916 }, { "epoch": 0.5903663226067598, "grad_norm": 0.2917217016220093, "learning_rate": 0.00016013647419477339, "loss": 0.2308, "step": 2917 }, { "epoch": 0.59056871078729, "grad_norm": 0.31983745098114014, "learning_rate": 0.00016011105804392953, "loss": 0.2232, "step": 2918 }, { "epoch": 0.5907710989678203, "grad_norm": 0.2851954698562622, "learning_rate": 0.00016008563581167953, "loss": 0.2266, "step": 2919 }, { "epoch": 0.5909734871483505, "grad_norm": 0.38204529881477356, "learning_rate": 0.00016006020750059538, "loss": 0.22, "step": 2920 }, { "epoch": 0.5911758753288808, "grad_norm": 0.2778181731700897, "learning_rate": 0.00016003477311324964, "loss": 0.2385, "step": 2921 }, { "epoch": 0.591378263509411, "grad_norm": 0.3321495056152344, "learning_rate": 0.0001600093326522155, "loss": 0.2163, "step": 2922 }, { "epoch": 
0.5915806516899413, "grad_norm": 0.3323008418083191, "learning_rate": 0.00015998388612006677, "loss": 0.2798, "step": 2923 }, { "epoch": 0.5917830398704715, "grad_norm": 0.28340160846710205, "learning_rate": 0.00015995843351937781, "loss": 0.219, "step": 2924 }, { "epoch": 0.5919854280510018, "grad_norm": 0.6043773293495178, "learning_rate": 0.00015993297485272372, "loss": 0.2479, "step": 2925 }, { "epoch": 0.592187816231532, "grad_norm": 0.5740264058113098, "learning_rate": 0.00015990751012268009, "loss": 0.2423, "step": 2926 }, { "epoch": 0.5923902044120624, "grad_norm": 0.2924440801143646, "learning_rate": 0.00015988203933182317, "loss": 0.2033, "step": 2927 }, { "epoch": 0.5925925925925926, "grad_norm": 0.2408752590417862, "learning_rate": 0.00015985656248272988, "loss": 0.186, "step": 2928 }, { "epoch": 0.5927949807731229, "grad_norm": 0.3335459232330322, "learning_rate": 0.00015983107957797765, "loss": 0.2791, "step": 2929 }, { "epoch": 0.5929973689536531, "grad_norm": 0.31838342547416687, "learning_rate": 0.0001598055906201446, "loss": 0.228, "step": 2930 }, { "epoch": 0.5931997571341834, "grad_norm": 0.37246522307395935, "learning_rate": 0.0001597800956118094, "loss": 0.2954, "step": 2931 }, { "epoch": 0.5934021453147136, "grad_norm": 0.2864042818546295, "learning_rate": 0.0001597545945555514, "loss": 0.2385, "step": 2932 }, { "epoch": 0.5936045334952439, "grad_norm": 0.30606046319007874, "learning_rate": 0.00015972908745395052, "loss": 0.2636, "step": 2933 }, { "epoch": 0.5938069216757741, "grad_norm": 0.3238597512245178, "learning_rate": 0.0001597035743095873, "loss": 0.2839, "step": 2934 }, { "epoch": 0.5940093098563044, "grad_norm": 0.2814807593822479, "learning_rate": 0.0001596780551250429, "loss": 0.2309, "step": 2935 }, { "epoch": 0.5942116980368346, "grad_norm": 0.33480608463287354, "learning_rate": 0.00015965252990289908, "loss": 0.2248, "step": 2936 }, { "epoch": 0.5944140862173649, "grad_norm": 0.5626258254051208, "learning_rate": 
0.0001596269986457382, "loss": 0.2519, "step": 2937 }, { "epoch": 0.5946164743978951, "grad_norm": 0.32387876510620117, "learning_rate": 0.00015960146135614328, "loss": 0.2625, "step": 2938 }, { "epoch": 0.5948188625784254, "grad_norm": 0.2990744113922119, "learning_rate": 0.00015957591803669784, "loss": 0.2248, "step": 2939 }, { "epoch": 0.5950212507589556, "grad_norm": 0.28303173184394836, "learning_rate": 0.00015955036868998618, "loss": 0.2518, "step": 2940 }, { "epoch": 0.5952236389394859, "grad_norm": 0.5194461941719055, "learning_rate": 0.00015952481331859306, "loss": 0.2418, "step": 2941 }, { "epoch": 0.5954260271200162, "grad_norm": 0.3277747929096222, "learning_rate": 0.00015949925192510392, "loss": 0.2527, "step": 2942 }, { "epoch": 0.5956284153005464, "grad_norm": 0.3047173321247101, "learning_rate": 0.00015947368451210478, "loss": 0.2458, "step": 2943 }, { "epoch": 0.5958308034810768, "grad_norm": 0.27463477849960327, "learning_rate": 0.0001594481110821823, "loss": 0.2188, "step": 2944 }, { "epoch": 0.596033191661607, "grad_norm": 0.2871634364128113, "learning_rate": 0.00015942253163792373, "loss": 0.2431, "step": 2945 }, { "epoch": 0.5962355798421373, "grad_norm": 0.28732845187187195, "learning_rate": 0.0001593969461819169, "loss": 0.1976, "step": 2946 }, { "epoch": 0.5964379680226675, "grad_norm": 0.3001699447631836, "learning_rate": 0.0001593713547167503, "loss": 0.2232, "step": 2947 }, { "epoch": 0.5966403562031978, "grad_norm": 0.819691002368927, "learning_rate": 0.000159345757245013, "loss": 0.2685, "step": 2948 }, { "epoch": 0.596842744383728, "grad_norm": 0.28296270966529846, "learning_rate": 0.00015932015376929475, "loss": 0.249, "step": 2949 }, { "epoch": 0.5970451325642583, "grad_norm": 0.3961198627948761, "learning_rate": 0.00015929454429218574, "loss": 0.2562, "step": 2950 }, { "epoch": 0.5970451325642583, "eval_loss": 0.2750966250896454, "eval_runtime": 0.7417, "eval_samples_per_second": 6.741, "eval_steps_per_second": 1.348, "step": 2950 
}, { "epoch": 0.5972475207447885, "grad_norm": 0.33126339316368103, "learning_rate": 0.00015926892881627688, "loss": 0.2809, "step": 2951 }, { "epoch": 0.5974499089253188, "grad_norm": 0.3269531726837158, "learning_rate": 0.00015924330734415975, "loss": 0.2657, "step": 2952 }, { "epoch": 0.597652297105849, "grad_norm": 0.29826268553733826, "learning_rate": 0.0001592176798784264, "loss": 0.2303, "step": 2953 }, { "epoch": 0.5978546852863793, "grad_norm": 0.29660069942474365, "learning_rate": 0.00015919204642166954, "loss": 0.2494, "step": 2954 }, { "epoch": 0.5980570734669095, "grad_norm": 0.49443933367729187, "learning_rate": 0.00015916640697648254, "loss": 0.2448, "step": 2955 }, { "epoch": 0.5982594616474398, "grad_norm": 0.27439481019973755, "learning_rate": 0.00015914076154545931, "loss": 0.2036, "step": 2956 }, { "epoch": 0.59846184982797, "grad_norm": 0.29085221886634827, "learning_rate": 0.00015911511013119438, "loss": 0.2785, "step": 2957 }, { "epoch": 0.5986642380085003, "grad_norm": 0.2833772599697113, "learning_rate": 0.0001590894527362829, "loss": 0.205, "step": 2958 }, { "epoch": 0.5988666261890305, "grad_norm": 0.3200933039188385, "learning_rate": 0.00015906378936332062, "loss": 0.2875, "step": 2959 }, { "epoch": 0.5990690143695608, "grad_norm": 0.28993549942970276, "learning_rate": 0.00015903812001490385, "loss": 0.2417, "step": 2960 }, { "epoch": 0.599271402550091, "grad_norm": 0.2662200927734375, "learning_rate": 0.00015901244469362962, "loss": 0.2315, "step": 2961 }, { "epoch": 0.5994737907306213, "grad_norm": 0.26462310552597046, "learning_rate": 0.0001589867634020954, "loss": 0.2452, "step": 2962 }, { "epoch": 0.5996761789111515, "grad_norm": 0.3398553431034088, "learning_rate": 0.00015896107614289944, "loss": 0.2189, "step": 2963 }, { "epoch": 0.5998785670916819, "grad_norm": 0.49576640129089355, "learning_rate": 0.00015893538291864045, "loss": 0.279, "step": 2964 }, { "epoch": 0.6000809552722121, "grad_norm": 0.2650597095489502, 
"learning_rate": 0.0001589096837319178, "loss": 0.2216, "step": 2965 }, { "epoch": 0.6002833434527424, "grad_norm": 0.2608364522457123, "learning_rate": 0.00015888397858533152, "loss": 0.2431, "step": 2966 }, { "epoch": 0.6004857316332726, "grad_norm": 0.29076239466667175, "learning_rate": 0.00015885826748148212, "loss": 0.2963, "step": 2967 }, { "epoch": 0.6006881198138029, "grad_norm": 0.2960323095321655, "learning_rate": 0.0001588325504229708, "loss": 0.2443, "step": 2968 }, { "epoch": 0.6008905079943331, "grad_norm": 0.2616303265094757, "learning_rate": 0.0001588068274123994, "loss": 0.2154, "step": 2969 }, { "epoch": 0.6010928961748634, "grad_norm": 0.3281814157962799, "learning_rate": 0.00015878109845237018, "loss": 0.2384, "step": 2970 }, { "epoch": 0.6012952843553937, "grad_norm": 0.32332029938697815, "learning_rate": 0.00015875536354548628, "loss": 0.2669, "step": 2971 }, { "epoch": 0.6014976725359239, "grad_norm": 0.3212827742099762, "learning_rate": 0.0001587296226943512, "loss": 0.2387, "step": 2972 }, { "epoch": 0.6017000607164542, "grad_norm": 0.38467156887054443, "learning_rate": 0.0001587038759015691, "loss": 0.2722, "step": 2973 }, { "epoch": 0.6019024488969844, "grad_norm": 0.3750758767127991, "learning_rate": 0.00015867812316974482, "loss": 0.2462, "step": 2974 }, { "epoch": 0.6021048370775147, "grad_norm": 0.3365626931190491, "learning_rate": 0.00015865236450148372, "loss": 0.2425, "step": 2975 }, { "epoch": 0.6023072252580449, "grad_norm": 0.23194032907485962, "learning_rate": 0.00015862659989939184, "loss": 0.1905, "step": 2976 }, { "epoch": 0.6025096134385752, "grad_norm": 0.31470999121665955, "learning_rate": 0.00015860082936607574, "loss": 0.2203, "step": 2977 }, { "epoch": 0.6027120016191054, "grad_norm": 0.2919837534427643, "learning_rate": 0.00015857505290414262, "loss": 0.2329, "step": 2978 }, { "epoch": 0.6029143897996357, "grad_norm": 0.3553715944290161, "learning_rate": 0.00015854927051620025, "loss": 0.2522, "step": 2979 }, { 
"epoch": 0.6031167779801659, "grad_norm": 0.30528631806373596, "learning_rate": 0.00015852348220485706, "loss": 0.2241, "step": 2980 }, { "epoch": 0.6033191661606963, "grad_norm": 0.2882617115974426, "learning_rate": 0.00015849768797272201, "loss": 0.2102, "step": 2981 }, { "epoch": 0.6035215543412265, "grad_norm": 0.33525657653808594, "learning_rate": 0.0001584718878224047, "loss": 0.2158, "step": 2982 }, { "epoch": 0.6037239425217568, "grad_norm": 0.33792588114738464, "learning_rate": 0.00015844608175651534, "loss": 0.2676, "step": 2983 }, { "epoch": 0.603926330702287, "grad_norm": 0.3096262812614441, "learning_rate": 0.0001584202697776647, "loss": 0.2145, "step": 2984 }, { "epoch": 0.6041287188828173, "grad_norm": 0.4227834641933441, "learning_rate": 0.00015839445188846414, "loss": 0.2795, "step": 2985 }, { "epoch": 0.6043311070633475, "grad_norm": 0.32572686672210693, "learning_rate": 0.00015836862809152566, "loss": 0.2603, "step": 2986 }, { "epoch": 0.6045334952438778, "grad_norm": 0.7060582041740417, "learning_rate": 0.0001583427983894618, "loss": 0.2504, "step": 2987 }, { "epoch": 0.604735883424408, "grad_norm": 0.42503786087036133, "learning_rate": 0.00015831696278488587, "loss": 0.2309, "step": 2988 }, { "epoch": 0.6049382716049383, "grad_norm": 0.29326146841049194, "learning_rate": 0.00015829112128041151, "loss": 0.2404, "step": 2989 }, { "epoch": 0.6051406597854685, "grad_norm": 0.2777334749698639, "learning_rate": 0.00015826527387865314, "loss": 0.2759, "step": 2990 }, { "epoch": 0.6053430479659988, "grad_norm": 0.26709550619125366, "learning_rate": 0.00015823942058222574, "loss": 0.2435, "step": 2991 }, { "epoch": 0.605545436146529, "grad_norm": 0.31071168184280396, "learning_rate": 0.00015821356139374487, "loss": 0.2403, "step": 2992 }, { "epoch": 0.6057478243270593, "grad_norm": 0.2868014872074127, "learning_rate": 0.00015818769631582668, "loss": 0.2331, "step": 2993 }, { "epoch": 0.6059502125075895, "grad_norm": 0.32010069489479065, "learning_rate": 
0.00015816182535108796, "loss": 0.239, "step": 2994 }, { "epoch": 0.6061526006881198, "grad_norm": 0.34212884306907654, "learning_rate": 0.000158135948502146, "loss": 0.2375, "step": 2995 }, { "epoch": 0.60635498886865, "grad_norm": 0.27103322744369507, "learning_rate": 0.0001581100657716188, "loss": 0.2451, "step": 2996 }, { "epoch": 0.6065573770491803, "grad_norm": 0.3346523940563202, "learning_rate": 0.00015808417716212488, "loss": 0.253, "step": 2997 }, { "epoch": 0.6067597652297105, "grad_norm": 0.25295400619506836, "learning_rate": 0.00015805828267628338, "loss": 0.2306, "step": 2998 }, { "epoch": 0.6069621534102408, "grad_norm": 0.46044930815696716, "learning_rate": 0.00015803238231671405, "loss": 0.2422, "step": 2999 }, { "epoch": 0.607164541590771, "grad_norm": 0.3438783884048462, "learning_rate": 0.0001580064760860372, "loss": 0.2628, "step": 3000 }, { "epoch": 0.607164541590771, "eval_loss": 0.2774945795536041, "eval_runtime": 0.7407, "eval_samples_per_second": 6.75, "eval_steps_per_second": 1.35, "step": 3000 }, { "epoch": 0.6073669297713014, "grad_norm": 0.2550566494464874, "learning_rate": 0.00015798056398687375, "loss": 0.2401, "step": 3001 }, { "epoch": 0.6075693179518317, "grad_norm": 0.40897125005722046, "learning_rate": 0.0001579546460218452, "loss": 0.2242, "step": 3002 }, { "epoch": 0.6077717061323619, "grad_norm": 0.33101367950439453, "learning_rate": 0.0001579287221935737, "loss": 0.2573, "step": 3003 }, { "epoch": 0.6079740943128922, "grad_norm": 0.3354332745075226, "learning_rate": 0.00015790279250468194, "loss": 0.2868, "step": 3004 }, { "epoch": 0.6081764824934224, "grad_norm": 0.280505508184433, "learning_rate": 0.0001578768569577932, "loss": 0.236, "step": 3005 }, { "epoch": 0.6083788706739527, "grad_norm": 0.3221960663795471, "learning_rate": 0.00015785091555553136, "loss": 0.2703, "step": 3006 }, { "epoch": 0.6085812588544829, "grad_norm": 0.29745444655418396, "learning_rate": 0.00015782496830052089, "loss": 0.2662, "step": 3007 }, { 
"epoch": 0.6087836470350132, "grad_norm": 0.3765595257282257, "learning_rate": 0.00015779901519538688, "loss": 0.29, "step": 3008 }, { "epoch": 0.6089860352155434, "grad_norm": 0.4914434850215912, "learning_rate": 0.00015777305624275502, "loss": 0.2435, "step": 3009 }, { "epoch": 0.6091884233960737, "grad_norm": 0.3370167016983032, "learning_rate": 0.0001577470914452515, "loss": 0.219, "step": 3010 }, { "epoch": 0.6093908115766039, "grad_norm": 0.268723726272583, "learning_rate": 0.0001577211208055032, "loss": 0.232, "step": 3011 }, { "epoch": 0.6095931997571342, "grad_norm": 0.29349520802497864, "learning_rate": 0.0001576951443261376, "loss": 0.2005, "step": 3012 }, { "epoch": 0.6097955879376644, "grad_norm": 0.3591350317001343, "learning_rate": 0.00015766916200978266, "loss": 0.2533, "step": 3013 }, { "epoch": 0.6099979761181947, "grad_norm": 0.38604936003685, "learning_rate": 0.00015764317385906702, "loss": 0.2507, "step": 3014 }, { "epoch": 0.6102003642987249, "grad_norm": 0.3372388780117035, "learning_rate": 0.0001576171798766199, "loss": 0.2661, "step": 3015 }, { "epoch": 0.6104027524792552, "grad_norm": 0.27703356742858887, "learning_rate": 0.0001575911800650711, "loss": 0.2547, "step": 3016 }, { "epoch": 0.6106051406597854, "grad_norm": 0.32858291268348694, "learning_rate": 0.00015756517442705098, "loss": 0.2417, "step": 3017 }, { "epoch": 0.6108075288403157, "grad_norm": 0.30357709527015686, "learning_rate": 0.00015753916296519055, "loss": 0.2563, "step": 3018 }, { "epoch": 0.611009917020846, "grad_norm": 0.354936808347702, "learning_rate": 0.00015751314568212134, "loss": 0.2869, "step": 3019 }, { "epoch": 0.6112123052013763, "grad_norm": 0.30504146218299866, "learning_rate": 0.00015748712258047552, "loss": 0.2704, "step": 3020 }, { "epoch": 0.6114146933819065, "grad_norm": 0.3150479197502136, "learning_rate": 0.0001574610936628859, "loss": 0.2273, "step": 3021 }, { "epoch": 0.6116170815624368, "grad_norm": 0.32766446471214294, "learning_rate": 
0.0001574350589319857, "loss": 0.2693, "step": 3022 }, { "epoch": 0.611819469742967, "grad_norm": 0.2615867257118225, "learning_rate": 0.00015740901839040894, "loss": 0.2343, "step": 3023 }, { "epoch": 0.6120218579234973, "grad_norm": 0.3711775839328766, "learning_rate": 0.00015738297204079005, "loss": 0.2863, "step": 3024 }, { "epoch": 0.6122242461040275, "grad_norm": 0.33009645342826843, "learning_rate": 0.00015735691988576415, "loss": 0.2445, "step": 3025 }, { "epoch": 0.6124266342845578, "grad_norm": 0.39618271589279175, "learning_rate": 0.00015733086192796697, "loss": 0.312, "step": 3026 }, { "epoch": 0.612629022465088, "grad_norm": 0.29640382528305054, "learning_rate": 0.00015730479817003474, "loss": 0.2673, "step": 3027 }, { "epoch": 0.6128314106456183, "grad_norm": 0.2582768499851227, "learning_rate": 0.00015727872861460434, "loss": 0.2052, "step": 3028 }, { "epoch": 0.6130337988261485, "grad_norm": 0.2651136517524719, "learning_rate": 0.00015725265326431315, "loss": 0.2238, "step": 3029 }, { "epoch": 0.6132361870066788, "grad_norm": 0.2757776081562042, "learning_rate": 0.00015722657212179927, "loss": 0.244, "step": 3030 }, { "epoch": 0.613438575187209, "grad_norm": 0.28894442319869995, "learning_rate": 0.00015720048518970132, "loss": 0.2371, "step": 3031 }, { "epoch": 0.6136409633677393, "grad_norm": 0.3005363643169403, "learning_rate": 0.00015717439247065843, "loss": 0.2292, "step": 3032 }, { "epoch": 0.6138433515482696, "grad_norm": 0.2538183629512787, "learning_rate": 0.00015714829396731048, "loss": 0.2208, "step": 3033 }, { "epoch": 0.6140457397287998, "grad_norm": 0.44336000084877014, "learning_rate": 0.00015712218968229777, "loss": 0.2328, "step": 3034 }, { "epoch": 0.6142481279093301, "grad_norm": 0.25125953555107117, "learning_rate": 0.00015709607961826127, "loss": 0.2027, "step": 3035 }, { "epoch": 0.6144505160898603, "grad_norm": 0.26832088828086853, "learning_rate": 0.00015706996377784255, "loss": 0.1998, "step": 3036 }, { "epoch": 
0.6146529042703907, "grad_norm": 0.29760006070137024, "learning_rate": 0.00015704384216368373, "loss": 0.2625, "step": 3037 }, { "epoch": 0.6148552924509209, "grad_norm": 0.3440050482749939, "learning_rate": 0.00015701771477842752, "loss": 0.2569, "step": 3038 }, { "epoch": 0.6150576806314512, "grad_norm": 0.6179642677307129, "learning_rate": 0.0001569915816247172, "loss": 0.2382, "step": 3039 }, { "epoch": 0.6152600688119814, "grad_norm": 0.27009207010269165, "learning_rate": 0.00015696544270519665, "loss": 0.1881, "step": 3040 }, { "epoch": 0.6154624569925117, "grad_norm": 0.37599071860313416, "learning_rate": 0.00015693929802251038, "loss": 0.2555, "step": 3041 }, { "epoch": 0.6156648451730419, "grad_norm": 0.24889563024044037, "learning_rate": 0.00015691314757930336, "loss": 0.2501, "step": 3042 }, { "epoch": 0.6158672333535722, "grad_norm": 0.2929196357727051, "learning_rate": 0.00015688699137822126, "loss": 0.2259, "step": 3043 }, { "epoch": 0.6160696215341024, "grad_norm": 0.40753883123397827, "learning_rate": 0.00015686082942191026, "loss": 0.2045, "step": 3044 }, { "epoch": 0.6162720097146327, "grad_norm": 0.266451358795166, "learning_rate": 0.0001568346617130172, "loss": 0.2347, "step": 3045 }, { "epoch": 0.6164743978951629, "grad_norm": 0.38660845160484314, "learning_rate": 0.0001568084882541894, "loss": 0.2777, "step": 3046 }, { "epoch": 0.6166767860756932, "grad_norm": 0.3965816795825958, "learning_rate": 0.00015678230904807484, "loss": 0.2591, "step": 3047 }, { "epoch": 0.6168791742562234, "grad_norm": 0.2663493752479553, "learning_rate": 0.00015675612409732207, "loss": 0.2057, "step": 3048 }, { "epoch": 0.6170815624367537, "grad_norm": 0.32578763365745544, "learning_rate": 0.00015672993340458023, "loss": 0.2711, "step": 3049 }, { "epoch": 0.6172839506172839, "grad_norm": 0.4756195843219757, "learning_rate": 0.00015670373697249896, "loss": 0.2418, "step": 3050 }, { "epoch": 0.6172839506172839, "eval_loss": 0.27323392033576965, "eval_runtime": 0.7372, 
"eval_samples_per_second": 6.783, "eval_steps_per_second": 1.357, "step": 3050 }, { "epoch": 0.6174863387978142, "grad_norm": 0.34247496724128723, "learning_rate": 0.00015667753480372857, "loss": 0.2175, "step": 3051 }, { "epoch": 0.6176887269783444, "grad_norm": 0.330746591091156, "learning_rate": 0.00015665132690091994, "loss": 0.238, "step": 3052 }, { "epoch": 0.6178911151588747, "grad_norm": 0.3871874511241913, "learning_rate": 0.00015662511326672448, "loss": 0.2575, "step": 3053 }, { "epoch": 0.6180935033394049, "grad_norm": 0.3050987720489502, "learning_rate": 0.00015659889390379417, "loss": 0.2641, "step": 3054 }, { "epoch": 0.6182958915199352, "grad_norm": 0.4178610146045685, "learning_rate": 0.00015657266881478172, "loss": 0.271, "step": 3055 }, { "epoch": 0.6184982797004654, "grad_norm": 0.42879247665405273, "learning_rate": 0.0001565464380023402, "loss": 0.2352, "step": 3056 }, { "epoch": 0.6187006678809958, "grad_norm": 0.27890655398368835, "learning_rate": 0.00015652020146912347, "loss": 0.238, "step": 3057 }, { "epoch": 0.618903056061526, "grad_norm": 0.30058541893959045, "learning_rate": 0.0001564939592177858, "loss": 0.2579, "step": 3058 }, { "epoch": 0.6191054442420563, "grad_norm": 0.31102004647254944, "learning_rate": 0.0001564677112509821, "loss": 0.2746, "step": 3059 }, { "epoch": 0.6193078324225865, "grad_norm": 0.36177563667297363, "learning_rate": 0.00015644145757136792, "loss": 0.2381, "step": 3060 }, { "epoch": 0.6195102206031168, "grad_norm": 0.3283332586288452, "learning_rate": 0.00015641519818159928, "loss": 0.2636, "step": 3061 }, { "epoch": 0.6197126087836471, "grad_norm": 0.34870678186416626, "learning_rate": 0.00015638893308433284, "loss": 0.2404, "step": 3062 }, { "epoch": 0.6199149969641773, "grad_norm": 0.3281346261501312, "learning_rate": 0.00015636266228222584, "loss": 0.26, "step": 3063 }, { "epoch": 0.6201173851447076, "grad_norm": 0.30716973543167114, "learning_rate": 0.00015633638577793607, "loss": 0.2574, "step": 3064 }, { 
"epoch": 0.6203197733252378, "grad_norm": 0.3106836974620819, "learning_rate": 0.0001563101035741219, "loss": 0.2259, "step": 3065 }, { "epoch": 0.6205221615057681, "grad_norm": 0.27169597148895264, "learning_rate": 0.00015628381567344234, "loss": 0.2637, "step": 3066 }, { "epoch": 0.6207245496862983, "grad_norm": 0.39207571744918823, "learning_rate": 0.00015625752207855688, "loss": 0.2649, "step": 3067 }, { "epoch": 0.6209269378668286, "grad_norm": 0.530310869216919, "learning_rate": 0.00015623122279212562, "loss": 0.2235, "step": 3068 }, { "epoch": 0.6211293260473588, "grad_norm": 0.2757643461227417, "learning_rate": 0.00015620491781680927, "loss": 0.2359, "step": 3069 }, { "epoch": 0.6213317142278891, "grad_norm": 0.35047096014022827, "learning_rate": 0.0001561786071552691, "loss": 0.2595, "step": 3070 }, { "epoch": 0.6215341024084193, "grad_norm": 0.31434178352355957, "learning_rate": 0.00015615229081016697, "loss": 0.252, "step": 3071 }, { "epoch": 0.6217364905889496, "grad_norm": 0.2799963653087616, "learning_rate": 0.00015612596878416518, "loss": 0.2675, "step": 3072 }, { "epoch": 0.6219388787694798, "grad_norm": 0.2696954011917114, "learning_rate": 0.00015609964107992684, "loss": 0.2212, "step": 3073 }, { "epoch": 0.6221412669500102, "grad_norm": 0.31906068325042725, "learning_rate": 0.00015607330770011545, "loss": 0.2672, "step": 3074 }, { "epoch": 0.6223436551305404, "grad_norm": 0.3010803461074829, "learning_rate": 0.00015604696864739517, "loss": 0.2167, "step": 3075 }, { "epoch": 0.6225460433110707, "grad_norm": 0.4235513508319855, "learning_rate": 0.00015602062392443066, "loss": 0.2296, "step": 3076 }, { "epoch": 0.6227484314916009, "grad_norm": 0.2939774692058563, "learning_rate": 0.00015599427353388728, "loss": 0.2296, "step": 3077 }, { "epoch": 0.6229508196721312, "grad_norm": 0.2979620695114136, "learning_rate": 0.0001559679174784308, "loss": 0.2394, "step": 3078 }, { "epoch": 0.6231532078526614, "grad_norm": 0.37031883001327515, "learning_rate": 
0.00015594155576072777, "loss": 0.2558, "step": 3079 }, { "epoch": 0.6233555960331917, "grad_norm": 0.3406575918197632, "learning_rate": 0.00015591518838344506, "loss": 0.2652, "step": 3080 }, { "epoch": 0.6235579842137219, "grad_norm": 0.3246553838253021, "learning_rate": 0.0001558888153492503, "loss": 0.2106, "step": 3081 }, { "epoch": 0.6237603723942522, "grad_norm": 0.2892184555530548, "learning_rate": 0.00015586243666081164, "loss": 0.2133, "step": 3082 }, { "epoch": 0.6239627605747824, "grad_norm": 0.32574543356895447, "learning_rate": 0.00015583605232079783, "loss": 0.2278, "step": 3083 }, { "epoch": 0.6241651487553127, "grad_norm": 0.4512995779514313, "learning_rate": 0.00015580966233187812, "loss": 0.266, "step": 3084 }, { "epoch": 0.6243675369358429, "grad_norm": 0.25693219900131226, "learning_rate": 0.00015578326669672232, "loss": 0.218, "step": 3085 }, { "epoch": 0.6245699251163732, "grad_norm": 0.3269275724887848, "learning_rate": 0.00015575686541800096, "loss": 0.2117, "step": 3086 }, { "epoch": 0.6247723132969034, "grad_norm": 0.3049417734146118, "learning_rate": 0.000155730458498385, "loss": 0.1994, "step": 3087 }, { "epoch": 0.6249747014774337, "grad_norm": 0.2842578887939453, "learning_rate": 0.00015570404594054604, "loss": 0.222, "step": 3088 }, { "epoch": 0.6251770896579639, "grad_norm": 0.33271604776382446, "learning_rate": 0.00015567762774715618, "loss": 0.2239, "step": 3089 }, { "epoch": 0.6253794778384942, "grad_norm": 0.24872200191020966, "learning_rate": 0.0001556512039208882, "loss": 0.2068, "step": 3090 }, { "epoch": 0.6255818660190244, "grad_norm": 0.2888675034046173, "learning_rate": 0.00015562477446441535, "loss": 0.2172, "step": 3091 }, { "epoch": 0.6257842541995547, "grad_norm": 0.28715941309928894, "learning_rate": 0.00015559833938041145, "loss": 0.2363, "step": 3092 }, { "epoch": 0.6259866423800851, "grad_norm": 0.24771295487880707, "learning_rate": 0.00015557189867155099, "loss": 0.2325, "step": 3093 }, { "epoch": 
0.6261890305606153, "grad_norm": 0.3012368679046631, "learning_rate": 0.00015554545234050892, "loss": 0.2324, "step": 3094 }, { "epoch": 0.6263914187411456, "grad_norm": 0.34395089745521545, "learning_rate": 0.00015551900038996078, "loss": 0.2547, "step": 3095 }, { "epoch": 0.6265938069216758, "grad_norm": 0.4432348310947418, "learning_rate": 0.00015549254282258278, "loss": 0.2417, "step": 3096 }, { "epoch": 0.6267961951022061, "grad_norm": 0.2741510570049286, "learning_rate": 0.00015546607964105156, "loss": 0.2071, "step": 3097 }, { "epoch": 0.6269985832827363, "grad_norm": 0.36685729026794434, "learning_rate": 0.0001554396108480444, "loss": 0.2855, "step": 3098 }, { "epoch": 0.6272009714632666, "grad_norm": 0.38427332043647766, "learning_rate": 0.00015541313644623912, "loss": 0.2506, "step": 3099 }, { "epoch": 0.6274033596437968, "grad_norm": 0.41870930790901184, "learning_rate": 0.0001553866564383142, "loss": 0.2346, "step": 3100 }, { "epoch": 0.6274033596437968, "eval_loss": 0.26668718457221985, "eval_runtime": 0.7392, "eval_samples_per_second": 6.764, "eval_steps_per_second": 1.353, "step": 3100 }, { "epoch": 0.6276057478243271, "grad_norm": 0.31527113914489746, "learning_rate": 0.00015536017082694846, "loss": 0.2279, "step": 3101 }, { "epoch": 0.6278081360048573, "grad_norm": 0.26014941930770874, "learning_rate": 0.00015533367961482157, "loss": 0.2159, "step": 3102 }, { "epoch": 0.6280105241853876, "grad_norm": 0.2955648601055145, "learning_rate": 0.00015530718280461355, "loss": 0.2377, "step": 3103 }, { "epoch": 0.6282129123659178, "grad_norm": 0.29161566495895386, "learning_rate": 0.00015528068039900514, "loss": 0.2391, "step": 3104 }, { "epoch": 0.6284153005464481, "grad_norm": 0.3259400427341461, "learning_rate": 0.00015525417240067757, "loss": 0.2595, "step": 3105 }, { "epoch": 0.6286176887269783, "grad_norm": 0.3628864288330078, "learning_rate": 0.0001552276588123126, "loss": 0.2553, "step": 3106 }, { "epoch": 0.6288200769075086, "grad_norm": 
0.29265254735946655, "learning_rate": 0.00015520113963659254, "loss": 0.2661, "step": 3107 }, { "epoch": 0.6290224650880388, "grad_norm": 0.2571503818035126, "learning_rate": 0.00015517461487620047, "loss": 0.2588, "step": 3108 }, { "epoch": 0.6292248532685691, "grad_norm": 0.2192489504814148, "learning_rate": 0.00015514808453381975, "loss": 0.197, "step": 3109 }, { "epoch": 0.6294272414490993, "grad_norm": 0.2925032675266266, "learning_rate": 0.00015512154861213452, "loss": 0.2244, "step": 3110 }, { "epoch": 0.6296296296296297, "grad_norm": 0.37553200125694275, "learning_rate": 0.0001550950071138294, "loss": 0.2445, "step": 3111 }, { "epoch": 0.6298320178101598, "grad_norm": 0.3246464729309082, "learning_rate": 0.00015506846004158955, "loss": 0.2406, "step": 3112 }, { "epoch": 0.6300344059906902, "grad_norm": 0.4171786904335022, "learning_rate": 0.00015504190739810074, "loss": 0.2209, "step": 3113 }, { "epoch": 0.6302367941712204, "grad_norm": 0.30422425270080566, "learning_rate": 0.00015501534918604926, "loss": 0.2543, "step": 3114 }, { "epoch": 0.6304391823517507, "grad_norm": 0.3078934848308563, "learning_rate": 0.00015498878540812208, "loss": 0.2423, "step": 3115 }, { "epoch": 0.6306415705322809, "grad_norm": 0.3099100887775421, "learning_rate": 0.00015496221606700657, "loss": 0.2655, "step": 3116 }, { "epoch": 0.6308439587128112, "grad_norm": 0.29486164450645447, "learning_rate": 0.00015493564116539072, "loss": 0.2356, "step": 3117 }, { "epoch": 0.6310463468933414, "grad_norm": 0.3040613830089569, "learning_rate": 0.00015490906070596316, "loss": 0.1921, "step": 3118 }, { "epoch": 0.6312487350738717, "grad_norm": 0.2945636808872223, "learning_rate": 0.00015488247469141295, "loss": 0.2215, "step": 3119 }, { "epoch": 0.6314511232544019, "grad_norm": 0.30915266275405884, "learning_rate": 0.00015485588312442986, "loss": 0.2574, "step": 3120 }, { "epoch": 0.6316535114349322, "grad_norm": 0.37940701842308044, "learning_rate": 0.0001548292860077041, "loss": 0.2501, 
"step": 3121 }, { "epoch": 0.6318558996154624, "grad_norm": 0.43181759119033813, "learning_rate": 0.0001548026833439265, "loss": 0.2947, "step": 3122 }, { "epoch": 0.6320582877959927, "grad_norm": 0.33247214555740356, "learning_rate": 0.00015477607513578846, "loss": 0.2125, "step": 3123 }, { "epoch": 0.632260675976523, "grad_norm": 0.2787618935108185, "learning_rate": 0.00015474946138598186, "loss": 0.2297, "step": 3124 }, { "epoch": 0.6324630641570532, "grad_norm": 0.3035429120063782, "learning_rate": 0.00015472284209719925, "loss": 0.2171, "step": 3125 }, { "epoch": 0.6326654523375835, "grad_norm": 0.2825091779232025, "learning_rate": 0.00015469621727213367, "loss": 0.2304, "step": 3126 }, { "epoch": 0.6328678405181137, "grad_norm": 0.2831633687019348, "learning_rate": 0.0001546695869134788, "loss": 0.244, "step": 3127 }, { "epoch": 0.633070228698644, "grad_norm": 0.4898656904697418, "learning_rate": 0.00015464295102392872, "loss": 0.2374, "step": 3128 }, { "epoch": 0.6332726168791742, "grad_norm": 0.3614266812801361, "learning_rate": 0.0001546163096061782, "loss": 0.2265, "step": 3129 }, { "epoch": 0.6334750050597046, "grad_norm": 0.4073461890220642, "learning_rate": 0.0001545896626629226, "loss": 0.1959, "step": 3130 }, { "epoch": 0.6336773932402348, "grad_norm": 0.33134597539901733, "learning_rate": 0.00015456301019685769, "loss": 0.2228, "step": 3131 }, { "epoch": 0.6338797814207651, "grad_norm": 0.5032749772071838, "learning_rate": 0.00015453635221067996, "loss": 0.2398, "step": 3132 }, { "epoch": 0.6340821696012953, "grad_norm": 0.2859143316745758, "learning_rate": 0.00015450968870708636, "loss": 0.24, "step": 3133 }, { "epoch": 0.6342845577818256, "grad_norm": 0.3137022852897644, "learning_rate": 0.00015448301968877442, "loss": 0.2478, "step": 3134 }, { "epoch": 0.6344869459623558, "grad_norm": 0.3397473692893982, "learning_rate": 0.00015445634515844222, "loss": 0.2476, "step": 3135 }, { "epoch": 0.6346893341428861, "grad_norm": 0.35663914680480957, 
"learning_rate": 0.0001544296651187884, "loss": 0.236, "step": 3136 }, { "epoch": 0.6348917223234163, "grad_norm": 0.28017348051071167, "learning_rate": 0.0001544029795725122, "loss": 0.2308, "step": 3137 }, { "epoch": 0.6350941105039466, "grad_norm": 0.2992730438709259, "learning_rate": 0.0001543762885223134, "loss": 0.2429, "step": 3138 }, { "epoch": 0.6352964986844768, "grad_norm": 0.3374902307987213, "learning_rate": 0.00015434959197089228, "loss": 0.2499, "step": 3139 }, { "epoch": 0.6354988868650071, "grad_norm": 0.37910881638526917, "learning_rate": 0.0001543228899209497, "loss": 0.2635, "step": 3140 }, { "epoch": 0.6357012750455373, "grad_norm": 0.26263922452926636, "learning_rate": 0.00015429618237518716, "loss": 0.2048, "step": 3141 }, { "epoch": 0.6359036632260676, "grad_norm": 0.27847379446029663, "learning_rate": 0.0001542694693363066, "loss": 0.2265, "step": 3142 }, { "epoch": 0.6361060514065978, "grad_norm": 0.28324779868125916, "learning_rate": 0.00015424275080701055, "loss": 0.2349, "step": 3143 }, { "epoch": 0.6363084395871281, "grad_norm": 0.278072714805603, "learning_rate": 0.00015421602679000217, "loss": 0.2323, "step": 3144 }, { "epoch": 0.6365108277676583, "grad_norm": 0.3826614022254944, "learning_rate": 0.00015418929728798505, "loss": 0.2197, "step": 3145 }, { "epoch": 0.6367132159481886, "grad_norm": 0.3305886387825012, "learning_rate": 0.00015416256230366346, "loss": 0.2245, "step": 3146 }, { "epoch": 0.6369156041287188, "grad_norm": 0.2796514332294464, "learning_rate": 0.00015413582183974213, "loss": 0.2098, "step": 3147 }, { "epoch": 0.6371179923092491, "grad_norm": 0.3792613744735718, "learning_rate": 0.00015410907589892637, "loss": 0.2411, "step": 3148 }, { "epoch": 0.6373203804897793, "grad_norm": 0.2925693392753601, "learning_rate": 0.0001540823244839221, "loss": 0.2683, "step": 3149 }, { "epoch": 0.6375227686703097, "grad_norm": 0.3847316801548004, "learning_rate": 0.0001540555675974357, "loss": 0.263, "step": 3150 }, { "epoch": 
0.6375227686703097, "eval_loss": 0.2652246356010437, "eval_runtime": 0.7391, "eval_samples_per_second": 6.765, "eval_steps_per_second": 1.353, "step": 3150 }, { "epoch": 0.6377251568508399, "grad_norm": 0.2546859681606293, "learning_rate": 0.0001540288052421742, "loss": 0.2343, "step": 3151 }, { "epoch": 0.6379275450313702, "grad_norm": 0.2868305444717407, "learning_rate": 0.00015400203742084508, "loss": 0.2726, "step": 3152 }, { "epoch": 0.6381299332119005, "grad_norm": 0.2870636284351349, "learning_rate": 0.0001539752641361564, "loss": 0.2055, "step": 3153 }, { "epoch": 0.6383323213924307, "grad_norm": 0.7042229175567627, "learning_rate": 0.0001539484853908169, "loss": 0.2114, "step": 3154 }, { "epoch": 0.638534709572961, "grad_norm": 0.30285945534706116, "learning_rate": 0.00015392170118753575, "loss": 0.2451, "step": 3155 }, { "epoch": 0.6387370977534912, "grad_norm": 0.48774808645248413, "learning_rate": 0.00015389491152902263, "loss": 0.2731, "step": 3156 }, { "epoch": 0.6389394859340215, "grad_norm": 0.25389939546585083, "learning_rate": 0.00015386811641798786, "loss": 0.1723, "step": 3157 }, { "epoch": 0.6391418741145517, "grad_norm": 0.3468737304210663, "learning_rate": 0.00015384131585714235, "loss": 0.2381, "step": 3158 }, { "epoch": 0.639344262295082, "grad_norm": 0.3124346137046814, "learning_rate": 0.0001538145098491974, "loss": 0.217, "step": 3159 }, { "epoch": 0.6395466504756122, "grad_norm": 0.41017088294029236, "learning_rate": 0.00015378769839686504, "loss": 0.2213, "step": 3160 }, { "epoch": 0.6397490386561425, "grad_norm": 0.6051793694496155, "learning_rate": 0.00015376088150285773, "loss": 0.2691, "step": 3161 }, { "epoch": 0.6399514268366727, "grad_norm": 0.2914044260978699, "learning_rate": 0.00015373405916988857, "loss": 0.2465, "step": 3162 }, { "epoch": 0.640153815017203, "grad_norm": 0.29912036657333374, "learning_rate": 0.0001537072314006711, "loss": 0.2338, "step": 3163 }, { "epoch": 0.6403562031977332, "grad_norm": 
0.31532540917396545, "learning_rate": 0.0001536803981979195, "loss": 0.247, "step": 3164 }, { "epoch": 0.6405585913782635, "grad_norm": 0.2610158622264862, "learning_rate": 0.0001536535595643485, "loss": 0.2389, "step": 3165 }, { "epoch": 0.6407609795587937, "grad_norm": 0.3037506341934204, "learning_rate": 0.0001536267155026733, "loss": 0.207, "step": 3166 }, { "epoch": 0.640963367739324, "grad_norm": 0.3930741548538208, "learning_rate": 0.0001535998660156097, "loss": 0.2291, "step": 3167 }, { "epoch": 0.6411657559198543, "grad_norm": 0.3472108840942383, "learning_rate": 0.00015357301110587412, "loss": 0.2352, "step": 3168 }, { "epoch": 0.6413681441003846, "grad_norm": 0.2909805178642273, "learning_rate": 0.0001535461507761834, "loss": 0.2418, "step": 3169 }, { "epoch": 0.6415705322809148, "grad_norm": 0.4504471719264984, "learning_rate": 0.000153519285029255, "loss": 0.2477, "step": 3170 }, { "epoch": 0.6417729204614451, "grad_norm": 0.3161703646183014, "learning_rate": 0.0001534924138678069, "loss": 0.2547, "step": 3171 }, { "epoch": 0.6419753086419753, "grad_norm": 0.4808140695095062, "learning_rate": 0.00015346553729455765, "loss": 0.2616, "step": 3172 }, { "epoch": 0.6421776968225056, "grad_norm": 0.28160232305526733, "learning_rate": 0.00015343865531222638, "loss": 0.2363, "step": 3173 }, { "epoch": 0.6423800850030358, "grad_norm": 0.307965487241745, "learning_rate": 0.00015341176792353265, "loss": 0.2445, "step": 3174 }, { "epoch": 0.6425824731835661, "grad_norm": 0.2636444568634033, "learning_rate": 0.00015338487513119668, "loss": 0.2316, "step": 3175 }, { "epoch": 0.6427848613640963, "grad_norm": 0.2725645899772644, "learning_rate": 0.00015335797693793923, "loss": 0.2147, "step": 3176 }, { "epoch": 0.6429872495446266, "grad_norm": 0.3533676564693451, "learning_rate": 0.00015333107334648154, "loss": 0.2648, "step": 3177 }, { "epoch": 0.6431896377251568, "grad_norm": 0.275126188993454, "learning_rate": 0.00015330416435954547, "loss": 0.2223, "step": 3178 }, 
{ "epoch": 0.6433920259056871, "grad_norm": 0.27795132994651794, "learning_rate": 0.00015327724997985334, "loss": 0.2137, "step": 3179 }, { "epoch": 0.6435944140862173, "grad_norm": 0.2937318682670593, "learning_rate": 0.00015325033021012808, "loss": 0.2425, "step": 3180 }, { "epoch": 0.6437968022667476, "grad_norm": 0.43675699830055237, "learning_rate": 0.0001532234050530932, "loss": 0.2199, "step": 3181 }, { "epoch": 0.6439991904472778, "grad_norm": 0.3387317359447479, "learning_rate": 0.0001531964745114726, "loss": 0.2509, "step": 3182 }, { "epoch": 0.6442015786278081, "grad_norm": 0.3405567407608032, "learning_rate": 0.00015316953858799095, "loss": 0.2375, "step": 3183 }, { "epoch": 0.6444039668083384, "grad_norm": 0.28323695063591003, "learning_rate": 0.00015314259728537325, "loss": 0.2151, "step": 3184 }, { "epoch": 0.6446063549888686, "grad_norm": 0.26877549290657043, "learning_rate": 0.0001531156506063452, "loss": 0.2309, "step": 3185 }, { "epoch": 0.644808743169399, "grad_norm": 0.3151189088821411, "learning_rate": 0.00015308869855363294, "loss": 0.2441, "step": 3186 }, { "epoch": 0.6450111313499292, "grad_norm": 0.3761087656021118, "learning_rate": 0.00015306174112996324, "loss": 0.2475, "step": 3187 }, { "epoch": 0.6452135195304595, "grad_norm": 0.3229560852050781, "learning_rate": 0.00015303477833806332, "loss": 0.2677, "step": 3188 }, { "epoch": 0.6454159077109897, "grad_norm": 0.2946084439754486, "learning_rate": 0.000153007810180661, "loss": 0.2017, "step": 3189 }, { "epoch": 0.64561829589152, "grad_norm": 0.25200504064559937, "learning_rate": 0.00015298083666048467, "loss": 0.2215, "step": 3190 }, { "epoch": 0.6458206840720502, "grad_norm": 0.3906277120113373, "learning_rate": 0.0001529538577802632, "loss": 0.332, "step": 3191 }, { "epoch": 0.6460230722525805, "grad_norm": 0.39015209674835205, "learning_rate": 0.0001529268735427261, "loss": 0.2405, "step": 3192 }, { "epoch": 0.6462254604331107, "grad_norm": 0.31589704751968384, "learning_rate": 
0.0001528998839506032, "loss": 0.2484, "step": 3193 }, { "epoch": 0.646427848613641, "grad_norm": 0.27482402324676514, "learning_rate": 0.0001528728890066252, "loss": 0.1976, "step": 3194 }, { "epoch": 0.6466302367941712, "grad_norm": 0.3176118731498718, "learning_rate": 0.000152845888713523, "loss": 0.2141, "step": 3195 }, { "epoch": 0.6468326249747015, "grad_norm": 0.240059494972229, "learning_rate": 0.00015281888307402833, "loss": 0.222, "step": 3196 }, { "epoch": 0.6470350131552317, "grad_norm": 0.36541762948036194, "learning_rate": 0.00015279187209087328, "loss": 0.2182, "step": 3197 }, { "epoch": 0.647237401335762, "grad_norm": 0.4202319383621216, "learning_rate": 0.00015276485576679055, "loss": 0.2354, "step": 3198 }, { "epoch": 0.6474397895162922, "grad_norm": 0.29542794823646545, "learning_rate": 0.00015273783410451336, "loss": 0.2539, "step": 3199 }, { "epoch": 0.6476421776968225, "grad_norm": 0.30133146047592163, "learning_rate": 0.0001527108071067755, "loss": 0.2162, "step": 3200 }, { "epoch": 0.6476421776968225, "eval_loss": 0.26807427406311035, "eval_runtime": 0.7402, "eval_samples_per_second": 6.755, "eval_steps_per_second": 1.351, "step": 3200 }, { "epoch": 0.6478445658773527, "grad_norm": 0.2947642505168915, "learning_rate": 0.00015268377477631128, "loss": 0.2246, "step": 3201 }, { "epoch": 0.648046954057883, "grad_norm": 0.2888360917568207, "learning_rate": 0.0001526567371158555, "loss": 0.2265, "step": 3202 }, { "epoch": 0.6482493422384132, "grad_norm": 0.3799479305744171, "learning_rate": 0.00015262969412814357, "loss": 0.2374, "step": 3203 }, { "epoch": 0.6484517304189436, "grad_norm": 0.27309316396713257, "learning_rate": 0.0001526026458159115, "loss": 0.2327, "step": 3204 }, { "epoch": 0.6486541185994738, "grad_norm": 0.3100753426551819, "learning_rate": 0.00015257559218189562, "loss": 0.2542, "step": 3205 }, { "epoch": 0.6488565067800041, "grad_norm": 0.3332715928554535, "learning_rate": 0.000152548533228833, "loss": 0.2389, "step": 3206 }, 
{ "epoch": 0.6490588949605343, "grad_norm": 0.33358773589134216, "learning_rate": 0.0001525214689594612, "loss": 0.2338, "step": 3207 }, { "epoch": 0.6492612831410646, "grad_norm": 0.3007495701313019, "learning_rate": 0.00015249439937651825, "loss": 0.2366, "step": 3208 }, { "epoch": 0.6494636713215948, "grad_norm": 0.28332725167274475, "learning_rate": 0.00015246732448274275, "loss": 0.2582, "step": 3209 }, { "epoch": 0.6496660595021251, "grad_norm": 0.36708885431289673, "learning_rate": 0.00015244024428087393, "loss": 0.263, "step": 3210 }, { "epoch": 0.6498684476826553, "grad_norm": 0.2604656517505646, "learning_rate": 0.00015241315877365143, "loss": 0.2179, "step": 3211 }, { "epoch": 0.6500708358631856, "grad_norm": 0.41370195150375366, "learning_rate": 0.00015238606796381553, "loss": 0.2509, "step": 3212 }, { "epoch": 0.6502732240437158, "grad_norm": 0.3503737151622772, "learning_rate": 0.0001523589718541069, "loss": 0.26, "step": 3213 }, { "epoch": 0.6504756122242461, "grad_norm": 0.32839810848236084, "learning_rate": 0.00015233187044726693, "loss": 0.2243, "step": 3214 }, { "epoch": 0.6506780004047764, "grad_norm": 0.45140987634658813, "learning_rate": 0.00015230476374603738, "loss": 0.2298, "step": 3215 }, { "epoch": 0.6508803885853066, "grad_norm": 0.27483177185058594, "learning_rate": 0.00015227765175316072, "loss": 0.2455, "step": 3216 }, { "epoch": 0.6510827767658369, "grad_norm": 0.2992781400680542, "learning_rate": 0.0001522505344713798, "loss": 0.2466, "step": 3217 }, { "epoch": 0.6512851649463671, "grad_norm": 0.5257993340492249, "learning_rate": 0.00015222341190343803, "loss": 0.2274, "step": 3218 }, { "epoch": 0.6514875531268974, "grad_norm": 0.3546026051044464, "learning_rate": 0.00015219628405207942, "loss": 0.2477, "step": 3219 }, { "epoch": 0.6516899413074276, "grad_norm": 0.3486909866333008, "learning_rate": 0.00015216915092004847, "loss": 0.2557, "step": 3220 }, { "epoch": 0.6518923294879579, "grad_norm": 0.3145160377025604, "learning_rate": 
0.00015214201251009023, "loss": 0.2306, "step": 3221 }, { "epoch": 0.6520947176684881, "grad_norm": 0.34794342517852783, "learning_rate": 0.00015211486882495029, "loss": 0.2659, "step": 3222 }, { "epoch": 0.6522971058490185, "grad_norm": 0.34079018235206604, "learning_rate": 0.00015208771986737477, "loss": 0.2957, "step": 3223 }, { "epoch": 0.6524994940295487, "grad_norm": 0.3173547089099884, "learning_rate": 0.0001520605656401103, "loss": 0.2304, "step": 3224 }, { "epoch": 0.652701882210079, "grad_norm": 0.2905762791633606, "learning_rate": 0.00015203340614590406, "loss": 0.238, "step": 3225 }, { "epoch": 0.6529042703906092, "grad_norm": 0.30235806107521057, "learning_rate": 0.00015200624138750376, "loss": 0.2483, "step": 3226 }, { "epoch": 0.6531066585711395, "grad_norm": 0.32202184200286865, "learning_rate": 0.0001519790713676577, "loss": 0.2373, "step": 3227 }, { "epoch": 0.6533090467516697, "grad_norm": 0.2946752905845642, "learning_rate": 0.00015195189608911455, "loss": 0.2486, "step": 3228 }, { "epoch": 0.6535114349322, "grad_norm": 0.3170306086540222, "learning_rate": 0.0001519247155546237, "loss": 0.226, "step": 3229 }, { "epoch": 0.6537138231127302, "grad_norm": 0.3037535548210144, "learning_rate": 0.00015189752976693498, "loss": 0.2358, "step": 3230 }, { "epoch": 0.6539162112932605, "grad_norm": 0.27636289596557617, "learning_rate": 0.00015187033872879875, "loss": 0.2617, "step": 3231 }, { "epoch": 0.6541185994737907, "grad_norm": 0.3105868399143219, "learning_rate": 0.0001518431424429659, "loss": 0.2888, "step": 3232 }, { "epoch": 0.654320987654321, "grad_norm": 0.29909271001815796, "learning_rate": 0.0001518159409121879, "loss": 0.2462, "step": 3233 }, { "epoch": 0.6545233758348512, "grad_norm": 0.29715684056282043, "learning_rate": 0.00015178873413921665, "loss": 0.2476, "step": 3234 }, { "epoch": 0.6547257640153815, "grad_norm": 0.31502577662467957, "learning_rate": 0.00015176152212680478, "loss": 0.2044, "step": 3235 }, { "epoch": 
0.6549281521959117, "grad_norm": 0.29666459560394287, "learning_rate": 0.00015173430487770513, "loss": 0.2457, "step": 3236 }, { "epoch": 0.655130540376442, "grad_norm": 1.0712119340896606, "learning_rate": 0.0001517070823946714, "loss": 0.267, "step": 3237 }, { "epoch": 0.6553329285569722, "grad_norm": 0.34478285908699036, "learning_rate": 0.00015167985468045764, "loss": 0.2671, "step": 3238 }, { "epoch": 0.6555353167375025, "grad_norm": 0.3465102016925812, "learning_rate": 0.00015165262173781846, "loss": 0.248, "step": 3239 }, { "epoch": 0.6557377049180327, "grad_norm": 0.3330729901790619, "learning_rate": 0.00015162538356950899, "loss": 0.2555, "step": 3240 }, { "epoch": 0.655940093098563, "grad_norm": 0.30930474400520325, "learning_rate": 0.00015159814017828488, "loss": 0.2541, "step": 3241 }, { "epoch": 0.6561424812790932, "grad_norm": 0.3146357834339142, "learning_rate": 0.00015157089156690238, "loss": 0.2385, "step": 3242 }, { "epoch": 0.6563448694596236, "grad_norm": 0.43063199520111084, "learning_rate": 0.00015154363773811822, "loss": 0.2822, "step": 3243 }, { "epoch": 0.6565472576401539, "grad_norm": 0.3448871970176697, "learning_rate": 0.0001515163786946896, "loss": 0.2568, "step": 3244 }, { "epoch": 0.6567496458206841, "grad_norm": 0.3220970928668976, "learning_rate": 0.00015148911443937436, "loss": 0.1936, "step": 3245 }, { "epoch": 0.6569520340012144, "grad_norm": 0.28015419840812683, "learning_rate": 0.0001514618449749308, "loss": 0.215, "step": 3246 }, { "epoch": 0.6571544221817446, "grad_norm": 0.3737340569496155, "learning_rate": 0.00015143457030411775, "loss": 0.2332, "step": 3247 }, { "epoch": 0.6573568103622749, "grad_norm": 0.35730138421058655, "learning_rate": 0.00015140729042969453, "loss": 0.1969, "step": 3248 }, { "epoch": 0.6575591985428051, "grad_norm": 0.3678499162197113, "learning_rate": 0.00015138000535442112, "loss": 0.2669, "step": 3249 }, { "epoch": 0.6577615867233354, "grad_norm": 0.35763099789619446, "learning_rate": 
0.00015135271508105787, "loss": 0.2693, "step": 3250 }, { "epoch": 0.6577615867233354, "eval_loss": 0.27355989813804626, "eval_runtime": 0.7378, "eval_samples_per_second": 6.777, "eval_steps_per_second": 1.355, "step": 3250 }, { "epoch": 0.6579639749038656, "grad_norm": 0.27716264128685, "learning_rate": 0.00015132541961236577, "loss": 0.1913, "step": 3251 }, { "epoch": 0.6581663630843959, "grad_norm": 0.3249410390853882, "learning_rate": 0.00015129811895110625, "loss": 0.2515, "step": 3252 }, { "epoch": 0.6583687512649261, "grad_norm": 0.3644621670246124, "learning_rate": 0.0001512708131000413, "loss": 0.2511, "step": 3253 }, { "epoch": 0.6585711394454564, "grad_norm": 0.36926156282424927, "learning_rate": 0.00015124350206193347, "loss": 0.2611, "step": 3254 }, { "epoch": 0.6587735276259866, "grad_norm": 0.3121117949485779, "learning_rate": 0.0001512161858395458, "loss": 0.2429, "step": 3255 }, { "epoch": 0.6589759158065169, "grad_norm": 0.28802725672721863, "learning_rate": 0.00015118886443564185, "loss": 0.2379, "step": 3256 }, { "epoch": 0.6591783039870471, "grad_norm": 0.4395899772644043, "learning_rate": 0.00015116153785298573, "loss": 0.2528, "step": 3257 }, { "epoch": 0.6593806921675774, "grad_norm": 0.38329094648361206, "learning_rate": 0.000151134206094342, "loss": 0.2519, "step": 3258 }, { "epoch": 0.6595830803481076, "grad_norm": 0.327964186668396, "learning_rate": 0.00015110686916247588, "loss": 0.232, "step": 3259 }, { "epoch": 0.659785468528638, "grad_norm": 0.27446261048316956, "learning_rate": 0.000151079527060153, "loss": 0.269, "step": 3260 }, { "epoch": 0.6599878567091682, "grad_norm": 0.4416070580482483, "learning_rate": 0.0001510521797901395, "loss": 0.2213, "step": 3261 }, { "epoch": 0.6601902448896985, "grad_norm": 0.35151076316833496, "learning_rate": 0.00015102482735520218, "loss": 0.2215, "step": 3262 }, { "epoch": 0.6603926330702287, "grad_norm": 0.43818771839141846, "learning_rate": 0.0001509974697581082, "loss": 0.2437, "step": 3263 }, 
{ "epoch": 0.660595021250759, "grad_norm": 0.3801747262477875, "learning_rate": 0.00015097010700162536, "loss": 0.2498, "step": 3264 }, { "epoch": 0.6607974094312892, "grad_norm": 0.28544607758522034, "learning_rate": 0.0001509427390885219, "loss": 0.2418, "step": 3265 }, { "epoch": 0.6609997976118195, "grad_norm": 0.30231353640556335, "learning_rate": 0.00015091536602156663, "loss": 0.2459, "step": 3266 }, { "epoch": 0.6612021857923497, "grad_norm": 0.32911112904548645, "learning_rate": 0.00015088798780352888, "loss": 0.2412, "step": 3267 }, { "epoch": 0.66140457397288, "grad_norm": 0.28970471024513245, "learning_rate": 0.00015086060443717848, "loss": 0.2153, "step": 3268 }, { "epoch": 0.6616069621534102, "grad_norm": 0.3324395418167114, "learning_rate": 0.00015083321592528583, "loss": 0.2095, "step": 3269 }, { "epoch": 0.6618093503339405, "grad_norm": 0.2921914756298065, "learning_rate": 0.00015080582227062174, "loss": 0.2404, "step": 3270 }, { "epoch": 0.6620117385144707, "grad_norm": 0.28326934576034546, "learning_rate": 0.00015077842347595768, "loss": 0.2368, "step": 3271 }, { "epoch": 0.662214126695001, "grad_norm": 0.3168807029724121, "learning_rate": 0.00015075101954406555, "loss": 0.2439, "step": 3272 }, { "epoch": 0.6624165148755312, "grad_norm": 0.3238939940929413, "learning_rate": 0.00015072361047771776, "loss": 0.2817, "step": 3273 }, { "epoch": 0.6626189030560615, "grad_norm": 0.3251747786998749, "learning_rate": 0.00015069619627968732, "loss": 0.2511, "step": 3274 }, { "epoch": 0.6628212912365918, "grad_norm": 0.37923121452331543, "learning_rate": 0.00015066877695274768, "loss": 0.2331, "step": 3275 }, { "epoch": 0.663023679417122, "grad_norm": 0.37296178936958313, "learning_rate": 0.00015064135249967287, "loss": 0.2763, "step": 3276 }, { "epoch": 0.6632260675976523, "grad_norm": 0.41110411286354065, "learning_rate": 0.00015061392292323734, "loss": 0.2665, "step": 3277 }, { "epoch": 0.6634284557781825, "grad_norm": 0.3819164037704468, 
"learning_rate": 0.00015058648822621618, "loss": 0.2248, "step": 3278 }, { "epoch": 0.6636308439587129, "grad_norm": 0.29319527745246887, "learning_rate": 0.00015055904841138496, "loss": 0.2383, "step": 3279 }, { "epoch": 0.6638332321392431, "grad_norm": 0.4977886974811554, "learning_rate": 0.00015053160348151975, "loss": 0.2704, "step": 3280 }, { "epoch": 0.6640356203197734, "grad_norm": 0.32568296790122986, "learning_rate": 0.00015050415343939713, "loss": 0.2232, "step": 3281 }, { "epoch": 0.6642380085003036, "grad_norm": 0.336796373128891, "learning_rate": 0.00015047669828779412, "loss": 0.26, "step": 3282 }, { "epoch": 0.6644403966808339, "grad_norm": 0.2726500630378723, "learning_rate": 0.00015044923802948854, "loss": 0.2414, "step": 3283 }, { "epoch": 0.6646427848613641, "grad_norm": 0.28668951988220215, "learning_rate": 0.00015042177266725833, "loss": 0.2467, "step": 3284 }, { "epoch": 0.6648451730418944, "grad_norm": 0.3118850886821747, "learning_rate": 0.00015039430220388224, "loss": 0.2444, "step": 3285 }, { "epoch": 0.6650475612224246, "grad_norm": 0.2920859456062317, "learning_rate": 0.00015036682664213943, "loss": 0.2092, "step": 3286 }, { "epoch": 0.6652499494029549, "grad_norm": 0.29595354199409485, "learning_rate": 0.0001503393459848096, "loss": 0.2176, "step": 3287 }, { "epoch": 0.6654523375834851, "grad_norm": 0.3060019910335541, "learning_rate": 0.00015031186023467297, "loss": 0.2565, "step": 3288 }, { "epoch": 0.6656547257640154, "grad_norm": 0.24143262207508087, "learning_rate": 0.00015028436939451022, "loss": 0.2045, "step": 3289 }, { "epoch": 0.6658571139445456, "grad_norm": 0.27763256430625916, "learning_rate": 0.0001502568734671026, "loss": 0.2291, "step": 3290 }, { "epoch": 0.6660595021250759, "grad_norm": 0.34247729182243347, "learning_rate": 0.0001502293724552319, "loss": 0.2684, "step": 3291 }, { "epoch": 0.6662618903056061, "grad_norm": 0.27955156564712524, "learning_rate": 0.0001502018663616803, "loss": 0.2002, "step": 3292 }, { 
"epoch": 0.6664642784861364, "grad_norm": 0.31482434272766113, "learning_rate": 0.00015017435518923064, "loss": 0.2126, "step": 3293 }, { "epoch": 0.6666666666666666, "grad_norm": 0.3705318868160248, "learning_rate": 0.0001501468389406662, "loss": 0.2389, "step": 3294 }, { "epoch": 0.6668690548471969, "grad_norm": 0.38991278409957886, "learning_rate": 0.0001501193176187708, "loss": 0.2238, "step": 3295 }, { "epoch": 0.6670714430277271, "grad_norm": 0.39363741874694824, "learning_rate": 0.00015009179122632872, "loss": 0.2273, "step": 3296 }, { "epoch": 0.6672738312082575, "grad_norm": 0.27503731846809387, "learning_rate": 0.00015006425976612478, "loss": 0.2356, "step": 3297 }, { "epoch": 0.6674762193887877, "grad_norm": 0.3818816840648651, "learning_rate": 0.00015003672324094439, "loss": 0.2442, "step": 3298 }, { "epoch": 0.667678607569318, "grad_norm": 0.2479441612958908, "learning_rate": 0.00015000918165357336, "loss": 0.2077, "step": 3299 }, { "epoch": 0.6678809957498482, "grad_norm": 0.3392878770828247, "learning_rate": 0.00014998163500679808, "loss": 0.2185, "step": 3300 }, { "epoch": 0.6678809957498482, "eval_loss": 0.2743344008922577, "eval_runtime": 0.739, "eval_samples_per_second": 6.766, "eval_steps_per_second": 1.353, "step": 3300 }, { "epoch": 0.6680833839303785, "grad_norm": 0.4281541705131531, "learning_rate": 0.0001499540833034054, "loss": 0.2606, "step": 3301 }, { "epoch": 0.6682857721109087, "grad_norm": 0.3157845139503479, "learning_rate": 0.00014992652654618275, "loss": 0.2454, "step": 3302 }, { "epoch": 0.668488160291439, "grad_norm": 0.35366958379745483, "learning_rate": 0.00014989896473791803, "loss": 0.2188, "step": 3303 }, { "epoch": 0.6686905484719692, "grad_norm": 0.28584495186805725, "learning_rate": 0.0001498713978813996, "loss": 0.2106, "step": 3304 }, { "epoch": 0.6688929366524995, "grad_norm": 0.32046595215797424, "learning_rate": 0.0001498438259794165, "loss": 0.2289, "step": 3305 }, { "epoch": 0.6690953248330298, "grad_norm": 
0.2838146388530731, "learning_rate": 0.00014981624903475803, "loss": 0.2355, "step": 3306 }, { "epoch": 0.66929771301356, "grad_norm": 0.34235918521881104, "learning_rate": 0.00014978866705021423, "loss": 0.282, "step": 3307 }, { "epoch": 0.6695001011940903, "grad_norm": 0.2716708183288574, "learning_rate": 0.0001497610800285755, "loss": 0.2327, "step": 3308 }, { "epoch": 0.6697024893746205, "grad_norm": 0.30932801961898804, "learning_rate": 0.00014973348797263283, "loss": 0.2531, "step": 3309 }, { "epoch": 0.6699048775551508, "grad_norm": 0.3900741636753082, "learning_rate": 0.0001497058908851777, "loss": 0.2124, "step": 3310 }, { "epoch": 0.670107265735681, "grad_norm": 0.3228439688682556, "learning_rate": 0.0001496782887690021, "loss": 0.2598, "step": 3311 }, { "epoch": 0.6703096539162113, "grad_norm": 0.2500062882900238, "learning_rate": 0.0001496506816268985, "loss": 0.2102, "step": 3312 }, { "epoch": 0.6705120420967415, "grad_norm": 0.2975091338157654, "learning_rate": 0.0001496230694616599, "loss": 0.2676, "step": 3313 }, { "epoch": 0.6707144302772718, "grad_norm": 0.3403729200363159, "learning_rate": 0.00014959545227607982, "loss": 0.2706, "step": 3314 }, { "epoch": 0.670916818457802, "grad_norm": 0.2906215786933899, "learning_rate": 0.0001495678300729523, "loss": 0.2226, "step": 3315 }, { "epoch": 0.6711192066383324, "grad_norm": 0.3046065866947174, "learning_rate": 0.00014954020285507183, "loss": 0.2482, "step": 3316 }, { "epoch": 0.6713215948188626, "grad_norm": 0.3264645040035248, "learning_rate": 0.0001495125706252335, "loss": 0.2251, "step": 3317 }, { "epoch": 0.6715239829993929, "grad_norm": 0.2815634608268738, "learning_rate": 0.00014948493338623275, "loss": 0.2253, "step": 3318 }, { "epoch": 0.6717263711799231, "grad_norm": 0.2859968841075897, "learning_rate": 0.00014945729114086568, "loss": 0.2151, "step": 3319 }, { "epoch": 0.6719287593604534, "grad_norm": 0.3038029968738556, "learning_rate": 0.0001494296438919289, "loss": 0.2243, "step": 3320 }, 
{ "epoch": 0.6721311475409836, "grad_norm": 0.3037087917327881, "learning_rate": 0.00014940199164221936, "loss": 0.2564, "step": 3321 }, { "epoch": 0.6723335357215139, "grad_norm": 0.30161625146865845, "learning_rate": 0.00014937433439453466, "loss": 0.2691, "step": 3322 }, { "epoch": 0.6725359239020441, "grad_norm": 0.37410643696784973, "learning_rate": 0.0001493466721516729, "loss": 0.244, "step": 3323 }, { "epoch": 0.6727383120825744, "grad_norm": 0.4140486717224121, "learning_rate": 0.00014931900491643266, "loss": 0.305, "step": 3324 }, { "epoch": 0.6729407002631046, "grad_norm": 0.355758935213089, "learning_rate": 0.00014929133269161296, "loss": 0.2452, "step": 3325 }, { "epoch": 0.6731430884436349, "grad_norm": 0.32681307196617126, "learning_rate": 0.00014926365548001346, "loss": 0.2507, "step": 3326 }, { "epoch": 0.6733454766241651, "grad_norm": 0.3159436583518982, "learning_rate": 0.00014923597328443422, "loss": 0.2556, "step": 3327 }, { "epoch": 0.6735478648046954, "grad_norm": 0.32382968068122864, "learning_rate": 0.00014920828610767584, "loss": 0.2504, "step": 3328 }, { "epoch": 0.6737502529852256, "grad_norm": 0.3211047649383545, "learning_rate": 0.0001491805939525394, "loss": 0.2735, "step": 3329 }, { "epoch": 0.6739526411657559, "grad_norm": 0.4596484303474426, "learning_rate": 0.0001491528968218265, "loss": 0.2799, "step": 3330 }, { "epoch": 0.6741550293462861, "grad_norm": 0.347531259059906, "learning_rate": 0.00014912519471833922, "loss": 0.3237, "step": 3331 }, { "epoch": 0.6743574175268164, "grad_norm": 0.7053202986717224, "learning_rate": 0.00014909748764488026, "loss": 0.2516, "step": 3332 }, { "epoch": 0.6745598057073466, "grad_norm": 0.2955659329891205, "learning_rate": 0.00014906977560425264, "loss": 0.2788, "step": 3333 }, { "epoch": 0.674762193887877, "grad_norm": 0.32625612616539, "learning_rate": 0.00014904205859926002, "loss": 0.2353, "step": 3334 }, { "epoch": 0.6749645820684073, "grad_norm": 0.28343072533607483, "learning_rate": 
0.00014901433663270649, "loss": 0.2202, "step": 3335 }, { "epoch": 0.6751669702489375, "grad_norm": 0.3389117121696472, "learning_rate": 0.0001489866097073967, "loss": 0.211, "step": 3336 }, { "epoch": 0.6753693584294678, "grad_norm": 0.3158564865589142, "learning_rate": 0.00014895887782613576, "loss": 0.2403, "step": 3337 }, { "epoch": 0.675571746609998, "grad_norm": 0.33958199620246887, "learning_rate": 0.00014893114099172924, "loss": 0.2384, "step": 3338 }, { "epoch": 0.6757741347905283, "grad_norm": 0.2601412236690521, "learning_rate": 0.00014890339920698334, "loss": 0.2303, "step": 3339 }, { "epoch": 0.6759765229710585, "grad_norm": 0.253071129322052, "learning_rate": 0.00014887565247470464, "loss": 0.235, "step": 3340 }, { "epoch": 0.6761789111515888, "grad_norm": 0.3102857172489166, "learning_rate": 0.00014884790079770026, "loss": 0.2194, "step": 3341 }, { "epoch": 0.676381299332119, "grad_norm": 0.2947596311569214, "learning_rate": 0.00014882014417877783, "loss": 0.2323, "step": 3342 }, { "epoch": 0.6765836875126493, "grad_norm": 0.30757614970207214, "learning_rate": 0.0001487923826207455, "loss": 0.2188, "step": 3343 }, { "epoch": 0.6767860756931795, "grad_norm": 0.27366071939468384, "learning_rate": 0.00014876461612641184, "loss": 0.2439, "step": 3344 }, { "epoch": 0.6769884638737098, "grad_norm": 0.28723663091659546, "learning_rate": 0.000148736844698586, "loss": 0.2536, "step": 3345 }, { "epoch": 0.67719085205424, "grad_norm": 0.25499916076660156, "learning_rate": 0.00014870906834007762, "loss": 0.2133, "step": 3346 }, { "epoch": 0.6773932402347703, "grad_norm": 0.3012092113494873, "learning_rate": 0.0001486812870536968, "loss": 0.2519, "step": 3347 }, { "epoch": 0.6775956284153005, "grad_norm": 0.35855376720428467, "learning_rate": 0.00014865350084225415, "loss": 0.3027, "step": 3348 }, { "epoch": 0.6777980165958308, "grad_norm": 0.2527807652950287, "learning_rate": 0.00014862570970856082, "loss": 0.1778, "step": 3349 }, { "epoch": 0.678000404776361, 
"grad_norm": 0.36591532826423645, "learning_rate": 0.0001485979136554284, "loss": 0.2506, "step": 3350 }, { "epoch": 0.678000404776361, "eval_loss": 0.27230167388916016, "eval_runtime": 0.7416, "eval_samples_per_second": 6.743, "eval_steps_per_second": 1.349, "step": 3350 }, { "epoch": 0.6782027929568913, "grad_norm": 0.38087227940559387, "learning_rate": 0.000148570112685669, "loss": 0.2004, "step": 3351 }, { "epoch": 0.6784051811374215, "grad_norm": 0.3207904100418091, "learning_rate": 0.00014854230680209525, "loss": 0.234, "step": 3352 }, { "epoch": 0.6786075693179519, "grad_norm": 0.2659710943698883, "learning_rate": 0.00014851449600752025, "loss": 0.2075, "step": 3353 }, { "epoch": 0.678809957498482, "grad_norm": 0.2757861614227295, "learning_rate": 0.0001484866803047576, "loss": 0.2163, "step": 3354 }, { "epoch": 0.6790123456790124, "grad_norm": 0.32871562242507935, "learning_rate": 0.0001484588596966214, "loss": 0.2626, "step": 3355 }, { "epoch": 0.6792147338595426, "grad_norm": 0.29668277502059937, "learning_rate": 0.0001484310341859262, "loss": 0.2489, "step": 3356 }, { "epoch": 0.6794171220400729, "grad_norm": 0.2997402548789978, "learning_rate": 0.0001484032037754872, "loss": 0.2553, "step": 3357 }, { "epoch": 0.6796195102206031, "grad_norm": 0.3057543933391571, "learning_rate": 0.00014837536846811994, "loss": 0.2624, "step": 3358 }, { "epoch": 0.6798218984011334, "grad_norm": 0.310094952583313, "learning_rate": 0.00014834752826664045, "loss": 0.2447, "step": 3359 }, { "epoch": 0.6800242865816636, "grad_norm": 0.317460834980011, "learning_rate": 0.00014831968317386538, "loss": 0.2225, "step": 3360 }, { "epoch": 0.6802266747621939, "grad_norm": 0.2607171833515167, "learning_rate": 0.0001482918331926118, "loss": 0.2148, "step": 3361 }, { "epoch": 0.6804290629427241, "grad_norm": 0.2784850001335144, "learning_rate": 0.00014826397832569721, "loss": 0.2148, "step": 3362 }, { "epoch": 0.6806314511232544, "grad_norm": 0.3371258080005646, "learning_rate": 
0.00014823611857593972, "loss": 0.2474, "step": 3363 }, { "epoch": 0.6808338393037846, "grad_norm": 0.33104196190834045, "learning_rate": 0.00014820825394615793, "loss": 0.2485, "step": 3364 }, { "epoch": 0.6810362274843149, "grad_norm": 0.3230952024459839, "learning_rate": 0.00014818038443917083, "loss": 0.2664, "step": 3365 }, { "epoch": 0.6812386156648452, "grad_norm": 0.7129462957382202, "learning_rate": 0.00014815251005779797, "loss": 0.2436, "step": 3366 }, { "epoch": 0.6814410038453754, "grad_norm": 0.3523089289665222, "learning_rate": 0.00014812463080485943, "loss": 0.2529, "step": 3367 }, { "epoch": 0.6816433920259057, "grad_norm": 0.2964981198310852, "learning_rate": 0.0001480967466831757, "loss": 0.2468, "step": 3368 }, { "epoch": 0.6818457802064359, "grad_norm": 0.3880394697189331, "learning_rate": 0.0001480688576955678, "loss": 0.2625, "step": 3369 }, { "epoch": 0.6820481683869662, "grad_norm": 0.3858512341976166, "learning_rate": 0.00014804096384485728, "loss": 0.2777, "step": 3370 }, { "epoch": 0.6822505565674964, "grad_norm": 0.323681116104126, "learning_rate": 0.00014801306513386614, "loss": 0.2733, "step": 3371 }, { "epoch": 0.6824529447480268, "grad_norm": 0.309332937002182, "learning_rate": 0.0001479851615654168, "loss": 0.2427, "step": 3372 }, { "epoch": 0.682655332928557, "grad_norm": 0.3008730411529541, "learning_rate": 0.00014795725314233237, "loss": 0.2485, "step": 3373 }, { "epoch": 0.6828577211090873, "grad_norm": 0.34555160999298096, "learning_rate": 0.0001479293398674363, "loss": 0.2254, "step": 3374 }, { "epoch": 0.6830601092896175, "grad_norm": 0.339863121509552, "learning_rate": 0.0001479014217435525, "loss": 0.2535, "step": 3375 }, { "epoch": 0.6832624974701478, "grad_norm": 0.43282923102378845, "learning_rate": 0.00014787349877350546, "loss": 0.2536, "step": 3376 }, { "epoch": 0.683464885650678, "grad_norm": 0.304993599653244, "learning_rate": 0.00014784557096012016, "loss": 0.1946, "step": 3377 }, { "epoch": 0.6836672738312083, 
"grad_norm": 0.29952695965766907, "learning_rate": 0.00014781763830622202, "loss": 0.2514, "step": 3378 }, { "epoch": 0.6838696620117385, "grad_norm": 0.4388863444328308, "learning_rate": 0.00014778970081463699, "loss": 0.247, "step": 3379 }, { "epoch": 0.6840720501922688, "grad_norm": 0.3819953501224518, "learning_rate": 0.0001477617584881915, "loss": 0.2696, "step": 3380 }, { "epoch": 0.684274438372799, "grad_norm": 0.2509651184082031, "learning_rate": 0.00014773381132971241, "loss": 0.2177, "step": 3381 }, { "epoch": 0.6844768265533293, "grad_norm": 0.3718028962612152, "learning_rate": 0.00014770585934202715, "loss": 0.2678, "step": 3382 }, { "epoch": 0.6846792147338595, "grad_norm": 0.3376442790031433, "learning_rate": 0.00014767790252796366, "loss": 0.251, "step": 3383 }, { "epoch": 0.6848816029143898, "grad_norm": 0.28158965706825256, "learning_rate": 0.00014764994089035027, "loss": 0.2431, "step": 3384 }, { "epoch": 0.68508399109492, "grad_norm": 0.23821482062339783, "learning_rate": 0.00014762197443201583, "loss": 0.2131, "step": 3385 }, { "epoch": 0.6852863792754503, "grad_norm": 0.29754817485809326, "learning_rate": 0.00014759400315578967, "loss": 0.2601, "step": 3386 }, { "epoch": 0.6854887674559805, "grad_norm": 0.27643054723739624, "learning_rate": 0.0001475660270645017, "loss": 0.2396, "step": 3387 }, { "epoch": 0.6856911556365108, "grad_norm": 0.3064822554588318, "learning_rate": 0.00014753804616098225, "loss": 0.2026, "step": 3388 }, { "epoch": 0.685893543817041, "grad_norm": 0.3296985626220703, "learning_rate": 0.00014751006044806203, "loss": 0.2439, "step": 3389 }, { "epoch": 0.6860959319975714, "grad_norm": 0.3113783597946167, "learning_rate": 0.00014748206992857245, "loss": 0.2038, "step": 3390 }, { "epoch": 0.6862983201781016, "grad_norm": 0.32076701521873474, "learning_rate": 0.00014745407460534525, "loss": 0.2497, "step": 3391 }, { "epoch": 0.6865007083586319, "grad_norm": 0.290251225233078, "learning_rate": 0.0001474260744812127, "loss": 
0.2094, "step": 3392 }, { "epoch": 0.6867030965391621, "grad_norm": 0.33005815744400024, "learning_rate": 0.0001473980695590076, "loss": 0.1986, "step": 3393 }, { "epoch": 0.6869054847196924, "grad_norm": 0.40082716941833496, "learning_rate": 0.00014737005984156318, "loss": 0.2553, "step": 3394 }, { "epoch": 0.6871078729002226, "grad_norm": 0.29623132944107056, "learning_rate": 0.00014734204533171311, "loss": 0.2455, "step": 3395 }, { "epoch": 0.6873102610807529, "grad_norm": 0.3471490144729614, "learning_rate": 0.00014731402603229167, "loss": 0.2865, "step": 3396 }, { "epoch": 0.6875126492612832, "grad_norm": 0.4119548499584198, "learning_rate": 0.00014728600194613355, "loss": 0.2478, "step": 3397 }, { "epoch": 0.6877150374418134, "grad_norm": 0.35273849964141846, "learning_rate": 0.00014725797307607388, "loss": 0.2914, "step": 3398 }, { "epoch": 0.6879174256223437, "grad_norm": 0.33130013942718506, "learning_rate": 0.0001472299394249484, "loss": 0.2617, "step": 3399 }, { "epoch": 0.6881198138028739, "grad_norm": 0.2648128569126129, "learning_rate": 0.0001472019009955932, "loss": 0.2046, "step": 3400 }, { "epoch": 0.6881198138028739, "eval_loss": 0.26742979884147644, "eval_runtime": 0.7415, "eval_samples_per_second": 6.743, "eval_steps_per_second": 1.349, "step": 3400 }, { "epoch": 0.6883222019834042, "grad_norm": 0.26487231254577637, "learning_rate": 0.00014717385779084493, "loss": 0.2381, "step": 3401 }, { "epoch": 0.6885245901639344, "grad_norm": 0.3537691831588745, "learning_rate": 0.00014714580981354077, "loss": 0.2545, "step": 3402 }, { "epoch": 0.6887269783444647, "grad_norm": 0.27148503065109253, "learning_rate": 0.00014711775706651822, "loss": 0.2377, "step": 3403 }, { "epoch": 0.6889293665249949, "grad_norm": 0.30252805352211, "learning_rate": 0.00014708969955261545, "loss": 0.2388, "step": 3404 }, { "epoch": 0.6891317547055252, "grad_norm": 0.26469656825065613, "learning_rate": 0.00014706163727467097, "loss": 0.2158, "step": 3405 }, { "epoch": 
0.6893341428860554, "grad_norm": 0.25582897663116455, "learning_rate": 0.00014703357023552384, "loss": 0.1943, "step": 3406 }, { "epoch": 0.6895365310665857, "grad_norm": 0.22913604974746704, "learning_rate": 0.0001470054984380136, "loss": 0.2137, "step": 3407 }, { "epoch": 0.689738919247116, "grad_norm": 0.3478175103664398, "learning_rate": 0.00014697742188498025, "loss": 0.2524, "step": 3408 }, { "epoch": 0.6899413074276463, "grad_norm": 0.3166928291320801, "learning_rate": 0.00014694934057926426, "loss": 0.2543, "step": 3409 }, { "epoch": 0.6901436956081765, "grad_norm": 0.36977559328079224, "learning_rate": 0.00014692125452370663, "loss": 0.2227, "step": 3410 }, { "epoch": 0.6903460837887068, "grad_norm": 0.24233773350715637, "learning_rate": 0.00014689316372114883, "loss": 0.2042, "step": 3411 }, { "epoch": 0.690548471969237, "grad_norm": 0.3313998878002167, "learning_rate": 0.00014686506817443274, "loss": 0.2213, "step": 3412 }, { "epoch": 0.6907508601497673, "grad_norm": 0.3995082378387451, "learning_rate": 0.00014683696788640082, "loss": 0.259, "step": 3413 }, { "epoch": 0.6909532483302975, "grad_norm": 0.30752405524253845, "learning_rate": 0.00014680886285989595, "loss": 0.28, "step": 3414 }, { "epoch": 0.6911556365108278, "grad_norm": 0.3397967219352722, "learning_rate": 0.00014678075309776148, "loss": 0.2488, "step": 3415 }, { "epoch": 0.691358024691358, "grad_norm": 0.26995396614074707, "learning_rate": 0.00014675263860284128, "loss": 0.2255, "step": 3416 }, { "epoch": 0.6915604128718883, "grad_norm": 0.2898835241794586, "learning_rate": 0.00014672451937797968, "loss": 0.2517, "step": 3417 }, { "epoch": 0.6917628010524185, "grad_norm": 0.30538830161094666, "learning_rate": 0.00014669639542602147, "loss": 0.2225, "step": 3418 }, { "epoch": 0.6919651892329488, "grad_norm": 0.2842797040939331, "learning_rate": 0.00014666826674981196, "loss": 0.2615, "step": 3419 }, { "epoch": 0.692167577413479, "grad_norm": 0.36719465255737305, "learning_rate": 
0.0001466401333521969, "loss": 0.218, "step": 3420 }, { "epoch": 0.6923699655940093, "grad_norm": 0.33825409412384033, "learning_rate": 0.00014661199523602255, "loss": 0.2335, "step": 3421 }, { "epoch": 0.6925723537745395, "grad_norm": 0.32189881801605225, "learning_rate": 0.0001465838524041356, "loss": 0.261, "step": 3422 }, { "epoch": 0.6927747419550698, "grad_norm": 0.3095569610595703, "learning_rate": 0.0001465557048593833, "loss": 0.2225, "step": 3423 }, { "epoch": 0.6929771301356, "grad_norm": 0.3033314645290375, "learning_rate": 0.00014652755260461325, "loss": 0.2272, "step": 3424 }, { "epoch": 0.6931795183161303, "grad_norm": 0.2859857380390167, "learning_rate": 0.00014649939564267362, "loss": 0.2419, "step": 3425 }, { "epoch": 0.6933819064966606, "grad_norm": 0.2597350776195526, "learning_rate": 0.0001464712339764131, "loss": 0.2393, "step": 3426 }, { "epoch": 0.6935842946771908, "grad_norm": 0.305396169424057, "learning_rate": 0.0001464430676086807, "loss": 0.2382, "step": 3427 }, { "epoch": 0.6937866828577212, "grad_norm": 0.3529883921146393, "learning_rate": 0.0001464148965423261, "loss": 0.2664, "step": 3428 }, { "epoch": 0.6939890710382514, "grad_norm": 0.3151325285434723, "learning_rate": 0.00014638672078019926, "loss": 0.2202, "step": 3429 }, { "epoch": 0.6941914592187817, "grad_norm": 0.27725449204444885, "learning_rate": 0.00014635854032515072, "loss": 0.2431, "step": 3430 }, { "epoch": 0.6943938473993119, "grad_norm": 0.2587081491947174, "learning_rate": 0.00014633035518003153, "loss": 0.2224, "step": 3431 }, { "epoch": 0.6945962355798422, "grad_norm": 0.26528117060661316, "learning_rate": 0.00014630216534769312, "loss": 0.2067, "step": 3432 }, { "epoch": 0.6947986237603724, "grad_norm": 0.28663206100463867, "learning_rate": 0.00014627397083098747, "loss": 0.2205, "step": 3433 }, { "epoch": 0.6950010119409027, "grad_norm": 0.2610286474227905, "learning_rate": 0.00014624577163276702, "loss": 0.2594, "step": 3434 }, { "epoch": 0.6952034001214329, 
"grad_norm": 0.48873916268348694, "learning_rate": 0.00014621756775588462, "loss": 0.2415, "step": 3435 }, { "epoch": 0.6954057883019632, "grad_norm": 0.27237123250961304, "learning_rate": 0.00014618935920319368, "loss": 0.2283, "step": 3436 }, { "epoch": 0.6956081764824934, "grad_norm": 0.3596338629722595, "learning_rate": 0.00014616114597754804, "loss": 0.2363, "step": 3437 }, { "epoch": 0.6958105646630237, "grad_norm": 0.30172982811927795, "learning_rate": 0.00014613292808180202, "loss": 0.2657, "step": 3438 }, { "epoch": 0.6960129528435539, "grad_norm": 0.2471524178981781, "learning_rate": 0.00014610470551881037, "loss": 0.2093, "step": 3439 }, { "epoch": 0.6962153410240842, "grad_norm": 0.3113197088241577, "learning_rate": 0.00014607647829142844, "loss": 0.2763, "step": 3440 }, { "epoch": 0.6964177292046144, "grad_norm": 0.614896833896637, "learning_rate": 0.00014604824640251186, "loss": 0.2017, "step": 3441 }, { "epoch": 0.6966201173851447, "grad_norm": 0.27417445182800293, "learning_rate": 0.00014602000985491692, "loss": 0.2402, "step": 3442 }, { "epoch": 0.6968225055656749, "grad_norm": 0.29607483744621277, "learning_rate": 0.00014599176865150027, "loss": 0.263, "step": 3443 }, { "epoch": 0.6970248937462052, "grad_norm": 0.348371684551239, "learning_rate": 0.00014596352279511903, "loss": 0.2236, "step": 3444 }, { "epoch": 0.6972272819267354, "grad_norm": 0.2907358407974243, "learning_rate": 0.00014593527228863083, "loss": 0.213, "step": 3445 }, { "epoch": 0.6974296701072658, "grad_norm": 0.3474874496459961, "learning_rate": 0.00014590701713489382, "loss": 0.2123, "step": 3446 }, { "epoch": 0.697632058287796, "grad_norm": 0.2992868721485138, "learning_rate": 0.00014587875733676646, "loss": 0.2418, "step": 3447 }, { "epoch": 0.6978344464683263, "grad_norm": 0.37498465180397034, "learning_rate": 0.0001458504928971079, "loss": 0.2517, "step": 3448 }, { "epoch": 0.6980368346488565, "grad_norm": 0.24434925615787506, "learning_rate": 0.0001458222238187775, "loss": 
0.2309, "step": 3449 }, { "epoch": 0.6982392228293868, "grad_norm": 0.3073577880859375, "learning_rate": 0.00014579395010463537, "loss": 0.2517, "step": 3450 }, { "epoch": 0.6982392228293868, "eval_loss": 0.26842841506004333, "eval_runtime": 0.7378, "eval_samples_per_second": 6.777, "eval_steps_per_second": 1.355, "step": 3450 }, { "epoch": 0.698441611009917, "grad_norm": 0.3026072382926941, "learning_rate": 0.00014576567175754183, "loss": 0.2191, "step": 3451 }, { "epoch": 0.6986439991904473, "grad_norm": 0.3033480644226074, "learning_rate": 0.00014573738878035785, "loss": 0.2746, "step": 3452 }, { "epoch": 0.6988463873709775, "grad_norm": 0.32554948329925537, "learning_rate": 0.0001457091011759448, "loss": 0.2847, "step": 3453 }, { "epoch": 0.6990487755515078, "grad_norm": 0.33661654591560364, "learning_rate": 0.0001456808089471645, "loss": 0.2413, "step": 3454 }, { "epoch": 0.699251163732038, "grad_norm": 0.24390999972820282, "learning_rate": 0.00014565251209687927, "loss": 0.2142, "step": 3455 }, { "epoch": 0.6994535519125683, "grad_norm": 0.3766001760959625, "learning_rate": 0.00014562421062795192, "loss": 0.2578, "step": 3456 }, { "epoch": 0.6996559400930986, "grad_norm": 0.31514355540275574, "learning_rate": 0.00014559590454324564, "loss": 0.223, "step": 3457 }, { "epoch": 0.6998583282736288, "grad_norm": 0.43481573462486267, "learning_rate": 0.00014556759384562416, "loss": 0.2558, "step": 3458 }, { "epoch": 0.7000607164541591, "grad_norm": 0.2606533169746399, "learning_rate": 0.00014553927853795169, "loss": 0.207, "step": 3459 }, { "epoch": 0.7002631046346893, "grad_norm": 0.3021109402179718, "learning_rate": 0.00014551095862309286, "loss": 0.2596, "step": 3460 }, { "epoch": 0.7004654928152196, "grad_norm": 0.4843895733356476, "learning_rate": 0.0001454826341039128, "loss": 0.1994, "step": 3461 }, { "epoch": 0.7006678809957498, "grad_norm": 0.313245564699173, "learning_rate": 0.00014545430498327702, "loss": 0.2487, "step": 3462 }, { "epoch": 
0.7008702691762801, "grad_norm": 0.24608662724494934, "learning_rate": 0.00014542597126405163, "loss": 0.219, "step": 3463 }, { "epoch": 0.7010726573568103, "grad_norm": 0.27048546075820923, "learning_rate": 0.00014539763294910311, "loss": 0.2236, "step": 3464 }, { "epoch": 0.7012750455373407, "grad_norm": 0.3017890453338623, "learning_rate": 0.00014536929004129844, "loss": 0.2483, "step": 3465 }, { "epoch": 0.7014774337178709, "grad_norm": 0.40976661443710327, "learning_rate": 0.00014534094254350506, "loss": 0.2344, "step": 3466 }, { "epoch": 0.7016798218984012, "grad_norm": 0.3057340979576111, "learning_rate": 0.00014531259045859086, "loss": 0.2256, "step": 3467 }, { "epoch": 0.7018822100789314, "grad_norm": 0.37210866808891296, "learning_rate": 0.0001452842337894242, "loss": 0.254, "step": 3468 }, { "epoch": 0.7020845982594617, "grad_norm": 0.2670847177505493, "learning_rate": 0.00014525587253887395, "loss": 0.246, "step": 3469 }, { "epoch": 0.7022869864399919, "grad_norm": 0.5012633800506592, "learning_rate": 0.0001452275067098094, "loss": 0.2463, "step": 3470 }, { "epoch": 0.7024893746205222, "grad_norm": 0.38355302810668945, "learning_rate": 0.00014519913630510028, "loss": 0.2273, "step": 3471 }, { "epoch": 0.7026917628010524, "grad_norm": 0.34492430090904236, "learning_rate": 0.00014517076132761686, "loss": 0.2533, "step": 3472 }, { "epoch": 0.7028941509815827, "grad_norm": 0.4136013984680176, "learning_rate": 0.0001451423817802297, "loss": 0.2704, "step": 3473 }, { "epoch": 0.7030965391621129, "grad_norm": 0.3448164761066437, "learning_rate": 0.00014511399766581006, "loss": 0.2584, "step": 3474 }, { "epoch": 0.7032989273426432, "grad_norm": 0.287451833486557, "learning_rate": 0.00014508560898722952, "loss": 0.2235, "step": 3475 }, { "epoch": 0.7035013155231734, "grad_norm": 0.3065091669559479, "learning_rate": 0.0001450572157473601, "loss": 0.2619, "step": 3476 }, { "epoch": 0.7037037037037037, "grad_norm": 0.2735513746738434, "learning_rate": 
0.00014502881794907442, "loss": 0.2376, "step": 3477 }, { "epoch": 0.7039060918842339, "grad_norm": 0.3324407637119293, "learning_rate": 0.0001450004155952454, "loss": 0.2468, "step": 3478 }, { "epoch": 0.7041084800647642, "grad_norm": 0.3772326707839966, "learning_rate": 0.00014497200868874652, "loss": 0.241, "step": 3479 }, { "epoch": 0.7043108682452944, "grad_norm": 0.2763902246952057, "learning_rate": 0.00014494359723245167, "loss": 0.2414, "step": 3480 }, { "epoch": 0.7045132564258247, "grad_norm": 0.3471759259700775, "learning_rate": 0.00014491518122923528, "loss": 0.2279, "step": 3481 }, { "epoch": 0.7047156446063549, "grad_norm": 0.3083358108997345, "learning_rate": 0.00014488676068197208, "loss": 0.2776, "step": 3482 }, { "epoch": 0.7049180327868853, "grad_norm": 0.28121310472488403, "learning_rate": 0.00014485833559353748, "loss": 0.225, "step": 3483 }, { "epoch": 0.7051204209674155, "grad_norm": 0.3246133625507355, "learning_rate": 0.00014482990596680718, "loss": 0.2256, "step": 3484 }, { "epoch": 0.7053228091479458, "grad_norm": 0.29464036226272583, "learning_rate": 0.00014480147180465734, "loss": 0.2081, "step": 3485 }, { "epoch": 0.7055251973284761, "grad_norm": 0.31715965270996094, "learning_rate": 0.00014477303310996473, "loss": 0.254, "step": 3486 }, { "epoch": 0.7057275855090063, "grad_norm": 0.27292385697364807, "learning_rate": 0.0001447445898856064, "loss": 0.2361, "step": 3487 }, { "epoch": 0.7059299736895366, "grad_norm": 0.2801492512226105, "learning_rate": 0.00014471614213445996, "loss": 0.2489, "step": 3488 }, { "epoch": 0.7061323618700668, "grad_norm": 0.44114720821380615, "learning_rate": 0.0001446876898594035, "loss": 0.2458, "step": 3489 }, { "epoch": 0.7063347500505971, "grad_norm": 0.33379796147346497, "learning_rate": 0.00014465923306331544, "loss": 0.2793, "step": 3490 }, { "epoch": 0.7065371382311273, "grad_norm": 0.3142834007740021, "learning_rate": 0.0001446307717490748, "loss": 0.2649, "step": 3491 }, { "epoch": 
0.7067395264116576, "grad_norm": 0.31089064478874207, "learning_rate": 0.00014460230591956097, "loss": 0.2507, "step": 3492 }, { "epoch": 0.7069419145921878, "grad_norm": 0.2820049524307251, "learning_rate": 0.00014457383557765386, "loss": 0.2474, "step": 3493 }, { "epoch": 0.7071443027727181, "grad_norm": 0.2603655457496643, "learning_rate": 0.00014454536072623373, "loss": 0.2381, "step": 3494 }, { "epoch": 0.7073466909532483, "grad_norm": 0.33057910203933716, "learning_rate": 0.00014451688136818145, "loss": 0.2445, "step": 3495 }, { "epoch": 0.7075490791337786, "grad_norm": 0.30307960510253906, "learning_rate": 0.0001444883975063782, "loss": 0.2287, "step": 3496 }, { "epoch": 0.7077514673143088, "grad_norm": 0.40663906931877136, "learning_rate": 0.0001444599091437057, "loss": 0.2353, "step": 3497 }, { "epoch": 0.7079538554948391, "grad_norm": 0.30998459458351135, "learning_rate": 0.0001444314162830461, "loss": 0.2435, "step": 3498 }, { "epoch": 0.7081562436753693, "grad_norm": 0.328166663646698, "learning_rate": 0.00014440291892728205, "loss": 0.2454, "step": 3499 }, { "epoch": 0.7083586318558996, "grad_norm": 0.297428160905838, "learning_rate": 0.00014437441707929657, "loss": 0.2609, "step": 3500 }, { "epoch": 0.7083586318558996, "eval_loss": 0.26719534397125244, "eval_runtime": 0.7389, "eval_samples_per_second": 6.766, "eval_steps_per_second": 1.353, "step": 3500 }, { "epoch": 0.7085610200364298, "grad_norm": 0.3447844982147217, "learning_rate": 0.00014434591074197317, "loss": 0.2263, "step": 3501 }, { "epoch": 0.7087634082169602, "grad_norm": 0.36139580607414246, "learning_rate": 0.00014431739991819584, "loss": 0.2489, "step": 3502 }, { "epoch": 0.7089657963974904, "grad_norm": 0.2836248576641083, "learning_rate": 0.000144288884610849, "loss": 0.225, "step": 3503 }, { "epoch": 0.7091681845780207, "grad_norm": 0.36919328570365906, "learning_rate": 0.00014426036482281752, "loss": 0.2653, "step": 3504 }, { "epoch": 0.7093705727585509, "grad_norm": 
0.27661412954330444, "learning_rate": 0.00014423184055698676, "loss": 0.221, "step": 3505 }, { "epoch": 0.7095729609390812, "grad_norm": 0.30864477157592773, "learning_rate": 0.0001442033118162425, "loss": 0.2465, "step": 3506 }, { "epoch": 0.7097753491196114, "grad_norm": 0.2647888958454132, "learning_rate": 0.00014417477860347098, "loss": 0.2402, "step": 3507 }, { "epoch": 0.7099777373001417, "grad_norm": 0.24299830198287964, "learning_rate": 0.00014414624092155885, "loss": 0.1997, "step": 3508 }, { "epoch": 0.7101801254806719, "grad_norm": 0.323573499917984, "learning_rate": 0.00014411769877339332, "loss": 0.2546, "step": 3509 }, { "epoch": 0.7103825136612022, "grad_norm": 0.2725816071033478, "learning_rate": 0.0001440891521618619, "loss": 0.2336, "step": 3510 }, { "epoch": 0.7105849018417324, "grad_norm": 0.3446320593357086, "learning_rate": 0.00014406060108985275, "loss": 0.2503, "step": 3511 }, { "epoch": 0.7107872900222627, "grad_norm": 0.35565873980522156, "learning_rate": 0.00014403204556025427, "loss": 0.283, "step": 3512 }, { "epoch": 0.7109896782027929, "grad_norm": 0.27790892124176025, "learning_rate": 0.00014400348557595544, "loss": 0.2213, "step": 3513 }, { "epoch": 0.7111920663833232, "grad_norm": 0.29456159472465515, "learning_rate": 0.0001439749211398457, "loss": 0.2316, "step": 3514 }, { "epoch": 0.7113944545638534, "grad_norm": 0.24280983209609985, "learning_rate": 0.00014394635225481477, "loss": 0.2218, "step": 3515 }, { "epoch": 0.7115968427443837, "grad_norm": 0.3166976571083069, "learning_rate": 0.00014391777892375313, "loss": 0.1995, "step": 3516 }, { "epoch": 0.711799230924914, "grad_norm": 0.24177271127700806, "learning_rate": 0.00014388920114955143, "loss": 0.187, "step": 3517 }, { "epoch": 0.7120016191054442, "grad_norm": 0.2750643193721771, "learning_rate": 0.00014386061893510087, "loss": 0.2276, "step": 3518 }, { "epoch": 0.7122040072859745, "grad_norm": 0.3380866050720215, "learning_rate": 0.0001438320322832931, "loss": 0.2652, 
"step": 3519 }, { "epoch": 0.7124063954665047, "grad_norm": 0.30895286798477173, "learning_rate": 0.00014380344119702023, "loss": 0.2374, "step": 3520 }, { "epoch": 0.7126087836470351, "grad_norm": 0.3229091763496399, "learning_rate": 0.0001437748456791748, "loss": 0.2416, "step": 3521 }, { "epoch": 0.7128111718275653, "grad_norm": 0.2924569845199585, "learning_rate": 0.00014374624573264982, "loss": 0.2296, "step": 3522 }, { "epoch": 0.7130135600080956, "grad_norm": 0.28520187735557556, "learning_rate": 0.00014371764136033872, "loss": 0.2299, "step": 3523 }, { "epoch": 0.7132159481886258, "grad_norm": 0.284915953874588, "learning_rate": 0.0001436890325651354, "loss": 0.2233, "step": 3524 }, { "epoch": 0.7134183363691561, "grad_norm": 0.3498363792896271, "learning_rate": 0.00014366041934993416, "loss": 0.2379, "step": 3525 }, { "epoch": 0.7136207245496863, "grad_norm": 0.29052790999412537, "learning_rate": 0.00014363180171762983, "loss": 0.2445, "step": 3526 }, { "epoch": 0.7138231127302166, "grad_norm": 0.4334908127784729, "learning_rate": 0.00014360317967111765, "loss": 0.2351, "step": 3527 }, { "epoch": 0.7140255009107468, "grad_norm": 0.2792114317417145, "learning_rate": 0.00014357455321329328, "loss": 0.2175, "step": 3528 }, { "epoch": 0.7142278890912771, "grad_norm": 0.3908980190753937, "learning_rate": 0.0001435459223470528, "loss": 0.279, "step": 3529 }, { "epoch": 0.7144302772718073, "grad_norm": 0.2973538339138031, "learning_rate": 0.0001435172870752928, "loss": 0.2501, "step": 3530 }, { "epoch": 0.7146326654523376, "grad_norm": 0.2654437720775604, "learning_rate": 0.00014348864740091038, "loss": 0.2043, "step": 3531 }, { "epoch": 0.7148350536328678, "grad_norm": 0.2809726297855377, "learning_rate": 0.0001434600033268029, "loss": 0.2059, "step": 3532 }, { "epoch": 0.7150374418133981, "grad_norm": 0.27874892950057983, "learning_rate": 0.00014343135485586828, "loss": 0.2401, "step": 3533 }, { "epoch": 0.7152398299939283, "grad_norm": 0.3021167814731598, 
"learning_rate": 0.00014340270199100495, "loss": 0.2552, "step": 3534 }, { "epoch": 0.7154422181744586, "grad_norm": 0.3264015316963196, "learning_rate": 0.0001433740447351116, "loss": 0.2306, "step": 3535 }, { "epoch": 0.7156446063549888, "grad_norm": 0.26608943939208984, "learning_rate": 0.00014334538309108757, "loss": 0.2053, "step": 3536 }, { "epoch": 0.7158469945355191, "grad_norm": 0.3866048753261566, "learning_rate": 0.00014331671706183246, "loss": 0.2176, "step": 3537 }, { "epoch": 0.7160493827160493, "grad_norm": 0.27158334851264954, "learning_rate": 0.00014328804665024645, "loss": 0.2598, "step": 3538 }, { "epoch": 0.7162517708965797, "grad_norm": 0.4619157016277313, "learning_rate": 0.0001432593718592301, "loss": 0.2526, "step": 3539 }, { "epoch": 0.7164541590771099, "grad_norm": 0.32520925998687744, "learning_rate": 0.00014323069269168444, "loss": 0.2662, "step": 3540 }, { "epoch": 0.7166565472576402, "grad_norm": 0.27976465225219727, "learning_rate": 0.00014320200915051085, "loss": 0.2724, "step": 3541 }, { "epoch": 0.7168589354381704, "grad_norm": 0.2563546299934387, "learning_rate": 0.00014317332123861133, "loss": 0.2409, "step": 3542 }, { "epoch": 0.7170613236187007, "grad_norm": 0.2647092044353485, "learning_rate": 0.00014314462895888817, "loss": 0.209, "step": 3543 }, { "epoch": 0.7172637117992309, "grad_norm": 0.26110538840293884, "learning_rate": 0.00014311593231424415, "loss": 0.222, "step": 3544 }, { "epoch": 0.7174660999797612, "grad_norm": 0.35597339272499084, "learning_rate": 0.00014308723130758254, "loss": 0.2434, "step": 3545 }, { "epoch": 0.7176684881602914, "grad_norm": 0.2322133183479309, "learning_rate": 0.00014305852594180692, "loss": 0.1974, "step": 3546 }, { "epoch": 0.7178708763408217, "grad_norm": 0.2905055582523346, "learning_rate": 0.0001430298162198215, "loss": 0.207, "step": 3547 }, { "epoch": 0.718073264521352, "grad_norm": 0.35906949639320374, "learning_rate": 0.00014300110214453078, "loss": 0.2312, "step": 3548 }, { 
"epoch": 0.7182756527018822, "grad_norm": 0.31365084648132324, "learning_rate": 0.00014297238371883974, "loss": 0.1912, "step": 3549 }, { "epoch": 0.7184780408824125, "grad_norm": 0.36804234981536865, "learning_rate": 0.00014294366094565384, "loss": 0.2453, "step": 3550 }, { "epoch": 0.7184780408824125, "eval_loss": 0.2697356641292572, "eval_runtime": 0.7383, "eval_samples_per_second": 6.772, "eval_steps_per_second": 1.354, "step": 3550 }, { "epoch": 0.7186804290629427, "grad_norm": 0.32128703594207764, "learning_rate": 0.0001429149338278789, "loss": 0.1886, "step": 3551 }, { "epoch": 0.718882817243473, "grad_norm": 0.332903116941452, "learning_rate": 0.00014288620236842128, "loss": 0.2355, "step": 3552 }, { "epoch": 0.7190852054240032, "grad_norm": 0.283093124628067, "learning_rate": 0.00014285746657018768, "loss": 0.2728, "step": 3553 }, { "epoch": 0.7192875936045335, "grad_norm": 0.24236519634723663, "learning_rate": 0.00014282872643608534, "loss": 0.1791, "step": 3554 }, { "epoch": 0.7194899817850637, "grad_norm": 0.4784996211528778, "learning_rate": 0.00014279998196902182, "loss": 0.2489, "step": 3555 }, { "epoch": 0.719692369965594, "grad_norm": 0.26929202675819397, "learning_rate": 0.00014277123317190524, "loss": 0.215, "step": 3556 }, { "epoch": 0.7198947581461242, "grad_norm": 0.3588574528694153, "learning_rate": 0.0001427424800476441, "loss": 0.2819, "step": 3557 }, { "epoch": 0.7200971463266546, "grad_norm": 0.29234176874160767, "learning_rate": 0.00014271372259914728, "loss": 0.2522, "step": 3558 }, { "epoch": 0.7202995345071848, "grad_norm": 0.3432823121547699, "learning_rate": 0.00014268496082932422, "loss": 0.2103, "step": 3559 }, { "epoch": 0.7205019226877151, "grad_norm": 0.3589431941509247, "learning_rate": 0.0001426561947410847, "loss": 0.2251, "step": 3560 }, { "epoch": 0.7207043108682453, "grad_norm": 0.2709294855594635, "learning_rate": 0.00014262742433733902, "loss": 0.2039, "step": 3561 }, { "epoch": 0.7209066990487756, "grad_norm": 
0.32245975732803345, "learning_rate": 0.0001425986496209978, "loss": 0.2192, "step": 3562 }, { "epoch": 0.7211090872293058, "grad_norm": 0.3126862347126007, "learning_rate": 0.0001425698705949722, "loss": 0.2463, "step": 3563 }, { "epoch": 0.7213114754098361, "grad_norm": 0.28890764713287354, "learning_rate": 0.00014254108726217374, "loss": 0.2187, "step": 3564 }, { "epoch": 0.7215138635903663, "grad_norm": 0.3311489522457123, "learning_rate": 0.0001425122996255145, "loss": 0.2877, "step": 3565 }, { "epoch": 0.7217162517708966, "grad_norm": 0.26622334122657776, "learning_rate": 0.00014248350768790685, "loss": 0.2306, "step": 3566 }, { "epoch": 0.7219186399514268, "grad_norm": 0.27433136105537415, "learning_rate": 0.00014245471145226364, "loss": 0.2277, "step": 3567 }, { "epoch": 0.7221210281319571, "grad_norm": 0.2935827970504761, "learning_rate": 0.00014242591092149823, "loss": 0.2446, "step": 3568 }, { "epoch": 0.7223234163124873, "grad_norm": 0.2975723445415497, "learning_rate": 0.00014239710609852428, "loss": 0.2466, "step": 3569 }, { "epoch": 0.7225258044930176, "grad_norm": 0.3258729577064514, "learning_rate": 0.000142368296986256, "loss": 0.2191, "step": 3570 }, { "epoch": 0.7227281926735478, "grad_norm": 0.3339604139328003, "learning_rate": 0.00014233948358760803, "loss": 0.1956, "step": 3571 }, { "epoch": 0.7229305808540781, "grad_norm": 0.39879077672958374, "learning_rate": 0.0001423106659054954, "loss": 0.2573, "step": 3572 }, { "epoch": 0.7231329690346083, "grad_norm": 0.31401699781417847, "learning_rate": 0.0001422818439428335, "loss": 0.2251, "step": 3573 }, { "epoch": 0.7233353572151386, "grad_norm": 0.26941850781440735, "learning_rate": 0.0001422530177025383, "loss": 0.216, "step": 3574 }, { "epoch": 0.7235377453956688, "grad_norm": 0.27729305624961853, "learning_rate": 0.00014222418718752615, "loss": 0.1955, "step": 3575 }, { "epoch": 0.7237401335761992, "grad_norm": 0.3241328299045563, "learning_rate": 0.00014219535240071377, "loss": 0.2437, 
"step": 3576 }, { "epoch": 0.7239425217567295, "grad_norm": 0.2551616430282593, "learning_rate": 0.0001421665133450184, "loss": 0.2118, "step": 3577 }, { "epoch": 0.7241449099372597, "grad_norm": 0.27195173501968384, "learning_rate": 0.00014213767002335765, "loss": 0.2311, "step": 3578 }, { "epoch": 0.72434729811779, "grad_norm": 0.29431605339050293, "learning_rate": 0.0001421088224386496, "loss": 0.2416, "step": 3579 }, { "epoch": 0.7245496862983202, "grad_norm": 0.42031076550483704, "learning_rate": 0.00014207997059381274, "loss": 0.2509, "step": 3580 }, { "epoch": 0.7247520744788505, "grad_norm": 0.2811602056026459, "learning_rate": 0.00014205111449176597, "loss": 0.2103, "step": 3581 }, { "epoch": 0.7249544626593807, "grad_norm": 0.6383354067802429, "learning_rate": 0.00014202225413542871, "loss": 0.2508, "step": 3582 }, { "epoch": 0.725156850839911, "grad_norm": 0.28183814883232117, "learning_rate": 0.0001419933895277207, "loss": 0.2053, "step": 3583 }, { "epoch": 0.7253592390204412, "grad_norm": 0.3368355631828308, "learning_rate": 0.00014196452067156216, "loss": 0.283, "step": 3584 }, { "epoch": 0.7255616272009715, "grad_norm": 0.24328923225402832, "learning_rate": 0.00014193564756987374, "loss": 0.2156, "step": 3585 }, { "epoch": 0.7257640153815017, "grad_norm": 0.28014034032821655, "learning_rate": 0.00014190677022557654, "loss": 0.2245, "step": 3586 }, { "epoch": 0.725966403562032, "grad_norm": 0.3064032793045044, "learning_rate": 0.00014187788864159206, "loss": 0.2699, "step": 3587 }, { "epoch": 0.7261687917425622, "grad_norm": 0.325112909078598, "learning_rate": 0.0001418490028208422, "loss": 0.2325, "step": 3588 }, { "epoch": 0.7263711799230925, "grad_norm": 0.32190853357315063, "learning_rate": 0.00014182011276624938, "loss": 0.2499, "step": 3589 }, { "epoch": 0.7265735681036227, "grad_norm": 0.299363911151886, "learning_rate": 0.00014179121848073632, "loss": 0.2269, "step": 3590 }, { "epoch": 0.726775956284153, "grad_norm": 0.2915518879890442, 
"learning_rate": 0.00014176231996722633, "loss": 0.2687, "step": 3591 }, { "epoch": 0.7269783444646832, "grad_norm": 0.4215262234210968, "learning_rate": 0.000141733417228643, "loss": 0.2414, "step": 3592 }, { "epoch": 0.7271807326452135, "grad_norm": 0.28340229392051697, "learning_rate": 0.0001417045102679104, "loss": 0.2472, "step": 3593 }, { "epoch": 0.7273831208257437, "grad_norm": 0.2818836569786072, "learning_rate": 0.00014167559908795306, "loss": 0.2392, "step": 3594 }, { "epoch": 0.7275855090062741, "grad_norm": 0.3229272663593292, "learning_rate": 0.00014164668369169591, "loss": 0.2371, "step": 3595 }, { "epoch": 0.7277878971868043, "grad_norm": 0.3302474319934845, "learning_rate": 0.0001416177640820643, "loss": 0.2704, "step": 3596 }, { "epoch": 0.7279902853673346, "grad_norm": 0.5139620900154114, "learning_rate": 0.000141588840261984, "loss": 0.1976, "step": 3597 }, { "epoch": 0.7281926735478648, "grad_norm": 0.30498552322387695, "learning_rate": 0.00014155991223438122, "loss": 0.2457, "step": 3598 }, { "epoch": 0.7283950617283951, "grad_norm": 0.35138848423957825, "learning_rate": 0.00014153098000218263, "loss": 0.2284, "step": 3599 }, { "epoch": 0.7285974499089253, "grad_norm": 0.281194806098938, "learning_rate": 0.00014150204356831524, "loss": 0.2158, "step": 3600 }, { "epoch": 0.7285974499089253, "eval_loss": 0.26394587755203247, "eval_runtime": 0.7386, "eval_samples_per_second": 6.77, "eval_steps_per_second": 1.354, "step": 3600 }, { "epoch": 0.7287998380894556, "grad_norm": 0.2820914685726166, "learning_rate": 0.00014147310293570657, "loss": 0.2492, "step": 3601 }, { "epoch": 0.7290022262699858, "grad_norm": 0.24649406969547272, "learning_rate": 0.00014144415810728452, "loss": 0.2352, "step": 3602 }, { "epoch": 0.7292046144505161, "grad_norm": 0.2653089761734009, "learning_rate": 0.00014141520908597741, "loss": 0.2308, "step": 3603 }, { "epoch": 0.7294070026310463, "grad_norm": 0.25622621178627014, "learning_rate": 0.000141386255874714, "loss": 
0.2085, "step": 3604 }, { "epoch": 0.7296093908115766, "grad_norm": 0.2600463926792145, "learning_rate": 0.0001413572984764235, "loss": 0.2411, "step": 3605 }, { "epoch": 0.7298117789921068, "grad_norm": 0.36637434363365173, "learning_rate": 0.0001413283368940355, "loss": 0.2619, "step": 3606 }, { "epoch": 0.7300141671726371, "grad_norm": 0.3341212868690491, "learning_rate": 0.00014129937113048003, "loss": 0.2627, "step": 3607 }, { "epoch": 0.7302165553531674, "grad_norm": 0.2946942150592804, "learning_rate": 0.00014127040118868753, "loss": 0.2332, "step": 3608 }, { "epoch": 0.7304189435336976, "grad_norm": 0.33799755573272705, "learning_rate": 0.0001412414270715889, "loss": 0.2814, "step": 3609 }, { "epoch": 0.7306213317142279, "grad_norm": 0.4021568298339844, "learning_rate": 0.00014121244878211538, "loss": 0.2477, "step": 3610 }, { "epoch": 0.7308237198947581, "grad_norm": 0.8903542160987854, "learning_rate": 0.00014118346632319877, "loss": 0.2525, "step": 3611 }, { "epoch": 0.7310261080752884, "grad_norm": 0.27222204208374023, "learning_rate": 0.00014115447969777114, "loss": 0.2341, "step": 3612 }, { "epoch": 0.7312284962558186, "grad_norm": 0.47628235816955566, "learning_rate": 0.0001411254889087651, "loss": 0.2444, "step": 3613 }, { "epoch": 0.731430884436349, "grad_norm": 0.290464848279953, "learning_rate": 0.0001410964939591136, "loss": 0.2337, "step": 3614 }, { "epoch": 0.7316332726168792, "grad_norm": 0.3501092791557312, "learning_rate": 0.00014106749485175008, "loss": 0.2542, "step": 3615 }, { "epoch": 0.7318356607974095, "grad_norm": 0.3015660047531128, "learning_rate": 0.00014103849158960834, "loss": 0.2356, "step": 3616 }, { "epoch": 0.7320380489779397, "grad_norm": 0.4557827413082123, "learning_rate": 0.00014100948417562265, "loss": 0.2674, "step": 3617 }, { "epoch": 0.73224043715847, "grad_norm": 0.448643296957016, "learning_rate": 0.00014098047261272765, "loss": 0.2324, "step": 3618 }, { "epoch": 0.7324428253390002, "grad_norm": 0.3245983123779297, 
"learning_rate": 0.00014095145690385842, "loss": 0.2622, "step": 3619 }, { "epoch": 0.7326452135195305, "grad_norm": 0.3534657955169678, "learning_rate": 0.00014092243705195046, "loss": 0.2672, "step": 3620 }, { "epoch": 0.7328476017000607, "grad_norm": 0.5434038043022156, "learning_rate": 0.00014089341305993975, "loss": 0.2324, "step": 3621 }, { "epoch": 0.733049989880591, "grad_norm": 0.32712897658348083, "learning_rate": 0.0001408643849307626, "loss": 0.2647, "step": 3622 }, { "epoch": 0.7332523780611212, "grad_norm": 0.38458430767059326, "learning_rate": 0.00014083535266735576, "loss": 0.2628, "step": 3623 }, { "epoch": 0.7334547662416515, "grad_norm": 0.30817049741744995, "learning_rate": 0.0001408063162726564, "loss": 0.2344, "step": 3624 }, { "epoch": 0.7336571544221817, "grad_norm": 0.5152879953384399, "learning_rate": 0.00014077727574960213, "loss": 0.2419, "step": 3625 }, { "epoch": 0.733859542602712, "grad_norm": 0.3348987400531769, "learning_rate": 0.000140748231101131, "loss": 0.2133, "step": 3626 }, { "epoch": 0.7340619307832422, "grad_norm": 0.28777918219566345, "learning_rate": 0.0001407191823301814, "loss": 0.2384, "step": 3627 }, { "epoch": 0.7342643189637725, "grad_norm": 0.26365089416503906, "learning_rate": 0.0001406901294396922, "loss": 0.2235, "step": 3628 }, { "epoch": 0.7344667071443027, "grad_norm": 0.2960914671421051, "learning_rate": 0.00014066107243260268, "loss": 0.2433, "step": 3629 }, { "epoch": 0.734669095324833, "grad_norm": 0.2921745479106903, "learning_rate": 0.00014063201131185246, "loss": 0.2398, "step": 3630 }, { "epoch": 0.7348714835053632, "grad_norm": 0.36642351746559143, "learning_rate": 0.0001406029460803817, "loss": 0.2413, "step": 3631 }, { "epoch": 0.7350738716858936, "grad_norm": 0.23936298489570618, "learning_rate": 0.0001405738767411309, "loss": 0.2219, "step": 3632 }, { "epoch": 0.7352762598664238, "grad_norm": 0.26252833008766174, "learning_rate": 0.000140544803297041, "loss": 0.2292, "step": 3633 }, { "epoch": 
0.7354786480469541, "grad_norm": 0.3257715702056885, "learning_rate": 0.0001405157257510533, "loss": 0.2147, "step": 3634 }, { "epoch": 0.7356810362274843, "grad_norm": 0.27792125940322876, "learning_rate": 0.00014048664410610962, "loss": 0.2064, "step": 3635 }, { "epoch": 0.7358834244080146, "grad_norm": 0.4515530467033386, "learning_rate": 0.0001404575583651521, "loss": 0.2262, "step": 3636 }, { "epoch": 0.7360858125885448, "grad_norm": 0.3586551547050476, "learning_rate": 0.00014042846853112335, "loss": 0.2715, "step": 3637 }, { "epoch": 0.7362882007690751, "grad_norm": 0.2839229702949524, "learning_rate": 0.00014039937460696636, "loss": 0.2308, "step": 3638 }, { "epoch": 0.7364905889496054, "grad_norm": 0.2356303334236145, "learning_rate": 0.0001403702765956246, "loss": 0.1926, "step": 3639 }, { "epoch": 0.7366929771301356, "grad_norm": 0.26308801770210266, "learning_rate": 0.0001403411745000418, "loss": 0.2178, "step": 3640 }, { "epoch": 0.7368953653106659, "grad_norm": 0.3480371832847595, "learning_rate": 0.00014031206832316225, "loss": 0.2719, "step": 3641 }, { "epoch": 0.7370977534911961, "grad_norm": 0.28486573696136475, "learning_rate": 0.00014028295806793064, "loss": 0.2486, "step": 3642 }, { "epoch": 0.7373001416717264, "grad_norm": 0.30231842398643494, "learning_rate": 0.000140253843737292, "loss": 0.2635, "step": 3643 }, { "epoch": 0.7375025298522566, "grad_norm": 0.31522294878959656, "learning_rate": 0.00014022472533419187, "loss": 0.2559, "step": 3644 }, { "epoch": 0.7377049180327869, "grad_norm": 0.25341594219207764, "learning_rate": 0.00014019560286157606, "loss": 0.2218, "step": 3645 }, { "epoch": 0.7379073062133171, "grad_norm": 0.3260419964790344, "learning_rate": 0.00014016647632239093, "loss": 0.2633, "step": 3646 }, { "epoch": 0.7381096943938474, "grad_norm": 0.3874565362930298, "learning_rate": 0.0001401373457195832, "loss": 0.2387, "step": 3647 }, { "epoch": 0.7383120825743776, "grad_norm": 0.288016676902771, "learning_rate": 
0.00014010821105609996, "loss": 0.2298, "step": 3648 }, { "epoch": 0.738514470754908, "grad_norm": 0.2668969929218292, "learning_rate": 0.0001400790723348888, "loss": 0.2076, "step": 3649 }, { "epoch": 0.7387168589354381, "grad_norm": 0.33233851194381714, "learning_rate": 0.00014004992955889766, "loss": 0.2395, "step": 3650 }, { "epoch": 0.7387168589354381, "eval_loss": 0.2731722295284271, "eval_runtime": 0.74, "eval_samples_per_second": 6.757, "eval_steps_per_second": 1.351, "step": 3650 }, { "epoch": 0.7389192471159685, "grad_norm": 0.43040069937705994, "learning_rate": 0.00014002078273107487, "loss": 0.3069, "step": 3651 }, { "epoch": 0.7391216352964987, "grad_norm": 0.2473803013563156, "learning_rate": 0.0001399916318543692, "loss": 0.2209, "step": 3652 }, { "epoch": 0.739324023477029, "grad_norm": 0.27324000000953674, "learning_rate": 0.00013996247693172985, "loss": 0.2068, "step": 3653 }, { "epoch": 0.7395264116575592, "grad_norm": 0.275651216506958, "learning_rate": 0.00013993331796610642, "loss": 0.2329, "step": 3654 }, { "epoch": 0.7397287998380895, "grad_norm": 0.30674195289611816, "learning_rate": 0.0001399041549604489, "loss": 0.2697, "step": 3655 }, { "epoch": 0.7399311880186197, "grad_norm": 0.29092442989349365, "learning_rate": 0.0001398749879177077, "loss": 0.216, "step": 3656 }, { "epoch": 0.74013357619915, "grad_norm": 0.29733288288116455, "learning_rate": 0.0001398458168408336, "loss": 0.2313, "step": 3657 }, { "epoch": 0.7403359643796802, "grad_norm": 0.2697985768318176, "learning_rate": 0.00013981664173277783, "loss": 0.2458, "step": 3658 }, { "epoch": 0.7405383525602105, "grad_norm": 0.3197477161884308, "learning_rate": 0.00013978746259649209, "loss": 0.2327, "step": 3659 }, { "epoch": 0.7407407407407407, "grad_norm": 0.3050107955932617, "learning_rate": 0.00013975827943492835, "loss": 0.2245, "step": 3660 }, { "epoch": 0.740943128921271, "grad_norm": 0.30876925587654114, "learning_rate": 0.0001397290922510391, "loss": 0.2203, "step": 3661 }, 
{ "epoch": 0.7411455171018012, "grad_norm": 0.2831538915634155, "learning_rate": 0.00013969990104777713, "loss": 0.2244, "step": 3662 }, { "epoch": 0.7413479052823315, "grad_norm": 0.27325353026390076, "learning_rate": 0.00013967070582809575, "loss": 0.2191, "step": 3663 }, { "epoch": 0.7415502934628617, "grad_norm": 0.26076215505599976, "learning_rate": 0.0001396415065949486, "loss": 0.2338, "step": 3664 }, { "epoch": 0.741752681643392, "grad_norm": 0.39337509870529175, "learning_rate": 0.0001396123033512898, "loss": 0.2139, "step": 3665 }, { "epoch": 0.7419550698239222, "grad_norm": 0.4706534743309021, "learning_rate": 0.0001395830961000738, "loss": 0.2194, "step": 3666 }, { "epoch": 0.7421574580044525, "grad_norm": 0.3701160252094269, "learning_rate": 0.00013955388484425543, "loss": 0.2305, "step": 3667 }, { "epoch": 0.7423598461849829, "grad_norm": 0.31697842478752136, "learning_rate": 0.00013952466958679004, "loss": 0.2156, "step": 3668 }, { "epoch": 0.742562234365513, "grad_norm": 0.2615698277950287, "learning_rate": 0.0001394954503306333, "loss": 0.2608, "step": 3669 }, { "epoch": 0.7427646225460434, "grad_norm": 0.25155559182167053, "learning_rate": 0.00013946622707874135, "loss": 0.2329, "step": 3670 }, { "epoch": 0.7429670107265736, "grad_norm": 0.3719973862171173, "learning_rate": 0.00013943699983407062, "loss": 0.2478, "step": 3671 }, { "epoch": 0.7431693989071039, "grad_norm": 0.2693372964859009, "learning_rate": 0.00013940776859957808, "loss": 0.2092, "step": 3672 }, { "epoch": 0.7433717870876341, "grad_norm": 0.33275237679481506, "learning_rate": 0.00013937853337822102, "loss": 0.2544, "step": 3673 }, { "epoch": 0.7435741752681644, "grad_norm": 0.30477502942085266, "learning_rate": 0.00013934929417295714, "loss": 0.2512, "step": 3674 }, { "epoch": 0.7437765634486946, "grad_norm": 0.3011702299118042, "learning_rate": 0.00013932005098674457, "loss": 0.2213, "step": 3675 }, { "epoch": 0.7439789516292249, "grad_norm": 0.3222098648548126, "learning_rate": 
0.00013929080382254182, "loss": 0.2227, "step": 3676 }, { "epoch": 0.7441813398097551, "grad_norm": 0.2595398724079132, "learning_rate": 0.0001392615526833078, "loss": 0.2112, "step": 3677 }, { "epoch": 0.7443837279902854, "grad_norm": 0.3210145831108093, "learning_rate": 0.00013923229757200185, "loss": 0.2448, "step": 3678 }, { "epoch": 0.7445861161708156, "grad_norm": 0.4244385063648224, "learning_rate": 0.0001392030384915837, "loss": 0.2458, "step": 3679 }, { "epoch": 0.7447885043513459, "grad_norm": 0.3760313391685486, "learning_rate": 0.00013917377544501344, "loss": 0.228, "step": 3680 }, { "epoch": 0.7449908925318761, "grad_norm": 0.25254809856414795, "learning_rate": 0.00013914450843525167, "loss": 0.2259, "step": 3681 }, { "epoch": 0.7451932807124064, "grad_norm": 0.3769349157810211, "learning_rate": 0.00013911523746525922, "loss": 0.2457, "step": 3682 }, { "epoch": 0.7453956688929366, "grad_norm": 0.35421350598335266, "learning_rate": 0.00013908596253799752, "loss": 0.2095, "step": 3683 }, { "epoch": 0.7455980570734669, "grad_norm": 0.33458277583122253, "learning_rate": 0.00013905668365642827, "loss": 0.264, "step": 3684 }, { "epoch": 0.7458004452539971, "grad_norm": 0.31940433382987976, "learning_rate": 0.00013902740082351355, "loss": 0.2836, "step": 3685 }, { "epoch": 0.7460028334345274, "grad_norm": 0.29059967398643494, "learning_rate": 0.00013899811404221595, "loss": 0.2041, "step": 3686 }, { "epoch": 0.7462052216150576, "grad_norm": 0.2859112024307251, "learning_rate": 0.00013896882331549835, "loss": 0.2211, "step": 3687 }, { "epoch": 0.746407609795588, "grad_norm": 0.2609440088272095, "learning_rate": 0.0001389395286463241, "loss": 0.2189, "step": 3688 }, { "epoch": 0.7466099979761182, "grad_norm": 0.2980991005897522, "learning_rate": 0.00013891023003765693, "loss": 0.2549, "step": 3689 }, { "epoch": 0.7468123861566485, "grad_norm": 0.28233975172042847, "learning_rate": 0.00013888092749246098, "loss": 0.2378, "step": 3690 }, { "epoch": 
0.7470147743371787, "grad_norm": 0.41510292887687683, "learning_rate": 0.00013885162101370075, "loss": 0.2489, "step": 3691 }, { "epoch": 0.747217162517709, "grad_norm": 0.28230106830596924, "learning_rate": 0.00013882231060434116, "loss": 0.193, "step": 3692 }, { "epoch": 0.7474195506982392, "grad_norm": 0.5089678764343262, "learning_rate": 0.00013879299626734756, "loss": 0.2595, "step": 3693 }, { "epoch": 0.7476219388787695, "grad_norm": 0.36559078097343445, "learning_rate": 0.00013876367800568564, "loss": 0.2262, "step": 3694 }, { "epoch": 0.7478243270592997, "grad_norm": 0.3150606155395508, "learning_rate": 0.00013873435582232156, "loss": 0.2326, "step": 3695 }, { "epoch": 0.74802671523983, "grad_norm": 0.3053276836872101, "learning_rate": 0.00013870502972022173, "loss": 0.2593, "step": 3696 }, { "epoch": 0.7482291034203602, "grad_norm": 0.3074091374874115, "learning_rate": 0.00013867569970235316, "loss": 0.2233, "step": 3697 }, { "epoch": 0.7484314916008905, "grad_norm": 0.3457677364349365, "learning_rate": 0.0001386463657716831, "loss": 0.2811, "step": 3698 }, { "epoch": 0.7486338797814208, "grad_norm": 0.28982600569725037, "learning_rate": 0.00013861702793117924, "loss": 0.2437, "step": 3699 }, { "epoch": 0.748836267961951, "grad_norm": 0.2695489823818207, "learning_rate": 0.00013858768618380972, "loss": 0.2155, "step": 3700 }, { "epoch": 0.748836267961951, "eval_loss": 0.2748314142227173, "eval_runtime": 0.7396, "eval_samples_per_second": 6.761, "eval_steps_per_second": 1.352, "step": 3700 }, { "epoch": 0.7490386561424813, "grad_norm": 0.26668769121170044, "learning_rate": 0.00013855834053254302, "loss": 0.2064, "step": 3701 }, { "epoch": 0.7492410443230115, "grad_norm": 0.2459854632616043, "learning_rate": 0.000138528990980348, "loss": 0.2249, "step": 3702 }, { "epoch": 0.7494434325035418, "grad_norm": 0.3013794720172882, "learning_rate": 0.00013849963753019394, "loss": 0.2528, "step": 3703 }, { "epoch": 0.749645820684072, "grad_norm": 0.3634145259857178, 
"learning_rate": 0.00013847028018505056, "loss": 0.2639, "step": 3704 }, { "epoch": 0.7498482088646024, "grad_norm": 0.2606116831302643, "learning_rate": 0.00013844091894788787, "loss": 0.2357, "step": 3705 }, { "epoch": 0.7500505970451325, "grad_norm": 0.2868957817554474, "learning_rate": 0.00013841155382167636, "loss": 0.2351, "step": 3706 }, { "epoch": 0.7502529852256629, "grad_norm": 0.27671927213668823, "learning_rate": 0.0001383821848093869, "loss": 0.2715, "step": 3707 }, { "epoch": 0.7504553734061931, "grad_norm": 0.2719424068927765, "learning_rate": 0.0001383528119139907, "loss": 0.2122, "step": 3708 }, { "epoch": 0.7506577615867234, "grad_norm": 0.305531769990921, "learning_rate": 0.00013832343513845943, "loss": 0.2612, "step": 3709 }, { "epoch": 0.7508601497672536, "grad_norm": 0.27149519324302673, "learning_rate": 0.00013829405448576512, "loss": 0.2141, "step": 3710 }, { "epoch": 0.7510625379477839, "grad_norm": 0.30463123321533203, "learning_rate": 0.00013826466995888018, "loss": 0.2458, "step": 3711 }, { "epoch": 0.7512649261283141, "grad_norm": 0.31632566452026367, "learning_rate": 0.00013823528156077744, "loss": 0.2942, "step": 3712 }, { "epoch": 0.7514673143088444, "grad_norm": 0.25641223788261414, "learning_rate": 0.00013820588929443014, "loss": 0.2307, "step": 3713 }, { "epoch": 0.7516697024893746, "grad_norm": 0.3119845688343048, "learning_rate": 0.0001381764931628118, "loss": 0.2242, "step": 3714 }, { "epoch": 0.7518720906699049, "grad_norm": 0.3371204435825348, "learning_rate": 0.00013814709316889648, "loss": 0.228, "step": 3715 }, { "epoch": 0.7520744788504351, "grad_norm": 0.26916682720184326, "learning_rate": 0.00013811768931565855, "loss": 0.2293, "step": 3716 }, { "epoch": 0.7522768670309654, "grad_norm": 0.3246425986289978, "learning_rate": 0.00013808828160607282, "loss": 0.2447, "step": 3717 }, { "epoch": 0.7524792552114956, "grad_norm": 0.2535354495048523, "learning_rate": 0.00013805887004311436, "loss": 0.2232, "step": 3718 }, { 
"epoch": 0.7526816433920259, "grad_norm": 0.2773735225200653, "learning_rate": 0.00013802945462975882, "loss": 0.2405, "step": 3719 }, { "epoch": 0.7528840315725561, "grad_norm": 0.2671958804130554, "learning_rate": 0.00013800003536898207, "loss": 0.2619, "step": 3720 }, { "epoch": 0.7530864197530864, "grad_norm": 0.3313668668270111, "learning_rate": 0.00013797061226376048, "loss": 0.204, "step": 3721 }, { "epoch": 0.7532888079336166, "grad_norm": 0.2732089161872864, "learning_rate": 0.00013794118531707076, "loss": 0.1993, "step": 3722 }, { "epoch": 0.7534911961141469, "grad_norm": 0.3743329346179962, "learning_rate": 0.00013791175453189, "loss": 0.2622, "step": 3723 }, { "epoch": 0.7536935842946771, "grad_norm": 0.5092394351959229, "learning_rate": 0.00013788231991119577, "loss": 0.2615, "step": 3724 }, { "epoch": 0.7538959724752075, "grad_norm": 0.2902195453643799, "learning_rate": 0.00013785288145796586, "loss": 0.276, "step": 3725 }, { "epoch": 0.7540983606557377, "grad_norm": 0.35427287220954895, "learning_rate": 0.00013782343917517856, "loss": 0.2512, "step": 3726 }, { "epoch": 0.754300748836268, "grad_norm": 0.239785298705101, "learning_rate": 0.00013779399306581262, "loss": 0.2079, "step": 3727 }, { "epoch": 0.7545031370167982, "grad_norm": 0.2598539888858795, "learning_rate": 0.00013776454313284706, "loss": 0.2382, "step": 3728 }, { "epoch": 0.7547055251973285, "grad_norm": 0.2689754068851471, "learning_rate": 0.00013773508937926123, "loss": 0.2359, "step": 3729 }, { "epoch": 0.7549079133778588, "grad_norm": 0.26831790804862976, "learning_rate": 0.00013770563180803502, "loss": 0.2067, "step": 3730 }, { "epoch": 0.755110301558389, "grad_norm": 0.30765849351882935, "learning_rate": 0.00013767617042214863, "loss": 0.2504, "step": 3731 }, { "epoch": 0.7553126897389193, "grad_norm": 0.33630865812301636, "learning_rate": 0.00013764670522458262, "loss": 0.2614, "step": 3732 }, { "epoch": 0.7555150779194495, "grad_norm": 0.279212087392807, "learning_rate": 
0.00013761723621831803, "loss": 0.2096, "step": 3733 }, { "epoch": 0.7557174660999798, "grad_norm": 0.3570208251476288, "learning_rate": 0.00013758776340633616, "loss": 0.2428, "step": 3734 }, { "epoch": 0.75591985428051, "grad_norm": 0.28167930245399475, "learning_rate": 0.00013755828679161883, "loss": 0.2107, "step": 3735 }, { "epoch": 0.7561222424610403, "grad_norm": 0.2791215181350708, "learning_rate": 0.00013752880637714812, "loss": 0.2581, "step": 3736 }, { "epoch": 0.7563246306415705, "grad_norm": 0.25869086384773254, "learning_rate": 0.00013749932216590655, "loss": 0.2606, "step": 3737 }, { "epoch": 0.7565270188221008, "grad_norm": 0.32635021209716797, "learning_rate": 0.00013746983416087707, "loss": 0.2695, "step": 3738 }, { "epoch": 0.756729407002631, "grad_norm": 0.3721697926521301, "learning_rate": 0.00013744034236504293, "loss": 0.2272, "step": 3739 }, { "epoch": 0.7569317951831613, "grad_norm": 0.35051414370536804, "learning_rate": 0.0001374108467813878, "loss": 0.2694, "step": 3740 }, { "epoch": 0.7571341833636915, "grad_norm": 0.3295314610004425, "learning_rate": 0.0001373813474128957, "loss": 0.2536, "step": 3741 }, { "epoch": 0.7573365715442218, "grad_norm": 0.3035524785518646, "learning_rate": 0.00013735184426255117, "loss": 0.2624, "step": 3742 }, { "epoch": 0.757538959724752, "grad_norm": 0.6171966791152954, "learning_rate": 0.00013732233733333894, "loss": 0.2573, "step": 3743 }, { "epoch": 0.7577413479052824, "grad_norm": 0.519873857498169, "learning_rate": 0.00013729282662824422, "loss": 0.2199, "step": 3744 }, { "epoch": 0.7579437360858126, "grad_norm": 0.3408195376396179, "learning_rate": 0.00013726331215025266, "loss": 0.2143, "step": 3745 }, { "epoch": 0.7581461242663429, "grad_norm": 0.37395820021629333, "learning_rate": 0.00013723379390235014, "loss": 0.2184, "step": 3746 }, { "epoch": 0.7583485124468731, "grad_norm": 0.40238866209983826, "learning_rate": 0.00013720427188752306, "loss": 0.218, "step": 3747 }, { "epoch": 
0.7585509006274034, "grad_norm": 0.2972874641418457, "learning_rate": 0.0001371747461087581, "loss": 0.2184, "step": 3748 }, { "epoch": 0.7587532888079336, "grad_norm": 0.33098655939102173, "learning_rate": 0.00013714521656904243, "loss": 0.2227, "step": 3749 }, { "epoch": 0.7589556769884639, "grad_norm": 0.4189983904361725, "learning_rate": 0.00013711568327136347, "loss": 0.2447, "step": 3750 }, { "epoch": 0.7589556769884639, "eval_loss": 0.27769315242767334, "eval_runtime": 0.7403, "eval_samples_per_second": 6.754, "eval_steps_per_second": 1.351, "step": 3750 }, { "epoch": 0.7591580651689941, "grad_norm": 0.24924065172672272, "learning_rate": 0.00013708614621870917, "loss": 0.2007, "step": 3751 }, { "epoch": 0.7593604533495244, "grad_norm": 0.33730489015579224, "learning_rate": 0.0001370566054140677, "loss": 0.2599, "step": 3752 }, { "epoch": 0.7595628415300546, "grad_norm": 0.28673139214515686, "learning_rate": 0.00013702706086042777, "loss": 0.223, "step": 3753 }, { "epoch": 0.7597652297105849, "grad_norm": 0.35743647813796997, "learning_rate": 0.0001369975125607783, "loss": 0.2624, "step": 3754 }, { "epoch": 0.7599676178911151, "grad_norm": 0.3028452694416046, "learning_rate": 0.00013696796051810873, "loss": 0.2421, "step": 3755 }, { "epoch": 0.7601700060716454, "grad_norm": 0.3120434582233429, "learning_rate": 0.0001369384047354088, "loss": 0.2582, "step": 3756 }, { "epoch": 0.7603723942521756, "grad_norm": 0.38598236441612244, "learning_rate": 0.0001369088452156687, "loss": 0.2957, "step": 3757 }, { "epoch": 0.7605747824327059, "grad_norm": 0.261727899312973, "learning_rate": 0.0001368792819618789, "loss": 0.2294, "step": 3758 }, { "epoch": 0.7607771706132362, "grad_norm": 0.3340347409248352, "learning_rate": 0.00013684971497703033, "loss": 0.2494, "step": 3759 }, { "epoch": 0.7609795587937664, "grad_norm": 0.32166656851768494, "learning_rate": 0.00013682014426411428, "loss": 0.2624, "step": 3760 }, { "epoch": 0.7611819469742968, "grad_norm": 
0.3166605830192566, "learning_rate": 0.0001367905698261224, "loss": 0.2406, "step": 3761 }, { "epoch": 0.761384335154827, "grad_norm": 0.34740495681762695, "learning_rate": 0.00013676099166604665, "loss": 0.2369, "step": 3762 }, { "epoch": 0.7615867233353573, "grad_norm": 0.3042324185371399, "learning_rate": 0.0001367314097868795, "loss": 0.1994, "step": 3763 }, { "epoch": 0.7617891115158875, "grad_norm": 0.2825720012187958, "learning_rate": 0.00013670182419161375, "loss": 0.2239, "step": 3764 }, { "epoch": 0.7619914996964178, "grad_norm": 0.29228103160858154, "learning_rate": 0.0001366722348832425, "loss": 0.2575, "step": 3765 }, { "epoch": 0.762193887876948, "grad_norm": 0.31010910868644714, "learning_rate": 0.00013664264186475934, "loss": 0.2331, "step": 3766 }, { "epoch": 0.7623962760574783, "grad_norm": 0.2906114459037781, "learning_rate": 0.00013661304513915817, "loss": 0.2519, "step": 3767 }, { "epoch": 0.7625986642380085, "grad_norm": 0.3258095979690552, "learning_rate": 0.00013658344470943328, "loss": 0.2487, "step": 3768 }, { "epoch": 0.7628010524185388, "grad_norm": 0.25708824396133423, "learning_rate": 0.0001365538405785793, "loss": 0.2267, "step": 3769 }, { "epoch": 0.763003440599069, "grad_norm": 0.3476763367652893, "learning_rate": 0.00013652423274959128, "loss": 0.2114, "step": 3770 }, { "epoch": 0.7632058287795993, "grad_norm": 0.34453055262565613, "learning_rate": 0.00013649462122546465, "loss": 0.211, "step": 3771 }, { "epoch": 0.7634082169601295, "grad_norm": 0.24794785678386688, "learning_rate": 0.00013646500600919515, "loss": 0.2422, "step": 3772 }, { "epoch": 0.7636106051406598, "grad_norm": 0.2998785376548767, "learning_rate": 0.000136435387103779, "loss": 0.2643, "step": 3773 }, { "epoch": 0.76381299332119, "grad_norm": 0.33762747049331665, "learning_rate": 0.00013640576451221268, "loss": 0.2893, "step": 3774 }, { "epoch": 0.7640153815017203, "grad_norm": 0.2480754256248474, "learning_rate": 0.0001363761382374931, "loss": 0.2511, "step": 
3775 }, { "epoch": 0.7642177696822505, "grad_norm": 0.3679254651069641, "learning_rate": 0.0001363465082826176, "loss": 0.2672, "step": 3776 }, { "epoch": 0.7644201578627808, "grad_norm": 0.29588577151298523, "learning_rate": 0.00013631687465058372, "loss": 0.2382, "step": 3777 }, { "epoch": 0.764622546043311, "grad_norm": 0.3305865228176117, "learning_rate": 0.00013628723734438952, "loss": 0.2683, "step": 3778 }, { "epoch": 0.7648249342238413, "grad_norm": 0.2598002254962921, "learning_rate": 0.00013625759636703343, "loss": 0.1883, "step": 3779 }, { "epoch": 0.7650273224043715, "grad_norm": 0.27733346819877625, "learning_rate": 0.00013622795172151417, "loss": 0.2434, "step": 3780 }, { "epoch": 0.7652297105849019, "grad_norm": 0.29522767663002014, "learning_rate": 0.0001361983034108309, "loss": 0.2324, "step": 3781 }, { "epoch": 0.7654320987654321, "grad_norm": 0.34732332825660706, "learning_rate": 0.0001361686514379831, "loss": 0.249, "step": 3782 }, { "epoch": 0.7656344869459624, "grad_norm": 0.28269949555397034, "learning_rate": 0.00013613899580597067, "loss": 0.2494, "step": 3783 }, { "epoch": 0.7658368751264926, "grad_norm": 0.29983946681022644, "learning_rate": 0.0001361093365177939, "loss": 0.2453, "step": 3784 }, { "epoch": 0.7660392633070229, "grad_norm": 0.26113417744636536, "learning_rate": 0.0001360796735764533, "loss": 0.2142, "step": 3785 }, { "epoch": 0.7662416514875531, "grad_norm": 0.2883092761039734, "learning_rate": 0.0001360500069849499, "loss": 0.2115, "step": 3786 }, { "epoch": 0.7664440396680834, "grad_norm": 0.35741910338401794, "learning_rate": 0.00013602033674628506, "loss": 0.2536, "step": 3787 }, { "epoch": 0.7666464278486136, "grad_norm": 0.3507242798805237, "learning_rate": 0.00013599066286346052, "loss": 0.234, "step": 3788 }, { "epoch": 0.7668488160291439, "grad_norm": 0.31215426325798035, "learning_rate": 0.00013596098533947835, "loss": 0.2234, "step": 3789 }, { "epoch": 0.7670512042096742, "grad_norm": 0.2733488976955414, 
"learning_rate": 0.00013593130417734103, "loss": 0.229, "step": 3790 }, { "epoch": 0.7672535923902044, "grad_norm": 0.3050664961338043, "learning_rate": 0.00013590161938005136, "loss": 0.2256, "step": 3791 }, { "epoch": 0.7674559805707347, "grad_norm": 0.3089406490325928, "learning_rate": 0.00013587193095061255, "loss": 0.2541, "step": 3792 }, { "epoch": 0.7676583687512649, "grad_norm": 0.2558094263076782, "learning_rate": 0.00013584223889202818, "loss": 0.2159, "step": 3793 }, { "epoch": 0.7678607569317952, "grad_norm": 0.27690911293029785, "learning_rate": 0.00013581254320730216, "loss": 0.2479, "step": 3794 }, { "epoch": 0.7680631451123254, "grad_norm": 0.28298115730285645, "learning_rate": 0.00013578284389943884, "loss": 0.2667, "step": 3795 }, { "epoch": 0.7682655332928557, "grad_norm": 0.2989111542701721, "learning_rate": 0.00013575314097144278, "loss": 0.2413, "step": 3796 }, { "epoch": 0.7684679214733859, "grad_norm": 0.4821934998035431, "learning_rate": 0.00013572343442631908, "loss": 0.2648, "step": 3797 }, { "epoch": 0.7686703096539163, "grad_norm": 0.26796281337738037, "learning_rate": 0.00013569372426707314, "loss": 0.242, "step": 3798 }, { "epoch": 0.7688726978344465, "grad_norm": 0.2872363328933716, "learning_rate": 0.00013566401049671073, "loss": 0.2224, "step": 3799 }, { "epoch": 0.7690750860149768, "grad_norm": 0.2876395583152771, "learning_rate": 0.0001356342931182379, "loss": 0.2371, "step": 3800 }, { "epoch": 0.7690750860149768, "eval_loss": 0.2702665328979492, "eval_runtime": 0.7392, "eval_samples_per_second": 6.764, "eval_steps_per_second": 1.353, "step": 3800 }, { "epoch": 0.769277474195507, "grad_norm": 0.4570426642894745, "learning_rate": 0.00013560457213466123, "loss": 0.2508, "step": 3801 }, { "epoch": 0.7694798623760373, "grad_norm": 0.2703198492527008, "learning_rate": 0.00013557484754898752, "loss": 0.2449, "step": 3802 }, { "epoch": 0.7696822505565675, "grad_norm": 0.3182757794857025, "learning_rate": 0.00013554511936422406, "loss": 
0.221, "step": 3803 }, { "epoch": 0.7698846387370978, "grad_norm": 0.30589956045150757, "learning_rate": 0.00013551538758337835, "loss": 0.2433, "step": 3804 }, { "epoch": 0.770087026917628, "grad_norm": 0.2872970402240753, "learning_rate": 0.00013548565220945842, "loss": 0.2522, "step": 3805 }, { "epoch": 0.7702894150981583, "grad_norm": 0.442940890789032, "learning_rate": 0.00013545591324547255, "loss": 0.2421, "step": 3806 }, { "epoch": 0.7704918032786885, "grad_norm": 0.3310321867465973, "learning_rate": 0.0001354261706944294, "loss": 0.2519, "step": 3807 }, { "epoch": 0.7706941914592188, "grad_norm": 0.3431413173675537, "learning_rate": 0.00013539642455933802, "loss": 0.2626, "step": 3808 }, { "epoch": 0.770896579639749, "grad_norm": 0.31048741936683655, "learning_rate": 0.0001353666748432078, "loss": 0.2482, "step": 3809 }, { "epoch": 0.7710989678202793, "grad_norm": 0.31904760003089905, "learning_rate": 0.00013533692154904853, "loss": 0.2378, "step": 3810 }, { "epoch": 0.7713013560008095, "grad_norm": 0.2913106083869934, "learning_rate": 0.00013530716467987034, "loss": 0.2479, "step": 3811 }, { "epoch": 0.7715037441813398, "grad_norm": 0.3222017288208008, "learning_rate": 0.00013527740423868368, "loss": 0.2563, "step": 3812 }, { "epoch": 0.77170613236187, "grad_norm": 0.31831422448158264, "learning_rate": 0.00013524764022849944, "loss": 0.2458, "step": 3813 }, { "epoch": 0.7719085205424003, "grad_norm": 0.29415562748908997, "learning_rate": 0.00013521787265232877, "loss": 0.2653, "step": 3814 }, { "epoch": 0.7721109087229305, "grad_norm": 0.35008999705314636, "learning_rate": 0.0001351881015131833, "loss": 0.2528, "step": 3815 }, { "epoch": 0.7723132969034608, "grad_norm": 0.2886951267719269, "learning_rate": 0.00013515832681407496, "loss": 0.237, "step": 3816 }, { "epoch": 0.772515685083991, "grad_norm": 0.259790301322937, "learning_rate": 0.00013512854855801605, "loss": 0.2272, "step": 3817 }, { "epoch": 0.7727180732645214, "grad_norm": 0.2906879782676697, 
"learning_rate": 0.00013509876674801916, "loss": 0.2251, "step": 3818 }, { "epoch": 0.7729204614450516, "grad_norm": 0.3026115298271179, "learning_rate": 0.00013506898138709734, "loss": 0.2541, "step": 3819 }, { "epoch": 0.7731228496255819, "grad_norm": 0.34342706203460693, "learning_rate": 0.00013503919247826395, "loss": 0.2424, "step": 3820 }, { "epoch": 0.7733252378061122, "grad_norm": 0.317668080329895, "learning_rate": 0.00013500940002453274, "loss": 0.2323, "step": 3821 }, { "epoch": 0.7735276259866424, "grad_norm": 0.2953903079032898, "learning_rate": 0.00013497960402891778, "loss": 0.2357, "step": 3822 }, { "epoch": 0.7737300141671727, "grad_norm": 0.3209197223186493, "learning_rate": 0.00013494980449443354, "loss": 0.2586, "step": 3823 }, { "epoch": 0.7739324023477029, "grad_norm": 0.32878735661506653, "learning_rate": 0.00013492000142409477, "loss": 0.2189, "step": 3824 }, { "epoch": 0.7741347905282332, "grad_norm": 0.28302595019340515, "learning_rate": 0.0001348901948209167, "loss": 0.2361, "step": 3825 }, { "epoch": 0.7743371787087634, "grad_norm": 0.311894029378891, "learning_rate": 0.0001348603846879148, "loss": 0.1971, "step": 3826 }, { "epoch": 0.7745395668892937, "grad_norm": 0.29674622416496277, "learning_rate": 0.00013483057102810494, "loss": 0.2246, "step": 3827 }, { "epoch": 0.7747419550698239, "grad_norm": 0.2627718150615692, "learning_rate": 0.00013480075384450342, "loss": 0.2092, "step": 3828 }, { "epoch": 0.7749443432503542, "grad_norm": 0.26441818475723267, "learning_rate": 0.00013477093314012676, "loss": 0.2176, "step": 3829 }, { "epoch": 0.7751467314308844, "grad_norm": 0.3652019500732422, "learning_rate": 0.00013474110891799194, "loss": 0.2678, "step": 3830 }, { "epoch": 0.7753491196114147, "grad_norm": 0.30413907766342163, "learning_rate": 0.00013471128118111624, "loss": 0.2403, "step": 3831 }, { "epoch": 0.7755515077919449, "grad_norm": 0.5906927585601807, "learning_rate": 0.00013468144993251734, "loss": 0.2491, "step": 3832 }, { 
"epoch": 0.7757538959724752, "grad_norm": 0.38794979453086853, "learning_rate": 0.00013465161517521324, "loss": 0.2314, "step": 3833 }, { "epoch": 0.7759562841530054, "grad_norm": 0.3258013129234314, "learning_rate": 0.00013462177691222235, "loss": 0.2641, "step": 3834 }, { "epoch": 0.7761586723335357, "grad_norm": 0.2821758985519409, "learning_rate": 0.0001345919351465633, "loss": 0.2221, "step": 3835 }, { "epoch": 0.776361060514066, "grad_norm": 0.27629354596138, "learning_rate": 0.00013456208988125526, "loss": 0.213, "step": 3836 }, { "epoch": 0.7765634486945963, "grad_norm": 0.29150551557540894, "learning_rate": 0.0001345322411193176, "loss": 0.2341, "step": 3837 }, { "epoch": 0.7767658368751265, "grad_norm": 0.31135258078575134, "learning_rate": 0.00013450238886377014, "loss": 0.2542, "step": 3838 }, { "epoch": 0.7769682250556568, "grad_norm": 0.3928423225879669, "learning_rate": 0.00013447253311763303, "loss": 0.2261, "step": 3839 }, { "epoch": 0.777170613236187, "grad_norm": 0.35403716564178467, "learning_rate": 0.0001344426738839267, "loss": 0.2361, "step": 3840 }, { "epoch": 0.7773730014167173, "grad_norm": 0.2817465662956238, "learning_rate": 0.00013441281116567203, "loss": 0.2367, "step": 3841 }, { "epoch": 0.7775753895972475, "grad_norm": 0.3627435266971588, "learning_rate": 0.0001343829449658902, "loss": 0.2702, "step": 3842 }, { "epoch": 0.7777777777777778, "grad_norm": 0.35481467843055725, "learning_rate": 0.00013435307528760282, "loss": 0.2577, "step": 3843 }, { "epoch": 0.777980165958308, "grad_norm": 0.5557239055633545, "learning_rate": 0.00013432320213383172, "loss": 0.2507, "step": 3844 }, { "epoch": 0.7781825541388383, "grad_norm": 0.40870675444602966, "learning_rate": 0.00013429332550759916, "loss": 0.2328, "step": 3845 }, { "epoch": 0.7783849423193685, "grad_norm": 0.37118127942085266, "learning_rate": 0.0001342634454119278, "loss": 0.2839, "step": 3846 }, { "epoch": 0.7785873304998988, "grad_norm": 0.34032946825027466, "learning_rate": 
0.00013423356184984054, "loss": 0.2595, "step": 3847 }, { "epoch": 0.778789718680429, "grad_norm": 0.25894200801849365, "learning_rate": 0.00013420367482436067, "loss": 0.2174, "step": 3848 }, { "epoch": 0.7789921068609593, "grad_norm": 0.24875733256340027, "learning_rate": 0.00013417378433851188, "loss": 0.2056, "step": 3849 }, { "epoch": 0.7791944950414896, "grad_norm": 0.2893766760826111, "learning_rate": 0.00013414389039531822, "loss": 0.2305, "step": 3850 }, { "epoch": 0.7791944950414896, "eval_loss": 0.2665500342845917, "eval_runtime": 0.7438, "eval_samples_per_second": 6.722, "eval_steps_per_second": 1.344, "step": 3850 }, { "epoch": 0.7793968832220198, "grad_norm": 0.3141258955001831, "learning_rate": 0.00013411399299780396, "loss": 0.2666, "step": 3851 }, { "epoch": 0.7795992714025501, "grad_norm": 0.2987823188304901, "learning_rate": 0.00013408409214899384, "loss": 0.2598, "step": 3852 }, { "epoch": 0.7798016595830803, "grad_norm": 0.37383440136909485, "learning_rate": 0.00013405418785191294, "loss": 0.2609, "step": 3853 }, { "epoch": 0.7800040477636107, "grad_norm": 0.324432909488678, "learning_rate": 0.0001340242801095866, "loss": 0.2469, "step": 3854 }, { "epoch": 0.7802064359441409, "grad_norm": 0.3773249387741089, "learning_rate": 0.00013399436892504065, "loss": 0.2378, "step": 3855 }, { "epoch": 0.7804088241246712, "grad_norm": 0.37897977232933044, "learning_rate": 0.00013396445430130115, "loss": 0.2207, "step": 3856 }, { "epoch": 0.7806112123052014, "grad_norm": 0.3486320972442627, "learning_rate": 0.00013393453624139455, "loss": 0.2591, "step": 3857 }, { "epoch": 0.7808136004857317, "grad_norm": 0.2976657450199127, "learning_rate": 0.00013390461474834762, "loss": 0.2193, "step": 3858 }, { "epoch": 0.7810159886662619, "grad_norm": 0.2994661033153534, "learning_rate": 0.00013387468982518753, "loss": 0.2529, "step": 3859 }, { "epoch": 0.7812183768467922, "grad_norm": 0.3973534405231476, "learning_rate": 0.0001338447614749418, "loss": 0.2323, "step": 
3860 }, { "epoch": 0.7814207650273224, "grad_norm": 0.2849239706993103, "learning_rate": 0.0001338148297006382, "loss": 0.2434, "step": 3861 }, { "epoch": 0.7816231532078527, "grad_norm": 0.2878144085407257, "learning_rate": 0.00013378489450530497, "loss": 0.2419, "step": 3862 }, { "epoch": 0.7818255413883829, "grad_norm": 0.32753539085388184, "learning_rate": 0.0001337549558919706, "loss": 0.2772, "step": 3863 }, { "epoch": 0.7820279295689132, "grad_norm": 0.2736656367778778, "learning_rate": 0.00013372501386366397, "loss": 0.224, "step": 3864 }, { "epoch": 0.7822303177494434, "grad_norm": 0.29261481761932373, "learning_rate": 0.00013369506842341431, "loss": 0.261, "step": 3865 }, { "epoch": 0.7824327059299737, "grad_norm": 0.3649926781654358, "learning_rate": 0.0001336651195742512, "loss": 0.2538, "step": 3866 }, { "epoch": 0.7826350941105039, "grad_norm": 0.30018988251686096, "learning_rate": 0.00013363516731920453, "loss": 0.2475, "step": 3867 }, { "epoch": 0.7828374822910342, "grad_norm": 0.27778393030166626, "learning_rate": 0.00013360521166130458, "loss": 0.259, "step": 3868 }, { "epoch": 0.7830398704715644, "grad_norm": 0.2849215865135193, "learning_rate": 0.0001335752526035819, "loss": 0.23, "step": 3869 }, { "epoch": 0.7832422586520947, "grad_norm": 0.27813011407852173, "learning_rate": 0.00013354529014906747, "loss": 0.2454, "step": 3870 }, { "epoch": 0.7834446468326249, "grad_norm": 0.30883848667144775, "learning_rate": 0.00013351532430079256, "loss": 0.2458, "step": 3871 }, { "epoch": 0.7836470350131552, "grad_norm": 0.2674468159675598, "learning_rate": 0.00013348535506178884, "loss": 0.2559, "step": 3872 }, { "epoch": 0.7838494231936854, "grad_norm": 0.26857122778892517, "learning_rate": 0.00013345538243508825, "loss": 0.2429, "step": 3873 }, { "epoch": 0.7840518113742158, "grad_norm": 0.27286744117736816, "learning_rate": 0.0001334254064237231, "loss": 0.219, "step": 3874 }, { "epoch": 0.784254199554746, "grad_norm": 0.27423974871635437, 
"learning_rate": 0.00013339542703072604, "loss": 0.2557, "step": 3875 }, { "epoch": 0.7844565877352763, "grad_norm": 0.2912842333316803, "learning_rate": 0.00013336544425913012, "loss": 0.2284, "step": 3876 }, { "epoch": 0.7846589759158065, "grad_norm": 0.31245914101600647, "learning_rate": 0.0001333354581119686, "loss": 0.2634, "step": 3877 }, { "epoch": 0.7848613640963368, "grad_norm": 0.2883394658565521, "learning_rate": 0.00013330546859227524, "loss": 0.2106, "step": 3878 }, { "epoch": 0.785063752276867, "grad_norm": 0.2798727750778198, "learning_rate": 0.00013327547570308402, "loss": 0.2401, "step": 3879 }, { "epoch": 0.7852661404573973, "grad_norm": 0.2785923182964325, "learning_rate": 0.00013324547944742934, "loss": 0.259, "step": 3880 }, { "epoch": 0.7854685286379276, "grad_norm": 0.26384279131889343, "learning_rate": 0.0001332154798283459, "loss": 0.2411, "step": 3881 }, { "epoch": 0.7856709168184578, "grad_norm": 0.3410887122154236, "learning_rate": 0.00013318547684886873, "loss": 0.2398, "step": 3882 }, { "epoch": 0.7858733049989881, "grad_norm": 0.2712464928627014, "learning_rate": 0.0001331554705120332, "loss": 0.2588, "step": 3883 }, { "epoch": 0.7860756931795183, "grad_norm": 0.3876480460166931, "learning_rate": 0.0001331254608208751, "loss": 0.2451, "step": 3884 }, { "epoch": 0.7862780813600486, "grad_norm": 0.26584160327911377, "learning_rate": 0.00013309544777843045, "loss": 0.244, "step": 3885 }, { "epoch": 0.7864804695405788, "grad_norm": 0.3508015275001526, "learning_rate": 0.00013306543138773567, "loss": 0.2187, "step": 3886 }, { "epoch": 0.7866828577211091, "grad_norm": 0.2587803602218628, "learning_rate": 0.00013303541165182747, "loss": 0.2203, "step": 3887 }, { "epoch": 0.7868852459016393, "grad_norm": 0.24990132451057434, "learning_rate": 0.00013300538857374296, "loss": 0.2234, "step": 3888 }, { "epoch": 0.7870876340821696, "grad_norm": 0.2898756265640259, "learning_rate": 0.00013297536215651956, "loss": 0.2462, "step": 3889 }, { "epoch": 
0.7872900222626998, "grad_norm": 0.2641238868236542, "learning_rate": 0.000132945332403195, "loss": 0.1869, "step": 3890 }, { "epoch": 0.7874924104432302, "grad_norm": 0.28255870938301086, "learning_rate": 0.00013291529931680742, "loss": 0.2374, "step": 3891 }, { "epoch": 0.7876947986237604, "grad_norm": 0.31957346200942993, "learning_rate": 0.00013288526290039523, "loss": 0.2374, "step": 3892 }, { "epoch": 0.7878971868042907, "grad_norm": 0.30044397711753845, "learning_rate": 0.0001328552231569972, "loss": 0.2218, "step": 3893 }, { "epoch": 0.7880995749848209, "grad_norm": 0.3349805176258087, "learning_rate": 0.00013282518008965244, "loss": 0.2455, "step": 3894 }, { "epoch": 0.7883019631653512, "grad_norm": 0.29345056414604187, "learning_rate": 0.0001327951337014004, "loss": 0.2344, "step": 3895 }, { "epoch": 0.7885043513458814, "grad_norm": 0.2971920669078827, "learning_rate": 0.00013276508399528083, "loss": 0.2442, "step": 3896 }, { "epoch": 0.7887067395264117, "grad_norm": 0.3356497585773468, "learning_rate": 0.00013273503097433387, "loss": 0.2679, "step": 3897 }, { "epoch": 0.7889091277069419, "grad_norm": 0.3730000853538513, "learning_rate": 0.00013270497464159994, "loss": 0.2319, "step": 3898 }, { "epoch": 0.7891115158874722, "grad_norm": 0.2450721710920334, "learning_rate": 0.00013267491500011986, "loss": 0.2304, "step": 3899 }, { "epoch": 0.7893139040680024, "grad_norm": 0.32902464270591736, "learning_rate": 0.00013264485205293473, "loss": 0.2454, "step": 3900 }, { "epoch": 0.7893139040680024, "eval_loss": 0.2717682421207428, "eval_runtime": 0.7399, "eval_samples_per_second": 6.758, "eval_steps_per_second": 1.352, "step": 3900 }, { "epoch": 0.7895162922485327, "grad_norm": 0.2387569099664688, "learning_rate": 0.000132614785803086, "loss": 0.191, "step": 3901 }, { "epoch": 0.7897186804290629, "grad_norm": 0.28501030802726746, "learning_rate": 0.00013258471625361552, "loss": 0.2262, "step": 3902 }, { "epoch": 0.7899210686095932, "grad_norm": 
0.673722505569458, "learning_rate": 0.0001325546434075653, "loss": 0.2345, "step": 3903 }, { "epoch": 0.7901234567901234, "grad_norm": 0.3353344798088074, "learning_rate": 0.00013252456726797786, "loss": 0.2099, "step": 3904 }, { "epoch": 0.7903258449706537, "grad_norm": 0.34284526109695435, "learning_rate": 0.00013249448783789598, "loss": 0.2502, "step": 3905 }, { "epoch": 0.7905282331511839, "grad_norm": 0.33169737458229065, "learning_rate": 0.0001324644051203628, "loss": 0.2368, "step": 3906 }, { "epoch": 0.7907306213317142, "grad_norm": 0.30647405982017517, "learning_rate": 0.00013243431911842175, "loss": 0.2523, "step": 3907 }, { "epoch": 0.7909330095122444, "grad_norm": 0.29891833662986755, "learning_rate": 0.0001324042298351166, "loss": 0.2269, "step": 3908 }, { "epoch": 0.7911353976927747, "grad_norm": 0.29664790630340576, "learning_rate": 0.0001323741372734915, "loss": 0.249, "step": 3909 }, { "epoch": 0.7913377858733049, "grad_norm": 0.253517210483551, "learning_rate": 0.0001323440414365909, "loss": 0.1896, "step": 3910 }, { "epoch": 0.7915401740538353, "grad_norm": 0.3175423741340637, "learning_rate": 0.00013231394232745959, "loss": 0.2201, "step": 3911 }, { "epoch": 0.7917425622343656, "grad_norm": 0.2625952363014221, "learning_rate": 0.0001322838399491426, "loss": 0.232, "step": 3912 }, { "epoch": 0.7919449504148958, "grad_norm": 0.2627769708633423, "learning_rate": 0.00013225373430468545, "loss": 0.2129, "step": 3913 }, { "epoch": 0.7921473385954261, "grad_norm": 0.3566378653049469, "learning_rate": 0.00013222362539713393, "loss": 0.2348, "step": 3914 }, { "epoch": 0.7923497267759563, "grad_norm": 0.2556769549846649, "learning_rate": 0.0001321935132295341, "loss": 0.1982, "step": 3915 }, { "epoch": 0.7925521149564866, "grad_norm": 0.30709758400917053, "learning_rate": 0.00013216339780493242, "loss": 0.2415, "step": 3916 }, { "epoch": 0.7927545031370168, "grad_norm": 0.32757511734962463, "learning_rate": 0.00013213327912637562, "loss": 0.2345, "step": 
3917 }, { "epoch": 0.7929568913175471, "grad_norm": 0.25708115100860596, "learning_rate": 0.0001321031571969108, "loss": 0.2151, "step": 3918 }, { "epoch": 0.7931592794980773, "grad_norm": 0.2545197010040283, "learning_rate": 0.0001320730320195854, "loss": 0.2267, "step": 3919 }, { "epoch": 0.7933616676786076, "grad_norm": 0.3093016743659973, "learning_rate": 0.00013204290359744716, "loss": 0.241, "step": 3920 }, { "epoch": 0.7935640558591378, "grad_norm": 0.3456737697124481, "learning_rate": 0.00013201277193354414, "loss": 0.2537, "step": 3921 }, { "epoch": 0.7937664440396681, "grad_norm": 0.29381608963012695, "learning_rate": 0.00013198263703092478, "loss": 0.2613, "step": 3922 }, { "epoch": 0.7939688322201983, "grad_norm": 0.4102049171924591, "learning_rate": 0.0001319524988926378, "loss": 0.2701, "step": 3923 }, { "epoch": 0.7941712204007286, "grad_norm": 0.28350502252578735, "learning_rate": 0.00013192235752173222, "loss": 0.245, "step": 3924 }, { "epoch": 0.7943736085812588, "grad_norm": 0.3443673253059387, "learning_rate": 0.0001318922129212575, "loss": 0.2672, "step": 3925 }, { "epoch": 0.7945759967617891, "grad_norm": 0.3078950345516205, "learning_rate": 0.0001318620650942633, "loss": 0.2396, "step": 3926 }, { "epoch": 0.7947783849423193, "grad_norm": 0.33737850189208984, "learning_rate": 0.0001318319140437997, "loss": 0.2675, "step": 3927 }, { "epoch": 0.7949807731228496, "grad_norm": 0.2881615161895752, "learning_rate": 0.000131801759772917, "loss": 0.2233, "step": 3928 }, { "epoch": 0.7951831613033798, "grad_norm": 0.28353455662727356, "learning_rate": 0.00013177160228466597, "loss": 0.2557, "step": 3929 }, { "epoch": 0.7953855494839102, "grad_norm": 0.40621811151504517, "learning_rate": 0.0001317414415820976, "loss": 0.2427, "step": 3930 }, { "epoch": 0.7955879376644404, "grad_norm": 0.2879510223865509, "learning_rate": 0.00013171127766826323, "loss": 0.2544, "step": 3931 }, { "epoch": 0.7957903258449707, "grad_norm": 0.2822043001651764, 
"learning_rate": 0.00013168111054621452, "loss": 0.2318, "step": 3932 }, { "epoch": 0.7959927140255009, "grad_norm": 0.28354987502098083, "learning_rate": 0.00013165094021900346, "loss": 0.2519, "step": 3933 }, { "epoch": 0.7961951022060312, "grad_norm": 0.3343326151371002, "learning_rate": 0.0001316207666896824, "loss": 0.2214, "step": 3934 }, { "epoch": 0.7963974903865614, "grad_norm": 0.29361197352409363, "learning_rate": 0.00013159058996130396, "loss": 0.2384, "step": 3935 }, { "epoch": 0.7965998785670917, "grad_norm": 0.2821219563484192, "learning_rate": 0.00013156041003692108, "loss": 0.2219, "step": 3936 }, { "epoch": 0.7968022667476219, "grad_norm": 0.3084186315536499, "learning_rate": 0.0001315302269195871, "loss": 0.2611, "step": 3937 }, { "epoch": 0.7970046549281522, "grad_norm": 0.2837059795856476, "learning_rate": 0.00013150004061235557, "loss": 0.2157, "step": 3938 }, { "epoch": 0.7972070431086824, "grad_norm": 0.32660189270973206, "learning_rate": 0.0001314698511182805, "loss": 0.2524, "step": 3939 }, { "epoch": 0.7974094312892127, "grad_norm": 0.27194464206695557, "learning_rate": 0.00013143965844041608, "loss": 0.2147, "step": 3940 }, { "epoch": 0.797611819469743, "grad_norm": 0.26827967166900635, "learning_rate": 0.00013140946258181693, "loss": 0.2114, "step": 3941 }, { "epoch": 0.7978142076502732, "grad_norm": 0.4834533631801605, "learning_rate": 0.0001313792635455379, "loss": 0.2276, "step": 3942 }, { "epoch": 0.7980165958308035, "grad_norm": 0.2604771852493286, "learning_rate": 0.00013134906133463424, "loss": 0.2364, "step": 3943 }, { "epoch": 0.7982189840113337, "grad_norm": 0.2622084319591522, "learning_rate": 0.0001313188559521615, "loss": 0.2335, "step": 3944 }, { "epoch": 0.798421372191864, "grad_norm": 0.3305802047252655, "learning_rate": 0.00013128864740117558, "loss": 0.2401, "step": 3945 }, { "epoch": 0.7986237603723942, "grad_norm": 0.2993631064891815, "learning_rate": 0.0001312584356847326, "loss": 0.2432, "step": 3946 }, { "epoch": 
0.7988261485529246, "grad_norm": 0.26266202330589294, "learning_rate": 0.0001312282208058891, "loss": 0.2166, "step": 3947 }, { "epoch": 0.7990285367334548, "grad_norm": 0.26602888107299805, "learning_rate": 0.00013119800276770188, "loss": 0.2462, "step": 3948 }, { "epoch": 0.7992309249139851, "grad_norm": 0.4510941803455353, "learning_rate": 0.00013116778157322805, "loss": 0.2508, "step": 3949 }, { "epoch": 0.7994333130945153, "grad_norm": 0.26324382424354553, "learning_rate": 0.0001311375572255252, "loss": 0.1988, "step": 3950 }, { "epoch": 0.7994333130945153, "eval_loss": 0.26543259620666504, "eval_runtime": 0.7381, "eval_samples_per_second": 6.774, "eval_steps_per_second": 1.355, "step": 3950 }, { "epoch": 0.7996357012750456, "grad_norm": 0.560707688331604, "learning_rate": 0.00013110732972765102, "loss": 0.2846, "step": 3951 }, { "epoch": 0.7998380894555758, "grad_norm": 0.33031123876571655, "learning_rate": 0.00013107709908266357, "loss": 0.2273, "step": 3952 }, { "epoch": 0.8000404776361061, "grad_norm": 0.34044280648231506, "learning_rate": 0.00013104686529362137, "loss": 0.2757, "step": 3953 }, { "epoch": 0.8002428658166363, "grad_norm": 0.30630120635032654, "learning_rate": 0.00013101662836358308, "loss": 0.1898, "step": 3954 }, { "epoch": 0.8004452539971666, "grad_norm": 0.3719131648540497, "learning_rate": 0.00013098638829560778, "loss": 0.2554, "step": 3955 }, { "epoch": 0.8006476421776968, "grad_norm": 0.3335683047771454, "learning_rate": 0.00013095614509275487, "loss": 0.249, "step": 3956 }, { "epoch": 0.8008500303582271, "grad_norm": 0.37623754143714905, "learning_rate": 0.00013092589875808404, "loss": 0.2631, "step": 3957 }, { "epoch": 0.8010524185387573, "grad_norm": 0.2632478177547455, "learning_rate": 0.00013089564929465522, "loss": 0.2345, "step": 3958 }, { "epoch": 0.8012548067192876, "grad_norm": 0.36044973134994507, "learning_rate": 0.00013086539670552883, "loss": 0.2686, "step": 3959 }, { "epoch": 0.8014571948998178, "grad_norm": 
0.32119905948638916, "learning_rate": 0.00013083514099376545, "loss": 0.2469, "step": 3960 }, { "epoch": 0.8016595830803481, "grad_norm": 0.2759612500667572, "learning_rate": 0.00013080488216242608, "loss": 0.2274, "step": 3961 }, { "epoch": 0.8018619712608783, "grad_norm": 0.3435122072696686, "learning_rate": 0.00013077462021457195, "loss": 0.247, "step": 3962 }, { "epoch": 0.8020643594414086, "grad_norm": 0.29719454050064087, "learning_rate": 0.00013074435515326467, "loss": 0.2415, "step": 3963 }, { "epoch": 0.8022667476219388, "grad_norm": 0.3413975238800049, "learning_rate": 0.00013071408698156614, "loss": 0.2576, "step": 3964 }, { "epoch": 0.8024691358024691, "grad_norm": 0.33543604612350464, "learning_rate": 0.00013068381570253856, "loss": 0.2259, "step": 3965 }, { "epoch": 0.8026715239829993, "grad_norm": 0.2922574579715729, "learning_rate": 0.00013065354131924445, "loss": 0.246, "step": 3966 }, { "epoch": 0.8028739121635297, "grad_norm": 0.26383689045906067, "learning_rate": 0.00013062326383474668, "loss": 0.2267, "step": 3967 }, { "epoch": 0.8030763003440599, "grad_norm": 0.3454423248767853, "learning_rate": 0.0001305929832521084, "loss": 0.2488, "step": 3968 }, { "epoch": 0.8032786885245902, "grad_norm": 0.32194650173187256, "learning_rate": 0.0001305626995743931, "loss": 0.2327, "step": 3969 }, { "epoch": 0.8034810767051204, "grad_norm": 0.40299713611602783, "learning_rate": 0.00013053241280466452, "loss": 0.2356, "step": 3970 }, { "epoch": 0.8036834648856507, "grad_norm": 0.29903537034988403, "learning_rate": 0.0001305021229459868, "loss": 0.248, "step": 3971 }, { "epoch": 0.803885853066181, "grad_norm": 0.43103647232055664, "learning_rate": 0.00013047183000142437, "loss": 0.2178, "step": 3972 }, { "epoch": 0.8040882412467112, "grad_norm": 0.40093111991882324, "learning_rate": 0.0001304415339740419, "loss": 0.2566, "step": 3973 }, { "epoch": 0.8042906294272415, "grad_norm": 0.287648469209671, "learning_rate": 0.00013041123486690442, "loss": 0.2448, 
"step": 3974 }, { "epoch": 0.8044930176077717, "grad_norm": 0.2900941073894501, "learning_rate": 0.0001303809326830773, "loss": 0.2246, "step": 3975 }, { "epoch": 0.804695405788302, "grad_norm": 0.31337985396385193, "learning_rate": 0.00013035062742562618, "loss": 0.2169, "step": 3976 }, { "epoch": 0.8048977939688322, "grad_norm": 0.27474328875541687, "learning_rate": 0.00013032031909761705, "loss": 0.2235, "step": 3977 }, { "epoch": 0.8051001821493625, "grad_norm": 0.30365967750549316, "learning_rate": 0.0001302900077021162, "loss": 0.2536, "step": 3978 }, { "epoch": 0.8053025703298927, "grad_norm": 0.3188045024871826, "learning_rate": 0.00013025969324219022, "loss": 0.2359, "step": 3979 }, { "epoch": 0.805504958510423, "grad_norm": 0.3143835961818695, "learning_rate": 0.00013022937572090596, "loss": 0.2544, "step": 3980 }, { "epoch": 0.8057073466909532, "grad_norm": 0.2658599317073822, "learning_rate": 0.00013019905514133063, "loss": 0.2183, "step": 3981 }, { "epoch": 0.8059097348714835, "grad_norm": 0.32992562651634216, "learning_rate": 0.0001301687315065318, "loss": 0.2483, "step": 3982 }, { "epoch": 0.8061121230520137, "grad_norm": 0.30123093724250793, "learning_rate": 0.0001301384048195773, "loss": 0.219, "step": 3983 }, { "epoch": 0.806314511232544, "grad_norm": 0.25195813179016113, "learning_rate": 0.0001301080750835352, "loss": 0.2223, "step": 3984 }, { "epoch": 0.8065168994130743, "grad_norm": 0.2840067148208618, "learning_rate": 0.000130077742301474, "loss": 0.2412, "step": 3985 }, { "epoch": 0.8067192875936046, "grad_norm": 0.26293548941612244, "learning_rate": 0.00013004740647646246, "loss": 0.2125, "step": 3986 }, { "epoch": 0.8069216757741348, "grad_norm": 0.2873854339122772, "learning_rate": 0.00013001706761156957, "loss": 0.2239, "step": 3987 }, { "epoch": 0.8071240639546651, "grad_norm": 0.3114651143550873, "learning_rate": 0.00012998672570986477, "loss": 0.2515, "step": 3988 }, { "epoch": 0.8073264521351953, "grad_norm": 0.328300803899765, 
"learning_rate": 0.00012995638077441772, "loss": 0.2393, "step": 3989 }, { "epoch": 0.8075288403157256, "grad_norm": 0.2859802842140198, "learning_rate": 0.00012992603280829838, "loss": 0.2563, "step": 3990 }, { "epoch": 0.8077312284962558, "grad_norm": 0.26750272512435913, "learning_rate": 0.00012989568181457704, "loss": 0.2377, "step": 3991 }, { "epoch": 0.8079336166767861, "grad_norm": 0.5690582394599915, "learning_rate": 0.00012986532779632432, "loss": 0.2893, "step": 3992 }, { "epoch": 0.8081360048573163, "grad_norm": 0.26616957783699036, "learning_rate": 0.00012983497075661111, "loss": 0.2584, "step": 3993 }, { "epoch": 0.8083383930378466, "grad_norm": 0.40735387802124023, "learning_rate": 0.0001298046106985086, "loss": 0.2282, "step": 3994 }, { "epoch": 0.8085407812183768, "grad_norm": 0.26333582401275635, "learning_rate": 0.00012977424762508833, "loss": 0.2144, "step": 3995 }, { "epoch": 0.8087431693989071, "grad_norm": 0.35248100757598877, "learning_rate": 0.00012974388153942212, "loss": 0.2685, "step": 3996 }, { "epoch": 0.8089455575794373, "grad_norm": 0.3542243540287018, "learning_rate": 0.00012971351244458202, "loss": 0.2355, "step": 3997 }, { "epoch": 0.8091479457599676, "grad_norm": 0.26675522327423096, "learning_rate": 0.00012968314034364056, "loss": 0.2498, "step": 3998 }, { "epoch": 0.8093503339404978, "grad_norm": 0.33968645334243774, "learning_rate": 0.00012965276523967042, "loss": 0.2583, "step": 3999 }, { "epoch": 0.8095527221210281, "grad_norm": 0.2958551049232483, "learning_rate": 0.0001296223871357446, "loss": 0.2385, "step": 4000 }, { "epoch": 0.8095527221210281, "eval_loss": 0.26493722200393677, "eval_runtime": 0.7357, "eval_samples_per_second": 6.797, "eval_steps_per_second": 1.359, "step": 4000 }, { "epoch": 0.8097551103015583, "grad_norm": 0.2750597298145294, "learning_rate": 0.00012959200603493648, "loss": 0.2214, "step": 4001 }, { "epoch": 0.8099574984820886, "grad_norm": 0.3002444803714752, "learning_rate": 0.0001295616219403197, 
"loss": 0.2292, "step": 4002 }, { "epoch": 0.810159886662619, "grad_norm": 0.2769399583339691, "learning_rate": 0.00012953123485496824, "loss": 0.2599, "step": 4003 }, { "epoch": 0.8103622748431492, "grad_norm": 0.2609650492668152, "learning_rate": 0.00012950084478195625, "loss": 0.2499, "step": 4004 }, { "epoch": 0.8105646630236795, "grad_norm": 0.2414253205060959, "learning_rate": 0.00012947045172435838, "loss": 0.225, "step": 4005 }, { "epoch": 0.8107670512042097, "grad_norm": 0.3793722093105316, "learning_rate": 0.0001294400556852494, "loss": 0.2173, "step": 4006 }, { "epoch": 0.81096943938474, "grad_norm": 0.29010626673698425, "learning_rate": 0.00012940965666770451, "loss": 0.2417, "step": 4007 }, { "epoch": 0.8111718275652702, "grad_norm": 0.2701972723007202, "learning_rate": 0.00012937925467479912, "loss": 0.2269, "step": 4008 }, { "epoch": 0.8113742157458005, "grad_norm": 0.3228740990161896, "learning_rate": 0.00012934884970960907, "loss": 0.2531, "step": 4009 }, { "epoch": 0.8115766039263307, "grad_norm": 0.3706228733062744, "learning_rate": 0.0001293184417752103, "loss": 0.236, "step": 4010 }, { "epoch": 0.811778992106861, "grad_norm": 0.4635535776615143, "learning_rate": 0.00012928803087467928, "loss": 0.2882, "step": 4011 }, { "epoch": 0.8119813802873912, "grad_norm": 0.32409602403640747, "learning_rate": 0.00012925761701109258, "loss": 0.2508, "step": 4012 }, { "epoch": 0.8121837684679215, "grad_norm": 0.3405529856681824, "learning_rate": 0.00012922720018752721, "loss": 0.2153, "step": 4013 }, { "epoch": 0.8123861566484517, "grad_norm": 0.3170727789402008, "learning_rate": 0.0001291967804070604, "loss": 0.2567, "step": 4014 }, { "epoch": 0.812588544828982, "grad_norm": 0.2811383605003357, "learning_rate": 0.0001291663576727697, "loss": 0.2313, "step": 4015 }, { "epoch": 0.8127909330095122, "grad_norm": 0.264926552772522, "learning_rate": 0.00012913593198773295, "loss": 0.238, "step": 4016 }, { "epoch": 0.8129933211900425, "grad_norm": 
0.2697588801383972, "learning_rate": 0.00012910550335502836, "loss": 0.2191, "step": 4017 }, { "epoch": 0.8131957093705727, "grad_norm": 0.27033311128616333, "learning_rate": 0.0001290750717777343, "loss": 0.2307, "step": 4018 }, { "epoch": 0.813398097551103, "grad_norm": 0.26735609769821167, "learning_rate": 0.00012904463725892958, "loss": 0.2433, "step": 4019 }, { "epoch": 0.8136004857316332, "grad_norm": 0.3760308027267456, "learning_rate": 0.00012901419980169322, "loss": 0.2575, "step": 4020 }, { "epoch": 0.8138028739121635, "grad_norm": 0.27508193254470825, "learning_rate": 0.00012898375940910458, "loss": 0.2081, "step": 4021 }, { "epoch": 0.8140052620926937, "grad_norm": 0.2871641516685486, "learning_rate": 0.0001289533160842433, "loss": 0.2684, "step": 4022 }, { "epoch": 0.8142076502732241, "grad_norm": 0.2664308547973633, "learning_rate": 0.00012892286983018925, "loss": 0.2353, "step": 4023 }, { "epoch": 0.8144100384537543, "grad_norm": 0.3087598979473114, "learning_rate": 0.00012889242065002273, "loss": 0.2462, "step": 4024 }, { "epoch": 0.8146124266342846, "grad_norm": 0.2692663073539734, "learning_rate": 0.00012886196854682428, "loss": 0.2344, "step": 4025 }, { "epoch": 0.8148148148148148, "grad_norm": 0.29703715443611145, "learning_rate": 0.0001288315135236747, "loss": 0.2446, "step": 4026 }, { "epoch": 0.8150172029953451, "grad_norm": 0.32801687717437744, "learning_rate": 0.00012880105558365509, "loss": 0.2691, "step": 4027 }, { "epoch": 0.8152195911758753, "grad_norm": 0.2968493103981018, "learning_rate": 0.0001287705947298469, "loss": 0.2384, "step": 4028 }, { "epoch": 0.8154219793564056, "grad_norm": 0.3002225160598755, "learning_rate": 0.00012874013096533178, "loss": 0.2361, "step": 4029 }, { "epoch": 0.8156243675369358, "grad_norm": 0.3209449052810669, "learning_rate": 0.0001287096642931918, "loss": 0.2536, "step": 4030 }, { "epoch": 0.8158267557174661, "grad_norm": 0.2750207781791687, "learning_rate": 0.00012867919471650925, "loss": 0.2123, 
"step": 4031 }, { "epoch": 0.8160291438979964, "grad_norm": 0.46219751238822937, "learning_rate": 0.00012864872223836667, "loss": 0.2511, "step": 4032 }, { "epoch": 0.8162315320785266, "grad_norm": 0.2897518575191498, "learning_rate": 0.00012861824686184698, "loss": 0.2251, "step": 4033 }, { "epoch": 0.8164339202590569, "grad_norm": 0.23715439438819885, "learning_rate": 0.00012858776859003338, "loss": 0.2265, "step": 4034 }, { "epoch": 0.8166363084395871, "grad_norm": 0.27081891894340515, "learning_rate": 0.00012855728742600935, "loss": 0.2126, "step": 4035 }, { "epoch": 0.8168386966201174, "grad_norm": 0.2993212640285492, "learning_rate": 0.0001285268033728586, "loss": 0.2701, "step": 4036 }, { "epoch": 0.8170410848006476, "grad_norm": 0.2464466542005539, "learning_rate": 0.0001284963164336652, "loss": 0.2014, "step": 4037 }, { "epoch": 0.8172434729811779, "grad_norm": 0.27498987317085266, "learning_rate": 0.00012846582661151353, "loss": 0.247, "step": 4038 }, { "epoch": 0.8174458611617081, "grad_norm": 0.31696516275405884, "learning_rate": 0.0001284353339094882, "loss": 0.2369, "step": 4039 }, { "epoch": 0.8176482493422385, "grad_norm": 0.2576943635940552, "learning_rate": 0.00012840483833067418, "loss": 0.2366, "step": 4040 }, { "epoch": 0.8178506375227687, "grad_norm": 0.4740954339504242, "learning_rate": 0.00012837433987815663, "loss": 0.2481, "step": 4041 }, { "epoch": 0.818053025703299, "grad_norm": 0.27298733592033386, "learning_rate": 0.00012834383855502113, "loss": 0.2112, "step": 4042 }, { "epoch": 0.8182554138838292, "grad_norm": 0.2754977345466614, "learning_rate": 0.00012831333436435344, "loss": 0.2545, "step": 4043 }, { "epoch": 0.8184578020643595, "grad_norm": 0.379720538854599, "learning_rate": 0.00012828282730923966, "loss": 0.281, "step": 4044 }, { "epoch": 0.8186601902448897, "grad_norm": 0.28790631890296936, "learning_rate": 0.0001282523173927662, "loss": 0.2221, "step": 4045 }, { "epoch": 0.81886257842542, "grad_norm": 0.2704559862613678, 
"learning_rate": 0.0001282218046180197, "loss": 0.2507, "step": 4046 }, { "epoch": 0.8190649666059502, "grad_norm": 0.29410356283187866, "learning_rate": 0.00012819128898808714, "loss": 0.2122, "step": 4047 }, { "epoch": 0.8192673547864805, "grad_norm": 0.29373201727867126, "learning_rate": 0.00012816077050605576, "loss": 0.2455, "step": 4048 }, { "epoch": 0.8194697429670107, "grad_norm": 0.27533572912216187, "learning_rate": 0.0001281302491750131, "loss": 0.209, "step": 4049 }, { "epoch": 0.819672131147541, "grad_norm": 0.2924385070800781, "learning_rate": 0.00012809972499804704, "loss": 0.197, "step": 4050 }, { "epoch": 0.819672131147541, "eval_loss": 0.27634698152542114, "eval_runtime": 0.7401, "eval_samples_per_second": 6.756, "eval_steps_per_second": 1.351, "step": 4050 }, { "epoch": 0.8198745193280712, "grad_norm": 0.5106258988380432, "learning_rate": 0.00012806919797824564, "loss": 0.2496, "step": 4051 }, { "epoch": 0.8200769075086015, "grad_norm": 0.31579113006591797, "learning_rate": 0.0001280386681186973, "loss": 0.24, "step": 4052 }, { "epoch": 0.8202792956891317, "grad_norm": 0.2940889000892639, "learning_rate": 0.00012800813542249072, "loss": 0.2201, "step": 4053 }, { "epoch": 0.820481683869662, "grad_norm": 0.325055330991745, "learning_rate": 0.0001279775998927149, "loss": 0.2579, "step": 4054 }, { "epoch": 0.8206840720501922, "grad_norm": 0.2981805205345154, "learning_rate": 0.00012794706153245906, "loss": 0.2451, "step": 4055 }, { "epoch": 0.8208864602307225, "grad_norm": 0.280321329832077, "learning_rate": 0.0001279165203448128, "loss": 0.2306, "step": 4056 }, { "epoch": 0.8210888484112527, "grad_norm": 0.336793452501297, "learning_rate": 0.00012788597633286593, "loss": 0.2391, "step": 4057 }, { "epoch": 0.821291236591783, "grad_norm": 0.2743752896785736, "learning_rate": 0.00012785542949970857, "loss": 0.2433, "step": 4058 }, { "epoch": 0.8214936247723132, "grad_norm": 0.270509272813797, "learning_rate": 0.00012782487984843116, "loss": 0.2349, 
"step": 4059 }, { "epoch": 0.8216960129528436, "grad_norm": 0.4689193665981293, "learning_rate": 0.00012779432738212437, "loss": 0.2196, "step": 4060 }, { "epoch": 0.8218984011333738, "grad_norm": 0.3306290805339813, "learning_rate": 0.00012776377210387913, "loss": 0.2352, "step": 4061 }, { "epoch": 0.8221007893139041, "grad_norm": 0.28863513469696045, "learning_rate": 0.0001277332140167868, "loss": 0.2359, "step": 4062 }, { "epoch": 0.8223031774944344, "grad_norm": 0.2727032005786896, "learning_rate": 0.00012770265312393887, "loss": 0.1916, "step": 4063 }, { "epoch": 0.8225055656749646, "grad_norm": 0.2952982485294342, "learning_rate": 0.00012767208942842715, "loss": 0.2046, "step": 4064 }, { "epoch": 0.8227079538554949, "grad_norm": 0.27022436261177063, "learning_rate": 0.00012764152293334382, "loss": 0.2266, "step": 4065 }, { "epoch": 0.8229103420360251, "grad_norm": 0.35855549573898315, "learning_rate": 0.00012761095364178124, "loss": 0.2852, "step": 4066 }, { "epoch": 0.8231127302165554, "grad_norm": 0.2672812044620514, "learning_rate": 0.00012758038155683205, "loss": 0.199, "step": 4067 }, { "epoch": 0.8233151183970856, "grad_norm": 0.36959728598594666, "learning_rate": 0.00012754980668158928, "loss": 0.2756, "step": 4068 }, { "epoch": 0.8235175065776159, "grad_norm": 0.27906864881515503, "learning_rate": 0.00012751922901914616, "loss": 0.2363, "step": 4069 }, { "epoch": 0.8237198947581461, "grad_norm": 0.2816850244998932, "learning_rate": 0.00012748864857259617, "loss": 0.2152, "step": 4070 }, { "epoch": 0.8239222829386764, "grad_norm": 0.27567291259765625, "learning_rate": 0.00012745806534503315, "loss": 0.2369, "step": 4071 }, { "epoch": 0.8241246711192066, "grad_norm": 0.27985164523124695, "learning_rate": 0.0001274274793395512, "loss": 0.2626, "step": 4072 }, { "epoch": 0.8243270592997369, "grad_norm": 0.26450982689857483, "learning_rate": 0.00012739689055924473, "loss": 0.2379, "step": 4073 }, { "epoch": 0.8245294474802671, "grad_norm": 
0.3406170904636383, "learning_rate": 0.0001273662990072083, "loss": 0.2737, "step": 4074 }, { "epoch": 0.8247318356607974, "grad_norm": 0.31535694003105164, "learning_rate": 0.0001273357046865369, "loss": 0.2237, "step": 4075 }, { "epoch": 0.8249342238413276, "grad_norm": 0.3160010576248169, "learning_rate": 0.00012730510760032573, "loss": 0.2622, "step": 4076 }, { "epoch": 0.825136612021858, "grad_norm": 0.25679177045822144, "learning_rate": 0.00012727450775167027, "loss": 0.2174, "step": 4077 }, { "epoch": 0.8253390002023882, "grad_norm": 0.2731407582759857, "learning_rate": 0.00012724390514366632, "loss": 0.2193, "step": 4078 }, { "epoch": 0.8255413883829185, "grad_norm": 0.3082418143749237, "learning_rate": 0.0001272132997794099, "loss": 0.2555, "step": 4079 }, { "epoch": 0.8257437765634487, "grad_norm": 0.2599988281726837, "learning_rate": 0.00012718269166199736, "loss": 0.236, "step": 4080 }, { "epoch": 0.825946164743979, "grad_norm": 0.28256499767303467, "learning_rate": 0.0001271520807945253, "loss": 0.2459, "step": 4081 }, { "epoch": 0.8261485529245092, "grad_norm": 0.32175830006599426, "learning_rate": 0.00012712146718009062, "loss": 0.2839, "step": 4082 }, { "epoch": 0.8263509411050395, "grad_norm": 0.2378125935792923, "learning_rate": 0.00012709085082179047, "loss": 0.2149, "step": 4083 }, { "epoch": 0.8265533292855697, "grad_norm": 0.3288223147392273, "learning_rate": 0.00012706023172272228, "loss": 0.2315, "step": 4084 }, { "epoch": 0.8267557174661, "grad_norm": 0.28977319598197937, "learning_rate": 0.00012702960988598378, "loss": 0.2359, "step": 4085 }, { "epoch": 0.8269581056466302, "grad_norm": 0.35099443793296814, "learning_rate": 0.000126998985314673, "loss": 0.2244, "step": 4086 }, { "epoch": 0.8271604938271605, "grad_norm": 0.27095192670822144, "learning_rate": 0.00012696835801188816, "loss": 0.2226, "step": 4087 }, { "epoch": 0.8273628820076907, "grad_norm": 0.27652791142463684, "learning_rate": 0.00012693772798072784, "loss": 0.206, "step": 
4088 }, { "epoch": 0.827565270188221, "grad_norm": 0.24089790880680084, "learning_rate": 0.00012690709522429085, "loss": 0.2188, "step": 4089 }, { "epoch": 0.8277676583687512, "grad_norm": 0.32007068395614624, "learning_rate": 0.0001268764597456763, "loss": 0.244, "step": 4090 }, { "epoch": 0.8279700465492815, "grad_norm": 0.2929043471813202, "learning_rate": 0.00012684582154798356, "loss": 0.2341, "step": 4091 }, { "epoch": 0.8281724347298117, "grad_norm": 0.35205259919166565, "learning_rate": 0.00012681518063431232, "loss": 0.2495, "step": 4092 }, { "epoch": 0.828374822910342, "grad_norm": 0.2678942382335663, "learning_rate": 0.00012678453700776246, "loss": 0.2477, "step": 4093 }, { "epoch": 0.8285772110908723, "grad_norm": 0.3012922704219818, "learning_rate": 0.00012675389067143416, "loss": 0.2505, "step": 4094 }, { "epoch": 0.8287795992714025, "grad_norm": 0.2550067901611328, "learning_rate": 0.00012672324162842796, "loss": 0.2433, "step": 4095 }, { "epoch": 0.8289819874519329, "grad_norm": 0.26899462938308716, "learning_rate": 0.00012669258988184457, "loss": 0.2533, "step": 4096 }, { "epoch": 0.829184375632463, "grad_norm": 0.335483193397522, "learning_rate": 0.00012666193543478502, "loss": 0.2151, "step": 4097 }, { "epoch": 0.8293867638129934, "grad_norm": 0.29949456453323364, "learning_rate": 0.00012663127829035058, "loss": 0.2246, "step": 4098 }, { "epoch": 0.8295891519935236, "grad_norm": 0.2744475305080414, "learning_rate": 0.00012660061845164286, "loss": 0.2368, "step": 4099 }, { "epoch": 0.8297915401740539, "grad_norm": 0.28630226850509644, "learning_rate": 0.0001265699559217637, "loss": 0.2577, "step": 4100 }, { "epoch": 0.8297915401740539, "eval_loss": 0.2671542763710022, "eval_runtime": 0.7421, "eval_samples_per_second": 6.738, "eval_steps_per_second": 1.348, "step": 4100 }, { "epoch": 0.8299939283545841, "grad_norm": 0.26792484521865845, "learning_rate": 0.00012653929070381514, "loss": 0.2567, "step": 4101 }, { "epoch": 0.8301963165351144, 
"grad_norm": 0.3261709213256836, "learning_rate": 0.00012650862280089967, "loss": 0.2265, "step": 4102 }, { "epoch": 0.8303987047156446, "grad_norm": 0.3107450306415558, "learning_rate": 0.00012647795221611987, "loss": 0.2699, "step": 4103 }, { "epoch": 0.8306010928961749, "grad_norm": 0.3134928047657013, "learning_rate": 0.00012644727895257872, "loss": 0.2574, "step": 4104 }, { "epoch": 0.8308034810767051, "grad_norm": 0.44662341475486755, "learning_rate": 0.00012641660301337937, "loss": 0.2677, "step": 4105 }, { "epoch": 0.8310058692572354, "grad_norm": 0.3029615581035614, "learning_rate": 0.00012638592440162533, "loss": 0.2132, "step": 4106 }, { "epoch": 0.8312082574377656, "grad_norm": 0.3081475496292114, "learning_rate": 0.0001263552431204203, "loss": 0.2333, "step": 4107 }, { "epoch": 0.8314106456182959, "grad_norm": 0.2571415305137634, "learning_rate": 0.0001263245591728683, "loss": 0.2376, "step": 4108 }, { "epoch": 0.8316130337988261, "grad_norm": 0.3064200282096863, "learning_rate": 0.00012629387256207365, "loss": 0.2657, "step": 4109 }, { "epoch": 0.8318154219793564, "grad_norm": 0.34786108136177063, "learning_rate": 0.00012626318329114089, "loss": 0.2352, "step": 4110 }, { "epoch": 0.8320178101598866, "grad_norm": 0.3301227390766144, "learning_rate": 0.0001262324913631748, "loss": 0.2761, "step": 4111 }, { "epoch": 0.8322201983404169, "grad_norm": 0.2794135510921478, "learning_rate": 0.00012620179678128051, "loss": 0.2733, "step": 4112 }, { "epoch": 0.8324225865209471, "grad_norm": 0.25935810804367065, "learning_rate": 0.00012617109954856333, "loss": 0.2474, "step": 4113 }, { "epoch": 0.8326249747014774, "grad_norm": 0.34428855776786804, "learning_rate": 0.00012614039966812892, "loss": 0.2705, "step": 4114 }, { "epoch": 0.8328273628820076, "grad_norm": 0.27258262038230896, "learning_rate": 0.00012610969714308315, "loss": 0.2574, "step": 4115 }, { "epoch": 0.833029751062538, "grad_norm": 0.2827422320842743, "learning_rate": 0.0001260789919765322, "loss": 
0.2316, "step": 4116 }, { "epoch": 0.8332321392430682, "grad_norm": 0.41177263855934143, "learning_rate": 0.00012604828417158248, "loss": 0.2643, "step": 4117 }, { "epoch": 0.8334345274235985, "grad_norm": 0.41825127601623535, "learning_rate": 0.0001260175737313407, "loss": 0.2359, "step": 4118 }, { "epoch": 0.8336369156041287, "grad_norm": 0.2745526134967804, "learning_rate": 0.0001259868606589138, "loss": 0.2481, "step": 4119 }, { "epoch": 0.833839303784659, "grad_norm": 0.330802321434021, "learning_rate": 0.00012595614495740902, "loss": 0.2572, "step": 4120 }, { "epoch": 0.8340416919651892, "grad_norm": 0.2694607377052307, "learning_rate": 0.00012592542662993384, "loss": 0.2102, "step": 4121 }, { "epoch": 0.8342440801457195, "grad_norm": 0.31097790598869324, "learning_rate": 0.00012589470567959601, "loss": 0.2288, "step": 4122 }, { "epoch": 0.8344464683262498, "grad_norm": 0.30229610204696655, "learning_rate": 0.0001258639821095036, "loss": 0.2442, "step": 4123 }, { "epoch": 0.83464885650678, "grad_norm": 0.28008151054382324, "learning_rate": 0.00012583325592276486, "loss": 0.249, "step": 4124 }, { "epoch": 0.8348512446873103, "grad_norm": 0.3760617971420288, "learning_rate": 0.00012580252712248832, "loss": 0.2887, "step": 4125 }, { "epoch": 0.8350536328678405, "grad_norm": 0.27306854724884033, "learning_rate": 0.00012577179571178287, "loss": 0.2536, "step": 4126 }, { "epoch": 0.8352560210483708, "grad_norm": 0.3283601701259613, "learning_rate": 0.0001257410616937575, "loss": 0.253, "step": 4127 }, { "epoch": 0.835458409228901, "grad_norm": 0.26647692918777466, "learning_rate": 0.0001257103250715217, "loss": 0.2347, "step": 4128 }, { "epoch": 0.8356607974094313, "grad_norm": 0.31827718019485474, "learning_rate": 0.00012567958584818492, "loss": 0.2342, "step": 4129 }, { "epoch": 0.8358631855899615, "grad_norm": 0.3126533031463623, "learning_rate": 0.0001256488440268571, "loss": 0.2565, "step": 4130 }, { "epoch": 0.8360655737704918, "grad_norm": 
0.2776491940021515, "learning_rate": 0.00012561809961064837, "loss": 0.2314, "step": 4131 }, { "epoch": 0.836267961951022, "grad_norm": 0.25948649644851685, "learning_rate": 0.00012558735260266915, "loss": 0.224, "step": 4132 }, { "epoch": 0.8364703501315524, "grad_norm": 0.3881494998931885, "learning_rate": 0.00012555660300603004, "loss": 0.2598, "step": 4133 }, { "epoch": 0.8366727383120826, "grad_norm": 0.31026434898376465, "learning_rate": 0.00012552585082384202, "loss": 0.2294, "step": 4134 }, { "epoch": 0.8368751264926129, "grad_norm": 0.2892588973045349, "learning_rate": 0.00012549509605921626, "loss": 0.2383, "step": 4135 }, { "epoch": 0.8370775146731431, "grad_norm": 0.28856706619262695, "learning_rate": 0.0001254643387152642, "loss": 0.2234, "step": 4136 }, { "epoch": 0.8372799028536734, "grad_norm": 0.32769984006881714, "learning_rate": 0.0001254335787950975, "loss": 0.28, "step": 4137 }, { "epoch": 0.8374822910342036, "grad_norm": 0.34366121888160706, "learning_rate": 0.0001254028163018282, "loss": 0.2125, "step": 4138 }, { "epoch": 0.8376846792147339, "grad_norm": 0.24694864451885223, "learning_rate": 0.0001253720512385685, "loss": 0.219, "step": 4139 }, { "epoch": 0.8378870673952641, "grad_norm": 0.2664625644683838, "learning_rate": 0.00012534128360843088, "loss": 0.2167, "step": 4140 }, { "epoch": 0.8380894555757944, "grad_norm": 0.27844277024269104, "learning_rate": 0.0001253105134145281, "loss": 0.2333, "step": 4141 }, { "epoch": 0.8382918437563246, "grad_norm": 0.34461063146591187, "learning_rate": 0.00012527974065997314, "loss": 0.2255, "step": 4142 }, { "epoch": 0.8384942319368549, "grad_norm": 0.37902191281318665, "learning_rate": 0.00012524896534787927, "loss": 0.2471, "step": 4143 }, { "epoch": 0.8386966201173851, "grad_norm": 0.3178468942642212, "learning_rate": 0.00012521818748136005, "loss": 0.2689, "step": 4144 }, { "epoch": 0.8388990082979154, "grad_norm": 0.37401625514030457, "learning_rate": 0.0001251874070635292, "loss": 0.2641, 
"step": 4145 }, { "epoch": 0.8391013964784456, "grad_norm": 0.4163181781768799, "learning_rate": 0.0001251566240975008, "loss": 0.2422, "step": 4146 }, { "epoch": 0.8393037846589759, "grad_norm": 0.30229562520980835, "learning_rate": 0.00012512583858638915, "loss": 0.2431, "step": 4147 }, { "epoch": 0.8395061728395061, "grad_norm": 0.2687293291091919, "learning_rate": 0.0001250950505333088, "loss": 0.2104, "step": 4148 }, { "epoch": 0.8397085610200364, "grad_norm": 0.3084995150566101, "learning_rate": 0.00012506425994137453, "loss": 0.2719, "step": 4149 }, { "epoch": 0.8399109492005666, "grad_norm": 0.30836740136146545, "learning_rate": 0.00012503346681370144, "loss": 0.2236, "step": 4150 }, { "epoch": 0.8399109492005666, "eval_loss": 0.2687932252883911, "eval_runtime": 0.7396, "eval_samples_per_second": 6.76, "eval_steps_per_second": 1.352, "step": 4150 }, { "epoch": 0.840113337381097, "grad_norm": 0.313474178314209, "learning_rate": 0.00012500267115340489, "loss": 0.229, "step": 4151 }, { "epoch": 0.8403157255616271, "grad_norm": 0.25299134850502014, "learning_rate": 0.0001249718729636004, "loss": 0.2492, "step": 4152 }, { "epoch": 0.8405181137421575, "grad_norm": 0.31801363825798035, "learning_rate": 0.0001249410722474038, "loss": 0.2544, "step": 4153 }, { "epoch": 0.8407205019226878, "grad_norm": 0.3109778165817261, "learning_rate": 0.00012491026900793127, "loss": 0.238, "step": 4154 }, { "epoch": 0.840922890103218, "grad_norm": 0.28446948528289795, "learning_rate": 0.00012487946324829904, "loss": 0.2125, "step": 4155 }, { "epoch": 0.8411252782837483, "grad_norm": 0.3104979693889618, "learning_rate": 0.0001248486549716238, "loss": 0.2325, "step": 4156 }, { "epoch": 0.8413276664642785, "grad_norm": 0.3089185953140259, "learning_rate": 0.00012481784418102242, "loss": 0.2224, "step": 4157 }, { "epoch": 0.8415300546448088, "grad_norm": 0.28800642490386963, "learning_rate": 0.00012478703087961192, "loss": 0.2248, "step": 4158 }, { "epoch": 0.841732442825339, 
"grad_norm": 0.26682931184768677, "learning_rate": 0.00012475621507050975, "loss": 0.2331, "step": 4159 }, { "epoch": 0.8419348310058693, "grad_norm": 0.24975162744522095, "learning_rate": 0.0001247253967568335, "loss": 0.2743, "step": 4160 }, { "epoch": 0.8421372191863995, "grad_norm": 0.2340884804725647, "learning_rate": 0.00012469457594170105, "loss": 0.2211, "step": 4161 }, { "epoch": 0.8423396073669298, "grad_norm": 0.4196295738220215, "learning_rate": 0.0001246637526282305, "loss": 0.2465, "step": 4162 }, { "epoch": 0.84254199554746, "grad_norm": 0.34679028391838074, "learning_rate": 0.00012463292681954029, "loss": 0.2195, "step": 4163 }, { "epoch": 0.8427443837279903, "grad_norm": 0.4965565800666809, "learning_rate": 0.000124602098518749, "loss": 0.2472, "step": 4164 }, { "epoch": 0.8429467719085205, "grad_norm": 0.31285765767097473, "learning_rate": 0.00012457126772897554, "loss": 0.2732, "step": 4165 }, { "epoch": 0.8431491600890508, "grad_norm": 0.32587265968322754, "learning_rate": 0.000124540434453339, "loss": 0.2426, "step": 4166 }, { "epoch": 0.843351548269581, "grad_norm": 0.30432063341140747, "learning_rate": 0.00012450959869495884, "loss": 0.2891, "step": 4167 }, { "epoch": 0.8435539364501113, "grad_norm": 0.2635737359523773, "learning_rate": 0.00012447876045695465, "loss": 0.2311, "step": 4168 }, { "epoch": 0.8437563246306415, "grad_norm": 0.28429943323135376, "learning_rate": 0.00012444791974244632, "loss": 0.2306, "step": 4169 }, { "epoch": 0.8439587128111719, "grad_norm": 0.2997816503047943, "learning_rate": 0.000124417076554554, "loss": 0.2489, "step": 4170 }, { "epoch": 0.844161100991702, "grad_norm": 0.258544385433197, "learning_rate": 0.00012438623089639807, "loss": 0.2333, "step": 4171 }, { "epoch": 0.8443634891722324, "grad_norm": 0.2563791871070862, "learning_rate": 0.0001243553827710992, "loss": 0.2013, "step": 4172 }, { "epoch": 0.8445658773527626, "grad_norm": 0.28656166791915894, "learning_rate": 0.00012432453218177826, "loss": 
0.2551, "step": 4173 }, { "epoch": 0.8447682655332929, "grad_norm": 0.2514500916004181, "learning_rate": 0.0001242936791315564, "loss": 0.2311, "step": 4174 }, { "epoch": 0.8449706537138231, "grad_norm": 0.3577767014503479, "learning_rate": 0.00012426282362355497, "loss": 0.233, "step": 4175 }, { "epoch": 0.8451730418943534, "grad_norm": 0.25653162598609924, "learning_rate": 0.00012423196566089563, "loss": 0.2361, "step": 4176 }, { "epoch": 0.8453754300748836, "grad_norm": 0.26661503314971924, "learning_rate": 0.00012420110524670027, "loss": 0.2388, "step": 4177 }, { "epoch": 0.8455778182554139, "grad_norm": 0.3121111989021301, "learning_rate": 0.00012417024238409104, "loss": 0.2512, "step": 4178 }, { "epoch": 0.8457802064359441, "grad_norm": 0.24404673278331757, "learning_rate": 0.0001241393770761903, "loss": 0.23, "step": 4179 }, { "epoch": 0.8459825946164744, "grad_norm": 0.26664167642593384, "learning_rate": 0.00012410850932612067, "loss": 0.2175, "step": 4180 }, { "epoch": 0.8461849827970046, "grad_norm": 0.3187786340713501, "learning_rate": 0.000124077639137005, "loss": 0.2389, "step": 4181 }, { "epoch": 0.8463873709775349, "grad_norm": 0.27171164751052856, "learning_rate": 0.0001240467665119665, "loss": 0.2225, "step": 4182 }, { "epoch": 0.8465897591580651, "grad_norm": 0.27636775374412537, "learning_rate": 0.00012401589145412848, "loss": 0.2409, "step": 4183 }, { "epoch": 0.8467921473385954, "grad_norm": 0.2631177008152008, "learning_rate": 0.00012398501396661455, "loss": 0.208, "step": 4184 }, { "epoch": 0.8469945355191257, "grad_norm": 0.24153582751750946, "learning_rate": 0.00012395413405254853, "loss": 0.2238, "step": 4185 }, { "epoch": 0.8471969236996559, "grad_norm": 0.2937990725040436, "learning_rate": 0.0001239232517150546, "loss": 0.2307, "step": 4186 }, { "epoch": 0.8473993118801862, "grad_norm": 0.2699730694293976, "learning_rate": 0.00012389236695725713, "loss": 0.2187, "step": 4187 }, { "epoch": 0.8476017000607164, "grad_norm": 
0.31642740964889526, "learning_rate": 0.00012386147978228062, "loss": 0.2496, "step": 4188 }, { "epoch": 0.8478040882412468, "grad_norm": 0.25987985730171204, "learning_rate": 0.00012383059019325, "loss": 0.221, "step": 4189 }, { "epoch": 0.848006476421777, "grad_norm": 0.2971332371234894, "learning_rate": 0.0001237996981932903, "loss": 0.2033, "step": 4190 }, { "epoch": 0.8482088646023073, "grad_norm": 0.2770395576953888, "learning_rate": 0.00012376880378552684, "loss": 0.2042, "step": 4191 }, { "epoch": 0.8484112527828375, "grad_norm": 0.3305583894252777, "learning_rate": 0.00012373790697308524, "loss": 0.2493, "step": 4192 }, { "epoch": 0.8486136409633678, "grad_norm": 0.2884077727794647, "learning_rate": 0.0001237070077590913, "loss": 0.2318, "step": 4193 }, { "epoch": 0.848816029143898, "grad_norm": 0.2692849934101105, "learning_rate": 0.00012367610614667104, "loss": 0.2435, "step": 4194 }, { "epoch": 0.8490184173244283, "grad_norm": 0.3168359696865082, "learning_rate": 0.00012364520213895084, "loss": 0.2159, "step": 4195 }, { "epoch": 0.8492208055049585, "grad_norm": 0.2593216598033905, "learning_rate": 0.00012361429573905716, "loss": 0.2372, "step": 4196 }, { "epoch": 0.8494231936854888, "grad_norm": 0.33089300990104675, "learning_rate": 0.00012358338695011683, "loss": 0.2329, "step": 4197 }, { "epoch": 0.849625581866019, "grad_norm": 0.32414549589157104, "learning_rate": 0.00012355247577525686, "loss": 0.2377, "step": 4198 }, { "epoch": 0.8498279700465493, "grad_norm": 0.2713814973831177, "learning_rate": 0.0001235215622176045, "loss": 0.2442, "step": 4199 }, { "epoch": 0.8500303582270795, "grad_norm": 0.525406002998352, "learning_rate": 0.00012349064628028731, "loss": 0.2433, "step": 4200 }, { "epoch": 0.8500303582270795, "eval_loss": 0.26852184534072876, "eval_runtime": 0.7369, "eval_samples_per_second": 6.785, "eval_steps_per_second": 1.357, "step": 4200 }, { "epoch": 0.8502327464076098, "grad_norm": 0.33354291319847107, "learning_rate": 
0.000123459727966433, "loss": 0.2556, "step": 4201 }, { "epoch": 0.85043513458814, "grad_norm": 0.30555057525634766, "learning_rate": 0.00012342880727916962, "loss": 0.2485, "step": 4202 }, { "epoch": 0.8506375227686703, "grad_norm": 0.30072835087776184, "learning_rate": 0.0001233978842216253, "loss": 0.2212, "step": 4203 }, { "epoch": 0.8508399109492005, "grad_norm": 0.29450398683547974, "learning_rate": 0.0001233669587969286, "loss": 0.2406, "step": 4204 }, { "epoch": 0.8510422991297308, "grad_norm": 0.25204646587371826, "learning_rate": 0.00012333603100820817, "loss": 0.231, "step": 4205 }, { "epoch": 0.851244687310261, "grad_norm": 0.3143097758293152, "learning_rate": 0.000123305100858593, "loss": 0.2027, "step": 4206 }, { "epoch": 0.8514470754907913, "grad_norm": 0.2622186243534088, "learning_rate": 0.00012327416835121227, "loss": 0.2354, "step": 4207 }, { "epoch": 0.8516494636713215, "grad_norm": 0.32362186908721924, "learning_rate": 0.00012324323348919538, "loss": 0.2323, "step": 4208 }, { "epoch": 0.8518518518518519, "grad_norm": 0.27008962631225586, "learning_rate": 0.00012321229627567203, "loss": 0.2444, "step": 4209 }, { "epoch": 0.8520542400323821, "grad_norm": 0.26546764373779297, "learning_rate": 0.0001231813567137721, "loss": 0.2562, "step": 4210 }, { "epoch": 0.8522566282129124, "grad_norm": 0.3695333003997803, "learning_rate": 0.00012315041480662572, "loss": 0.2415, "step": 4211 }, { "epoch": 0.8524590163934426, "grad_norm": 0.27647483348846436, "learning_rate": 0.00012311947055736332, "loss": 0.2173, "step": 4212 }, { "epoch": 0.8526614045739729, "grad_norm": 0.29639866948127747, "learning_rate": 0.00012308852396911545, "loss": 0.2011, "step": 4213 }, { "epoch": 0.8528637927545032, "grad_norm": 0.24613668024539948, "learning_rate": 0.00012305757504501297, "loss": 0.2152, "step": 4214 }, { "epoch": 0.8530661809350334, "grad_norm": 0.2693268656730652, "learning_rate": 0.00012302662378818702, "loss": 0.2334, "step": 4215 }, { "epoch": 
0.8532685691155637, "grad_norm": 0.2689792811870575, "learning_rate": 0.0001229956702017689, "loss": 0.2235, "step": 4216 }, { "epoch": 0.8534709572960939, "grad_norm": 0.28079694509506226, "learning_rate": 0.00012296471428889017, "loss": 0.2363, "step": 4217 }, { "epoch": 0.8536733454766242, "grad_norm": 0.41205066442489624, "learning_rate": 0.00012293375605268257, "loss": 0.2761, "step": 4218 }, { "epoch": 0.8538757336571544, "grad_norm": 0.28291985392570496, "learning_rate": 0.0001229027954962782, "loss": 0.2173, "step": 4219 }, { "epoch": 0.8540781218376847, "grad_norm": 0.2866034209728241, "learning_rate": 0.0001228718326228093, "loss": 0.2426, "step": 4220 }, { "epoch": 0.8542805100182149, "grad_norm": 0.26522722840309143, "learning_rate": 0.00012284086743540837, "loss": 0.2061, "step": 4221 }, { "epoch": 0.8544828981987452, "grad_norm": 0.2686139941215515, "learning_rate": 0.00012280989993720812, "loss": 0.2458, "step": 4222 }, { "epoch": 0.8546852863792754, "grad_norm": 0.3038209080696106, "learning_rate": 0.00012277893013134153, "loss": 0.2744, "step": 4223 }, { "epoch": 0.8548876745598057, "grad_norm": 0.29698294401168823, "learning_rate": 0.00012274795802094183, "loss": 0.2576, "step": 4224 }, { "epoch": 0.8550900627403359, "grad_norm": 0.3130031228065491, "learning_rate": 0.00012271698360914241, "loss": 0.2659, "step": 4225 }, { "epoch": 0.8552924509208663, "grad_norm": 0.29156437516212463, "learning_rate": 0.00012268600689907696, "loss": 0.2503, "step": 4226 }, { "epoch": 0.8554948391013965, "grad_norm": 0.32451170682907104, "learning_rate": 0.0001226550278938794, "loss": 0.2334, "step": 4227 }, { "epoch": 0.8556972272819268, "grad_norm": 0.4321478605270386, "learning_rate": 0.0001226240465966838, "loss": 0.2359, "step": 4228 }, { "epoch": 0.855899615462457, "grad_norm": 0.2836534082889557, "learning_rate": 0.00012259306301062457, "loss": 0.2312, "step": 4229 }, { "epoch": 0.8561020036429873, "grad_norm": 0.28352999687194824, "learning_rate": 
0.00012256207713883633, "loss": 0.2497, "step": 4230 }, { "epoch": 0.8563043918235175, "grad_norm": 0.2701122760772705, "learning_rate": 0.0001225310889844538, "loss": 0.2047, "step": 4231 }, { "epoch": 0.8565067800040478, "grad_norm": 0.29781049489974976, "learning_rate": 0.00012250009855061214, "loss": 0.2241, "step": 4232 }, { "epoch": 0.856709168184578, "grad_norm": 0.2576650083065033, "learning_rate": 0.00012246910584044656, "loss": 0.2108, "step": 4233 }, { "epoch": 0.8569115563651083, "grad_norm": 0.3117930591106415, "learning_rate": 0.00012243811085709268, "loss": 0.2287, "step": 4234 }, { "epoch": 0.8571139445456385, "grad_norm": 0.2636104226112366, "learning_rate": 0.00012240711360368613, "loss": 0.2458, "step": 4235 }, { "epoch": 0.8573163327261688, "grad_norm": 0.2883046865463257, "learning_rate": 0.00012237611408336298, "loss": 0.2275, "step": 4236 }, { "epoch": 0.857518720906699, "grad_norm": 0.31685107946395874, "learning_rate": 0.00012234511229925935, "loss": 0.2821, "step": 4237 }, { "epoch": 0.8577211090872293, "grad_norm": 0.2774471342563629, "learning_rate": 0.00012231410825451177, "loss": 0.2242, "step": 4238 }, { "epoch": 0.8579234972677595, "grad_norm": 0.2992092967033386, "learning_rate": 0.00012228310195225683, "loss": 0.2295, "step": 4239 }, { "epoch": 0.8581258854482898, "grad_norm": 0.49218326807022095, "learning_rate": 0.00012225209339563145, "loss": 0.2647, "step": 4240 }, { "epoch": 0.85832827362882, "grad_norm": 0.3133196234703064, "learning_rate": 0.00012222108258777277, "loss": 0.2413, "step": 4241 }, { "epoch": 0.8585306618093503, "grad_norm": 0.3050863742828369, "learning_rate": 0.0001221900695318181, "loss": 0.2365, "step": 4242 }, { "epoch": 0.8587330499898805, "grad_norm": 0.2712414860725403, "learning_rate": 0.00012215905423090503, "loss": 0.2068, "step": 4243 }, { "epoch": 0.8589354381704108, "grad_norm": 0.2704077661037445, "learning_rate": 0.00012212803668817135, "loss": 0.1997, "step": 4244 }, { "epoch": 
0.8591378263509412, "grad_norm": 0.2854161262512207, "learning_rate": 0.00012209701690675512, "loss": 0.2451, "step": 4245 }, { "epoch": 0.8593402145314714, "grad_norm": 0.2779470384120941, "learning_rate": 0.00012206599488979458, "loss": 0.234, "step": 4246 }, { "epoch": 0.8595426027120017, "grad_norm": 0.2911601662635803, "learning_rate": 0.00012203497064042821, "loss": 0.2534, "step": 4247 }, { "epoch": 0.8597449908925319, "grad_norm": 0.24760468304157257, "learning_rate": 0.00012200394416179473, "loss": 0.207, "step": 4248 }, { "epoch": 0.8599473790730622, "grad_norm": 0.2779715359210968, "learning_rate": 0.00012197291545703306, "loss": 0.2406, "step": 4249 }, { "epoch": 0.8601497672535924, "grad_norm": 0.2952604591846466, "learning_rate": 0.00012194188452928237, "loss": 0.2307, "step": 4250 }, { "epoch": 0.8601497672535924, "eval_loss": 0.26880744099617004, "eval_runtime": 0.7389, "eval_samples_per_second": 6.767, "eval_steps_per_second": 1.353, "step": 4250 }, { "epoch": 0.8603521554341227, "grad_norm": 0.26754164695739746, "learning_rate": 0.00012191085138168205, "loss": 0.2347, "step": 4251 }, { "epoch": 0.8605545436146529, "grad_norm": 0.3067428171634674, "learning_rate": 0.00012187981601737168, "loss": 0.2407, "step": 4252 }, { "epoch": 0.8607569317951832, "grad_norm": 0.26935678720474243, "learning_rate": 0.00012184877843949109, "loss": 0.2517, "step": 4253 }, { "epoch": 0.8609593199757134, "grad_norm": 0.2682383358478546, "learning_rate": 0.00012181773865118038, "loss": 0.2088, "step": 4254 }, { "epoch": 0.8611617081562437, "grad_norm": 0.3583400249481201, "learning_rate": 0.00012178669665557978, "loss": 0.2717, "step": 4255 }, { "epoch": 0.8613640963367739, "grad_norm": 0.26008254289627075, "learning_rate": 0.00012175565245582983, "loss": 0.2705, "step": 4256 }, { "epoch": 0.8615664845173042, "grad_norm": 0.46403968334198, "learning_rate": 0.00012172460605507126, "loss": 0.2724, "step": 4257 }, { "epoch": 0.8617688726978344, "grad_norm": 
0.2515455484390259, "learning_rate": 0.00012169355745644498, "loss": 0.2391, "step": 4258 }, { "epoch": 0.8619712608783647, "grad_norm": 0.25544747710227966, "learning_rate": 0.00012166250666309218, "loss": 0.2561, "step": 4259 }, { "epoch": 0.8621736490588949, "grad_norm": 0.29672637581825256, "learning_rate": 0.00012163145367815428, "loss": 0.2362, "step": 4260 }, { "epoch": 0.8623760372394252, "grad_norm": 0.29169073700904846, "learning_rate": 0.00012160039850477286, "loss": 0.2237, "step": 4261 }, { "epoch": 0.8625784254199554, "grad_norm": 0.28334859013557434, "learning_rate": 0.00012156934114608977, "loss": 0.2228, "step": 4262 }, { "epoch": 0.8627808136004858, "grad_norm": 0.2901778221130371, "learning_rate": 0.00012153828160524707, "loss": 0.2407, "step": 4263 }, { "epoch": 0.862983201781016, "grad_norm": 0.276862233877182, "learning_rate": 0.00012150721988538703, "loss": 0.2362, "step": 4264 }, { "epoch": 0.8631855899615463, "grad_norm": 0.30135321617126465, "learning_rate": 0.00012147615598965216, "loss": 0.2148, "step": 4265 }, { "epoch": 0.8633879781420765, "grad_norm": 0.2975095212459564, "learning_rate": 0.00012144508992118518, "loss": 0.2506, "step": 4266 }, { "epoch": 0.8635903663226068, "grad_norm": 0.3389938771724701, "learning_rate": 0.000121414021683129, "loss": 0.2407, "step": 4267 }, { "epoch": 0.863792754503137, "grad_norm": 0.2970937490463257, "learning_rate": 0.00012138295127862682, "loss": 0.2218, "step": 4268 }, { "epoch": 0.8639951426836673, "grad_norm": 0.315762996673584, "learning_rate": 0.00012135187871082201, "loss": 0.2503, "step": 4269 }, { "epoch": 0.8641975308641975, "grad_norm": 0.3555409610271454, "learning_rate": 0.00012132080398285812, "loss": 0.2838, "step": 4270 }, { "epoch": 0.8643999190447278, "grad_norm": 0.29222363233566284, "learning_rate": 0.00012128972709787903, "loss": 0.2555, "step": 4271 }, { "epoch": 0.864602307225258, "grad_norm": 0.31274712085723877, "learning_rate": 0.00012125864805902873, "loss": 0.2368, 
"step": 4272 }, { "epoch": 0.8648046954057883, "grad_norm": 0.412761390209198, "learning_rate": 0.00012122756686945151, "loss": 0.2347, "step": 4273 }, { "epoch": 0.8650070835863185, "grad_norm": 0.32250848412513733, "learning_rate": 0.00012119648353229179, "loss": 0.2867, "step": 4274 }, { "epoch": 0.8652094717668488, "grad_norm": 0.30340439081192017, "learning_rate": 0.00012116539805069426, "loss": 0.2235, "step": 4275 }, { "epoch": 0.8654118599473791, "grad_norm": 0.2612261176109314, "learning_rate": 0.00012113431042780387, "loss": 0.2122, "step": 4276 }, { "epoch": 0.8656142481279093, "grad_norm": 0.2950068712234497, "learning_rate": 0.0001211032206667657, "loss": 0.2534, "step": 4277 }, { "epoch": 0.8658166363084396, "grad_norm": 0.2997754216194153, "learning_rate": 0.00012107212877072509, "loss": 0.246, "step": 4278 }, { "epoch": 0.8660190244889698, "grad_norm": 0.3149646818637848, "learning_rate": 0.0001210410347428276, "loss": 0.2493, "step": 4279 }, { "epoch": 0.8662214126695001, "grad_norm": 0.33829453587532043, "learning_rate": 0.000121009938586219, "loss": 0.2602, "step": 4280 }, { "epoch": 0.8664238008500303, "grad_norm": 0.3235696256160736, "learning_rate": 0.00012097884030404527, "loss": 0.2269, "step": 4281 }, { "epoch": 0.8666261890305607, "grad_norm": 0.23874787986278534, "learning_rate": 0.00012094773989945261, "loss": 0.2319, "step": 4282 }, { "epoch": 0.8668285772110909, "grad_norm": 0.26383423805236816, "learning_rate": 0.00012091663737558743, "loss": 0.2389, "step": 4283 }, { "epoch": 0.8670309653916212, "grad_norm": 0.28093284368515015, "learning_rate": 0.00012088553273559638, "loss": 0.2476, "step": 4284 }, { "epoch": 0.8672333535721514, "grad_norm": 0.31474965810775757, "learning_rate": 0.00012085442598262624, "loss": 0.2602, "step": 4285 }, { "epoch": 0.8674357417526817, "grad_norm": 0.25037381052970886, "learning_rate": 0.00012082331711982411, "loss": 0.2328, "step": 4286 }, { "epoch": 0.8676381299332119, "grad_norm": 
0.34945225715637207, "learning_rate": 0.00012079220615033724, "loss": 0.2495, "step": 4287 }, { "epoch": 0.8678405181137422, "grad_norm": 0.48855215311050415, "learning_rate": 0.00012076109307731314, "loss": 0.2462, "step": 4288 }, { "epoch": 0.8680429062942724, "grad_norm": 0.288464218378067, "learning_rate": 0.00012072997790389946, "loss": 0.2455, "step": 4289 }, { "epoch": 0.8682452944748027, "grad_norm": 0.3199939429759979, "learning_rate": 0.00012069886063324414, "loss": 0.2443, "step": 4290 }, { "epoch": 0.8684476826553329, "grad_norm": 0.35276976227760315, "learning_rate": 0.00012066774126849529, "loss": 0.229, "step": 4291 }, { "epoch": 0.8686500708358632, "grad_norm": 0.22685624659061432, "learning_rate": 0.00012063661981280125, "loss": 0.201, "step": 4292 }, { "epoch": 0.8688524590163934, "grad_norm": 0.3191373944282532, "learning_rate": 0.00012060549626931057, "loss": 0.2442, "step": 4293 }, { "epoch": 0.8690548471969237, "grad_norm": 0.3646087944507599, "learning_rate": 0.00012057437064117198, "loss": 0.2304, "step": 4294 }, { "epoch": 0.8692572353774539, "grad_norm": 0.2860366702079773, "learning_rate": 0.00012054324293153447, "loss": 0.2518, "step": 4295 }, { "epoch": 0.8694596235579842, "grad_norm": 0.2568869888782501, "learning_rate": 0.00012051211314354719, "loss": 0.2683, "step": 4296 }, { "epoch": 0.8696620117385144, "grad_norm": 0.3527676463127136, "learning_rate": 0.00012048098128035951, "loss": 0.2383, "step": 4297 }, { "epoch": 0.8698643999190447, "grad_norm": 0.26397374272346497, "learning_rate": 0.00012044984734512106, "loss": 0.2068, "step": 4298 }, { "epoch": 0.8700667880995749, "grad_norm": 0.28540942072868347, "learning_rate": 0.00012041871134098164, "loss": 0.2392, "step": 4299 }, { "epoch": 0.8702691762801052, "grad_norm": 0.30011117458343506, "learning_rate": 0.00012038757327109125, "loss": 0.2559, "step": 4300 }, { "epoch": 0.8702691762801052, "eval_loss": 0.26672375202178955, "eval_runtime": 0.7378, "eval_samples_per_second": 
6.777, "eval_steps_per_second": 1.355, "step": 4300 }, { "epoch": 0.8704715644606354, "grad_norm": 0.2464076280593872, "learning_rate": 0.00012035643313860013, "loss": 0.2172, "step": 4301 }, { "epoch": 0.8706739526411658, "grad_norm": 0.3207111656665802, "learning_rate": 0.0001203252909466587, "loss": 0.2681, "step": 4302 }, { "epoch": 0.870876340821696, "grad_norm": 0.26082876324653625, "learning_rate": 0.00012029414669841758, "loss": 0.246, "step": 4303 }, { "epoch": 0.8710787290022263, "grad_norm": 0.29645365476608276, "learning_rate": 0.00012026300039702766, "loss": 0.202, "step": 4304 }, { "epoch": 0.8712811171827566, "grad_norm": 0.24679231643676758, "learning_rate": 0.00012023185204563998, "loss": 0.1947, "step": 4305 }, { "epoch": 0.8714835053632868, "grad_norm": 0.3063866198062897, "learning_rate": 0.00012020070164740582, "loss": 0.2473, "step": 4306 }, { "epoch": 0.8716858935438171, "grad_norm": 0.305171936750412, "learning_rate": 0.00012016954920547661, "loss": 0.2506, "step": 4307 }, { "epoch": 0.8718882817243473, "grad_norm": 0.28420180082321167, "learning_rate": 0.00012013839472300406, "loss": 0.2275, "step": 4308 }, { "epoch": 0.8720906699048776, "grad_norm": 0.32442429661750793, "learning_rate": 0.00012010723820314, "loss": 0.2362, "step": 4309 }, { "epoch": 0.8722930580854078, "grad_norm": 0.3036497235298157, "learning_rate": 0.0001200760796490366, "loss": 0.2795, "step": 4310 }, { "epoch": 0.8724954462659381, "grad_norm": 0.2749023735523224, "learning_rate": 0.0001200449190638461, "loss": 0.2314, "step": 4311 }, { "epoch": 0.8726978344464683, "grad_norm": 0.3086685836315155, "learning_rate": 0.00012001375645072104, "loss": 0.2589, "step": 4312 }, { "epoch": 0.8729002226269986, "grad_norm": 0.27015221118927, "learning_rate": 0.00011998259181281408, "loss": 0.2445, "step": 4313 }, { "epoch": 0.8731026108075288, "grad_norm": 0.23451970517635345, "learning_rate": 0.00011995142515327815, "loss": 0.1892, "step": 4314 }, { "epoch": 0.8733049989880591, 
"grad_norm": 0.3884549140930176, "learning_rate": 0.00011992025647526639, "loss": 0.2129, "step": 4315 }, { "epoch": 0.8735073871685893, "grad_norm": 0.3305327594280243, "learning_rate": 0.0001198890857819321, "loss": 0.2281, "step": 4316 }, { "epoch": 0.8737097753491196, "grad_norm": 0.26919251680374146, "learning_rate": 0.0001198579130764288, "loss": 0.269, "step": 4317 }, { "epoch": 0.8739121635296498, "grad_norm": 0.24107398092746735, "learning_rate": 0.00011982673836191023, "loss": 0.2294, "step": 4318 }, { "epoch": 0.8741145517101802, "grad_norm": 0.30871227383613586, "learning_rate": 0.0001197955616415303, "loss": 0.2511, "step": 4319 }, { "epoch": 0.8743169398907104, "grad_norm": 0.2817114293575287, "learning_rate": 0.00011976438291844316, "loss": 0.2449, "step": 4320 }, { "epoch": 0.8745193280712407, "grad_norm": 0.2848239839076996, "learning_rate": 0.00011973320219580312, "loss": 0.2531, "step": 4321 }, { "epoch": 0.8747217162517709, "grad_norm": 0.2998313009738922, "learning_rate": 0.00011970201947676478, "loss": 0.2148, "step": 4322 }, { "epoch": 0.8749241044323012, "grad_norm": 0.24333159625530243, "learning_rate": 0.00011967083476448282, "loss": 0.1948, "step": 4323 }, { "epoch": 0.8751264926128314, "grad_norm": 0.2738083600997925, "learning_rate": 0.0001196396480621122, "loss": 0.2395, "step": 4324 }, { "epoch": 0.8753288807933617, "grad_norm": 0.23754236102104187, "learning_rate": 0.00011960845937280807, "loss": 0.2232, "step": 4325 }, { "epoch": 0.8755312689738919, "grad_norm": 0.2790989279747009, "learning_rate": 0.00011957726869972577, "loss": 0.2207, "step": 4326 }, { "epoch": 0.8757336571544222, "grad_norm": 0.32336124777793884, "learning_rate": 0.00011954607604602084, "loss": 0.2323, "step": 4327 }, { "epoch": 0.8759360453349524, "grad_norm": 0.27224284410476685, "learning_rate": 0.00011951488141484903, "loss": 0.2312, "step": 4328 }, { "epoch": 0.8761384335154827, "grad_norm": 0.2953594923019409, "learning_rate": 0.00011948368480936631, 
"loss": 0.2485, "step": 4329 }, { "epoch": 0.8763408216960129, "grad_norm": 0.281449556350708, "learning_rate": 0.00011945248623272878, "loss": 0.2772, "step": 4330 }, { "epoch": 0.8765432098765432, "grad_norm": 0.2558891475200653, "learning_rate": 0.0001194212856880928, "loss": 0.2237, "step": 4331 }, { "epoch": 0.8767455980570734, "grad_norm": 0.36965882778167725, "learning_rate": 0.00011939008317861494, "loss": 0.2371, "step": 4332 }, { "epoch": 0.8769479862376037, "grad_norm": 0.26143166422843933, "learning_rate": 0.0001193588787074519, "loss": 0.2301, "step": 4333 }, { "epoch": 0.8771503744181339, "grad_norm": 0.31695854663848877, "learning_rate": 0.00011932767227776065, "loss": 0.2345, "step": 4334 }, { "epoch": 0.8773527625986642, "grad_norm": 0.2816372215747833, "learning_rate": 0.00011929646389269833, "loss": 0.2551, "step": 4335 }, { "epoch": 0.8775551507791945, "grad_norm": 0.25129345059394836, "learning_rate": 0.00011926525355542227, "loss": 0.2437, "step": 4336 }, { "epoch": 0.8777575389597247, "grad_norm": 0.23418568074703217, "learning_rate": 0.00011923404126909, "loss": 0.2251, "step": 4337 }, { "epoch": 0.8779599271402551, "grad_norm": 0.3892250061035156, "learning_rate": 0.00011920282703685923, "loss": 0.2311, "step": 4338 }, { "epoch": 0.8781623153207853, "grad_norm": 0.3089623749256134, "learning_rate": 0.00011917161086188793, "loss": 0.2332, "step": 4339 }, { "epoch": 0.8783647035013156, "grad_norm": 0.3821837604045868, "learning_rate": 0.00011914039274733422, "loss": 0.2771, "step": 4340 }, { "epoch": 0.8785670916818458, "grad_norm": 0.2563057243824005, "learning_rate": 0.0001191091726963564, "loss": 0.2575, "step": 4341 }, { "epoch": 0.8787694798623761, "grad_norm": 0.2678040862083435, "learning_rate": 0.00011907795071211298, "loss": 0.224, "step": 4342 }, { "epoch": 0.8789718680429063, "grad_norm": 0.34176933765411377, "learning_rate": 0.00011904672679776272, "loss": 0.2652, "step": 4343 }, { "epoch": 0.8791742562234366, "grad_norm": 
0.304793119430542, "learning_rate": 0.00011901550095646447, "loss": 0.2023, "step": 4344 }, { "epoch": 0.8793766444039668, "grad_norm": 0.266438364982605, "learning_rate": 0.0001189842731913774, "loss": 0.2156, "step": 4345 }, { "epoch": 0.8795790325844971, "grad_norm": 0.2922779321670532, "learning_rate": 0.00011895304350566073, "loss": 0.2285, "step": 4346 }, { "epoch": 0.8797814207650273, "grad_norm": 0.27575254440307617, "learning_rate": 0.000118921811902474, "loss": 0.2364, "step": 4347 }, { "epoch": 0.8799838089455576, "grad_norm": 0.2569499909877777, "learning_rate": 0.0001188905783849769, "loss": 0.2504, "step": 4348 }, { "epoch": 0.8801861971260878, "grad_norm": 0.387317419052124, "learning_rate": 0.00011885934295632928, "loss": 0.2598, "step": 4349 }, { "epoch": 0.8803885853066181, "grad_norm": 0.3084860146045685, "learning_rate": 0.00011882810561969124, "loss": 0.2644, "step": 4350 }, { "epoch": 0.8803885853066181, "eval_loss": 0.27138298749923706, "eval_runtime": 0.7407, "eval_samples_per_second": 6.751, "eval_steps_per_second": 1.35, "step": 4350 }, { "epoch": 0.8805909734871483, "grad_norm": 0.2924489676952362, "learning_rate": 0.00011879686637822305, "loss": 0.2464, "step": 4351 }, { "epoch": 0.8807933616676786, "grad_norm": 0.3202839195728302, "learning_rate": 0.00011876562523508512, "loss": 0.2118, "step": 4352 }, { "epoch": 0.8809957498482088, "grad_norm": 0.32835039496421814, "learning_rate": 0.00011873438219343816, "loss": 0.2227, "step": 4353 }, { "epoch": 0.8811981380287391, "grad_norm": 0.2956985533237457, "learning_rate": 0.00011870313725644295, "loss": 0.2616, "step": 4354 }, { "epoch": 0.8814005262092693, "grad_norm": 0.2905611991882324, "learning_rate": 0.00011867189042726059, "loss": 0.2207, "step": 4355 }, { "epoch": 0.8816029143897997, "grad_norm": 0.29978570342063904, "learning_rate": 0.00011864064170905229, "loss": 0.2191, "step": 4356 }, { "epoch": 0.8818053025703299, "grad_norm": 0.2473001331090927, "learning_rate": 
0.00011860939110497945, "loss": 0.2369, "step": 4357 }, { "epoch": 0.8820076907508602, "grad_norm": 0.3887574076652527, "learning_rate": 0.00011857813861820366, "loss": 0.2523, "step": 4358 }, { "epoch": 0.8822100789313904, "grad_norm": 0.32068565487861633, "learning_rate": 0.00011854688425188673, "loss": 0.2293, "step": 4359 }, { "epoch": 0.8824124671119207, "grad_norm": 0.31508004665374756, "learning_rate": 0.00011851562800919071, "loss": 0.2661, "step": 4360 }, { "epoch": 0.8826148552924509, "grad_norm": 0.2505917549133301, "learning_rate": 0.0001184843698932777, "loss": 0.2282, "step": 4361 }, { "epoch": 0.8828172434729812, "grad_norm": 0.38696616888046265, "learning_rate": 0.00011845310990731014, "loss": 0.2559, "step": 4362 }, { "epoch": 0.8830196316535114, "grad_norm": 0.25132423639297485, "learning_rate": 0.00011842184805445051, "loss": 0.2044, "step": 4363 }, { "epoch": 0.8832220198340417, "grad_norm": 0.3269764482975006, "learning_rate": 0.00011839058433786158, "loss": 0.2734, "step": 4364 }, { "epoch": 0.8834244080145719, "grad_norm": 0.32102179527282715, "learning_rate": 0.00011835931876070632, "loss": 0.2147, "step": 4365 }, { "epoch": 0.8836267961951022, "grad_norm": 0.2468416839838028, "learning_rate": 0.00011832805132614781, "loss": 0.2289, "step": 4366 }, { "epoch": 0.8838291843756325, "grad_norm": 0.24596892297267914, "learning_rate": 0.00011829678203734937, "loss": 0.2056, "step": 4367 }, { "epoch": 0.8840315725561627, "grad_norm": 0.2805595397949219, "learning_rate": 0.00011826551089747455, "loss": 0.2155, "step": 4368 }, { "epoch": 0.884233960736693, "grad_norm": 0.345514714717865, "learning_rate": 0.00011823423790968698, "loss": 0.2542, "step": 4369 }, { "epoch": 0.8844363489172232, "grad_norm": 0.25061899423599243, "learning_rate": 0.00011820296307715053, "loss": 0.1959, "step": 4370 }, { "epoch": 0.8846387370977535, "grad_norm": 0.30302125215530396, "learning_rate": 0.0001181716864030293, "loss": 0.196, "step": 4371 }, { "epoch": 
0.8848411252782837, "grad_norm": 0.2581700086593628, "learning_rate": 0.0001181404078904875, "loss": 0.2058, "step": 4372 }, { "epoch": 0.885043513458814, "grad_norm": 0.2704077959060669, "learning_rate": 0.00011810912754268962, "loss": 0.2, "step": 4373 }, { "epoch": 0.8852459016393442, "grad_norm": 0.22900304198265076, "learning_rate": 0.00011807784536280018, "loss": 0.2031, "step": 4374 }, { "epoch": 0.8854482898198746, "grad_norm": 0.25832971930503845, "learning_rate": 0.00011804656135398404, "loss": 0.2102, "step": 4375 }, { "epoch": 0.8856506780004048, "grad_norm": 0.31184902787208557, "learning_rate": 0.00011801527551940619, "loss": 0.2341, "step": 4376 }, { "epoch": 0.8858530661809351, "grad_norm": 0.30117902159690857, "learning_rate": 0.00011798398786223179, "loss": 0.2479, "step": 4377 }, { "epoch": 0.8860554543614653, "grad_norm": 0.3672550618648529, "learning_rate": 0.00011795269838562621, "loss": 0.2278, "step": 4378 }, { "epoch": 0.8862578425419956, "grad_norm": 0.2866049110889435, "learning_rate": 0.00011792140709275498, "loss": 0.1992, "step": 4379 }, { "epoch": 0.8864602307225258, "grad_norm": 0.31390565633773804, "learning_rate": 0.00011789011398678385, "loss": 0.2485, "step": 4380 }, { "epoch": 0.8866626189030561, "grad_norm": 0.3039097189903259, "learning_rate": 0.00011785881907087866, "loss": 0.2522, "step": 4381 }, { "epoch": 0.8868650070835863, "grad_norm": 0.2775220572948456, "learning_rate": 0.00011782752234820558, "loss": 0.2482, "step": 4382 }, { "epoch": 0.8870673952641166, "grad_norm": 0.2754577100276947, "learning_rate": 0.00011779622382193083, "loss": 0.2364, "step": 4383 }, { "epoch": 0.8872697834446468, "grad_norm": 0.38729971647262573, "learning_rate": 0.00011776492349522092, "loss": 0.2182, "step": 4384 }, { "epoch": 0.8874721716251771, "grad_norm": 0.3540734350681305, "learning_rate": 0.00011773362137124244, "loss": 0.2203, "step": 4385 }, { "epoch": 0.8876745598057073, "grad_norm": 0.2709711492061615, "learning_rate": 
0.00011770231745316222, "loss": 0.2158, "step": 4386 }, { "epoch": 0.8878769479862376, "grad_norm": 0.27588939666748047, "learning_rate": 0.00011767101174414727, "loss": 0.1985, "step": 4387 }, { "epoch": 0.8880793361667678, "grad_norm": 0.27467525005340576, "learning_rate": 0.00011763970424736477, "loss": 0.2165, "step": 4388 }, { "epoch": 0.8882817243472981, "grad_norm": 0.32186320424079895, "learning_rate": 0.00011760839496598208, "loss": 0.2277, "step": 4389 }, { "epoch": 0.8884841125278283, "grad_norm": 0.33760732412338257, "learning_rate": 0.00011757708390316678, "loss": 0.2447, "step": 4390 }, { "epoch": 0.8886865007083586, "grad_norm": 0.27130693197250366, "learning_rate": 0.00011754577106208654, "loss": 0.2472, "step": 4391 }, { "epoch": 0.8888888888888888, "grad_norm": 0.2727298438549042, "learning_rate": 0.00011751445644590928, "loss": 0.238, "step": 4392 }, { "epoch": 0.8890912770694192, "grad_norm": 0.33025866746902466, "learning_rate": 0.0001174831400578031, "loss": 0.2187, "step": 4393 }, { "epoch": 0.8892936652499493, "grad_norm": 0.2998366057872772, "learning_rate": 0.00011745182190093626, "loss": 0.2469, "step": 4394 }, { "epoch": 0.8894960534304797, "grad_norm": 0.27042579650878906, "learning_rate": 0.0001174205019784772, "loss": 0.209, "step": 4395 }, { "epoch": 0.88969844161101, "grad_norm": 0.2757139205932617, "learning_rate": 0.00011738918029359453, "loss": 0.2332, "step": 4396 }, { "epoch": 0.8899008297915402, "grad_norm": 0.2801726460456848, "learning_rate": 0.00011735785684945708, "loss": 0.2433, "step": 4397 }, { "epoch": 0.8901032179720705, "grad_norm": 0.2413640022277832, "learning_rate": 0.00011732653164923381, "loss": 0.2236, "step": 4398 }, { "epoch": 0.8903056061526007, "grad_norm": 0.37460601329803467, "learning_rate": 0.00011729520469609388, "loss": 0.2886, "step": 4399 }, { "epoch": 0.890507994333131, "grad_norm": 0.2749800384044647, "learning_rate": 0.00011726387599320658, "loss": 0.2384, "step": 4400 }, { "epoch": 
0.890507994333131, "eval_loss": 0.2683194875717163, "eval_runtime": 0.7387, "eval_samples_per_second": 6.769, "eval_steps_per_second": 1.354, "step": 4400 }, { "epoch": 0.8907103825136612, "grad_norm": 0.3313538134098053, "learning_rate": 0.00011723254554374148, "loss": 0.2359, "step": 4401 }, { "epoch": 0.8909127706941915, "grad_norm": 0.2636862099170685, "learning_rate": 0.00011720121335086824, "loss": 0.2458, "step": 4402 }, { "epoch": 0.8911151588747217, "grad_norm": 0.3257393538951874, "learning_rate": 0.0001171698794177567, "loss": 0.2657, "step": 4403 }, { "epoch": 0.891317547055252, "grad_norm": 0.2799130976200104, "learning_rate": 0.00011713854374757696, "loss": 0.2262, "step": 4404 }, { "epoch": 0.8915199352357822, "grad_norm": 0.2589074671268463, "learning_rate": 0.00011710720634349916, "loss": 0.2034, "step": 4405 }, { "epoch": 0.8917223234163125, "grad_norm": 0.27647581696510315, "learning_rate": 0.00011707586720869374, "loss": 0.2543, "step": 4406 }, { "epoch": 0.8919247115968427, "grad_norm": 0.3513265550136566, "learning_rate": 0.00011704452634633129, "loss": 0.2222, "step": 4407 }, { "epoch": 0.892127099777373, "grad_norm": 0.2585983872413635, "learning_rate": 0.00011701318375958247, "loss": 0.2068, "step": 4408 }, { "epoch": 0.8923294879579032, "grad_norm": 0.3059662878513336, "learning_rate": 0.00011698183945161824, "loss": 0.2385, "step": 4409 }, { "epoch": 0.8925318761384335, "grad_norm": 0.2765025198459625, "learning_rate": 0.00011695049342560968, "loss": 0.2115, "step": 4410 }, { "epoch": 0.8927342643189637, "grad_norm": 0.3690018653869629, "learning_rate": 0.00011691914568472806, "loss": 0.2162, "step": 4411 }, { "epoch": 0.892936652499494, "grad_norm": 0.3051934242248535, "learning_rate": 0.00011688779623214481, "loss": 0.2866, "step": 4412 }, { "epoch": 0.8931390406800243, "grad_norm": 0.43420571088790894, "learning_rate": 0.00011685644507103152, "loss": 0.2625, "step": 4413 }, { "epoch": 0.8933414288605546, "grad_norm": 
0.25232359766960144, "learning_rate": 0.00011682509220456002, "loss": 0.2307, "step": 4414 }, { "epoch": 0.8935438170410848, "grad_norm": 0.2460232675075531, "learning_rate": 0.00011679373763590222, "loss": 0.172, "step": 4415 }, { "epoch": 0.8937462052216151, "grad_norm": 0.3492420017719269, "learning_rate": 0.00011676238136823025, "loss": 0.213, "step": 4416 }, { "epoch": 0.8939485934021453, "grad_norm": 0.36887046694755554, "learning_rate": 0.00011673102340471644, "loss": 0.2217, "step": 4417 }, { "epoch": 0.8941509815826756, "grad_norm": 0.2646304666996002, "learning_rate": 0.00011669966374853323, "loss": 0.2468, "step": 4418 }, { "epoch": 0.8943533697632058, "grad_norm": 0.33006051182746887, "learning_rate": 0.00011666830240285328, "loss": 0.2327, "step": 4419 }, { "epoch": 0.8945557579437361, "grad_norm": 0.39953455328941345, "learning_rate": 0.00011663693937084936, "loss": 0.2549, "step": 4420 }, { "epoch": 0.8947581461242663, "grad_norm": 0.3088074326515198, "learning_rate": 0.0001166055746556945, "loss": 0.2437, "step": 4421 }, { "epoch": 0.8949605343047966, "grad_norm": 0.5596060156822205, "learning_rate": 0.00011657420826056184, "loss": 0.2091, "step": 4422 }, { "epoch": 0.8951629224853268, "grad_norm": 0.4558367431163788, "learning_rate": 0.00011654284018862471, "loss": 0.2282, "step": 4423 }, { "epoch": 0.8953653106658571, "grad_norm": 0.3053852617740631, "learning_rate": 0.00011651147044305656, "loss": 0.2614, "step": 4424 }, { "epoch": 0.8955676988463873, "grad_norm": 0.2652442157268524, "learning_rate": 0.00011648009902703112, "loss": 0.2064, "step": 4425 }, { "epoch": 0.8957700870269176, "grad_norm": 0.31589120626449585, "learning_rate": 0.00011644872594372218, "loss": 0.2637, "step": 4426 }, { "epoch": 0.8959724752074479, "grad_norm": 0.25115832686424255, "learning_rate": 0.00011641735119630372, "loss": 0.218, "step": 4427 }, { "epoch": 0.8961748633879781, "grad_norm": 0.3270891010761261, "learning_rate": 0.00011638597478794995, "loss": 0.2325, 
"step": 4428 }, { "epoch": 0.8963772515685084, "grad_norm": 0.23799145221710205, "learning_rate": 0.0001163545967218352, "loss": 0.1974, "step": 4429 }, { "epoch": 0.8965796397490386, "grad_norm": 0.3497200906276703, "learning_rate": 0.00011632321700113393, "loss": 0.2611, "step": 4430 }, { "epoch": 0.896782027929569, "grad_norm": 0.24682220816612244, "learning_rate": 0.00011629183562902087, "loss": 0.254, "step": 4431 }, { "epoch": 0.8969844161100992, "grad_norm": 0.28864797949790955, "learning_rate": 0.0001162604526086708, "loss": 0.2637, "step": 4432 }, { "epoch": 0.8971868042906295, "grad_norm": 0.39302846789360046, "learning_rate": 0.00011622906794325877, "loss": 0.2285, "step": 4433 }, { "epoch": 0.8973891924711597, "grad_norm": 0.4144213795661926, "learning_rate": 0.00011619768163595991, "loss": 0.2094, "step": 4434 }, { "epoch": 0.89759158065169, "grad_norm": 0.3300606906414032, "learning_rate": 0.00011616629368994962, "loss": 0.2707, "step": 4435 }, { "epoch": 0.8977939688322202, "grad_norm": 0.26967158913612366, "learning_rate": 0.00011613490410840335, "loss": 0.2392, "step": 4436 }, { "epoch": 0.8979963570127505, "grad_norm": 0.3630208671092987, "learning_rate": 0.0001161035128944968, "loss": 0.2342, "step": 4437 }, { "epoch": 0.8981987451932807, "grad_norm": 0.29514622688293457, "learning_rate": 0.00011607212005140576, "loss": 0.2395, "step": 4438 }, { "epoch": 0.898401133373811, "grad_norm": 0.34164735674858093, "learning_rate": 0.00011604072558230625, "loss": 0.2391, "step": 4439 }, { "epoch": 0.8986035215543412, "grad_norm": 0.2891792058944702, "learning_rate": 0.00011600932949037449, "loss": 0.2289, "step": 4440 }, { "epoch": 0.8988059097348715, "grad_norm": 0.2580989599227905, "learning_rate": 0.00011597793177878671, "loss": 0.2443, "step": 4441 }, { "epoch": 0.8990082979154017, "grad_norm": 0.31715089082717896, "learning_rate": 0.00011594653245071946, "loss": 0.2453, "step": 4442 }, { "epoch": 0.899210686095932, "grad_norm": 0.3037600517272949, 
"learning_rate": 0.00011591513150934937, "loss": 0.2557, "step": 4443 }, { "epoch": 0.8994130742764622, "grad_norm": 0.2914448380470276, "learning_rate": 0.00011588372895785328, "loss": 0.2609, "step": 4444 }, { "epoch": 0.8996154624569925, "grad_norm": 0.3002516031265259, "learning_rate": 0.00011585232479940815, "loss": 0.2394, "step": 4445 }, { "epoch": 0.8998178506375227, "grad_norm": 0.33220770955085754, "learning_rate": 0.00011582091903719114, "loss": 0.2564, "step": 4446 }, { "epoch": 0.900020238818053, "grad_norm": 0.26941895484924316, "learning_rate": 0.00011578951167437957, "loss": 0.2413, "step": 4447 }, { "epoch": 0.9002226269985832, "grad_norm": 0.3035530745983124, "learning_rate": 0.00011575810271415086, "loss": 0.2415, "step": 4448 }, { "epoch": 0.9004250151791136, "grad_norm": 0.30377620458602905, "learning_rate": 0.00011572669215968269, "loss": 0.2435, "step": 4449 }, { "epoch": 0.9006274033596438, "grad_norm": 0.36584311723709106, "learning_rate": 0.0001156952800141528, "loss": 0.2269, "step": 4450 }, { "epoch": 0.9006274033596438, "eval_loss": 0.2701588571071625, "eval_runtime": 0.7366, "eval_samples_per_second": 6.788, "eval_steps_per_second": 1.358, "step": 4450 }, { "epoch": 0.9008297915401741, "grad_norm": 0.2716978192329407, "learning_rate": 0.0001156638662807392, "loss": 0.2314, "step": 4451 }, { "epoch": 0.9010321797207043, "grad_norm": 0.2875329852104187, "learning_rate": 0.00011563245096261994, "loss": 0.2171, "step": 4452 }, { "epoch": 0.9012345679012346, "grad_norm": 0.2926979959011078, "learning_rate": 0.00011560103406297331, "loss": 0.2255, "step": 4453 }, { "epoch": 0.9014369560817648, "grad_norm": 0.2483060508966446, "learning_rate": 0.00011556961558497779, "loss": 0.2231, "step": 4454 }, { "epoch": 0.9016393442622951, "grad_norm": 0.3437354266643524, "learning_rate": 0.00011553819553181191, "loss": 0.2311, "step": 4455 }, { "epoch": 0.9018417324428254, "grad_norm": 0.28860417008399963, "learning_rate": 0.00011550677390665445, 
"loss": 0.2432, "step": 4456 }, { "epoch": 0.9020441206233556, "grad_norm": 0.2693041265010834, "learning_rate": 0.00011547535071268432, "loss": 0.2399, "step": 4457 }, { "epoch": 0.9022465088038859, "grad_norm": 0.2697674632072449, "learning_rate": 0.00011544392595308058, "loss": 0.2263, "step": 4458 }, { "epoch": 0.9024488969844161, "grad_norm": 0.31832581758499146, "learning_rate": 0.00011541249963102245, "loss": 0.2526, "step": 4459 }, { "epoch": 0.9026512851649464, "grad_norm": 0.3540724515914917, "learning_rate": 0.00011538107174968935, "loss": 0.2643, "step": 4460 }, { "epoch": 0.9028536733454766, "grad_norm": 0.3032924234867096, "learning_rate": 0.00011534964231226082, "loss": 0.2556, "step": 4461 }, { "epoch": 0.9030560615260069, "grad_norm": 0.30924227833747864, "learning_rate": 0.00011531821132191653, "loss": 0.2331, "step": 4462 }, { "epoch": 0.9032584497065371, "grad_norm": 0.2958310544490814, "learning_rate": 0.00011528677878183634, "loss": 0.2366, "step": 4463 }, { "epoch": 0.9034608378870674, "grad_norm": 0.3167951703071594, "learning_rate": 0.00011525534469520027, "loss": 0.197, "step": 4464 }, { "epoch": 0.9036632260675976, "grad_norm": 0.28465691208839417, "learning_rate": 0.00011522390906518851, "loss": 0.2322, "step": 4465 }, { "epoch": 0.903865614248128, "grad_norm": 0.3153257966041565, "learning_rate": 0.00011519247189498137, "loss": 0.2502, "step": 4466 }, { "epoch": 0.9040680024286581, "grad_norm": 0.31020671129226685, "learning_rate": 0.00011516103318775932, "loss": 0.269, "step": 4467 }, { "epoch": 0.9042703906091885, "grad_norm": 0.29223933815956116, "learning_rate": 0.00011512959294670305, "loss": 0.2317, "step": 4468 }, { "epoch": 0.9044727787897187, "grad_norm": 0.2880360186100006, "learning_rate": 0.0001150981511749933, "loss": 0.223, "step": 4469 }, { "epoch": 0.904675166970249, "grad_norm": 0.3483765423297882, "learning_rate": 0.00011506670787581101, "loss": 0.218, "step": 4470 }, { "epoch": 0.9048775551507792, "grad_norm": 
0.30725324153900146, "learning_rate": 0.00011503526305233734, "loss": 0.2474, "step": 4471 }, { "epoch": 0.9050799433313095, "grad_norm": 0.3243599236011505, "learning_rate": 0.0001150038167077535, "loss": 0.2513, "step": 4472 }, { "epoch": 0.9052823315118397, "grad_norm": 0.3006613254547119, "learning_rate": 0.00011497236884524094, "loss": 0.2502, "step": 4473 }, { "epoch": 0.90548471969237, "grad_norm": 0.26012781262397766, "learning_rate": 0.0001149409194679812, "loss": 0.2439, "step": 4474 }, { "epoch": 0.9056871078729002, "grad_norm": 0.36949893832206726, "learning_rate": 0.000114909468579156, "loss": 0.2073, "step": 4475 }, { "epoch": 0.9058894960534305, "grad_norm": 0.25878971815109253, "learning_rate": 0.0001148780161819472, "loss": 0.1932, "step": 4476 }, { "epoch": 0.9060918842339607, "grad_norm": 0.2846607267856598, "learning_rate": 0.00011484656227953685, "loss": 0.2044, "step": 4477 }, { "epoch": 0.906294272414491, "grad_norm": 0.3530745506286621, "learning_rate": 0.0001148151068751071, "loss": 0.2143, "step": 4478 }, { "epoch": 0.9064966605950212, "grad_norm": 0.2996197044849396, "learning_rate": 0.0001147836499718403, "loss": 0.242, "step": 4479 }, { "epoch": 0.9066990487755515, "grad_norm": 0.341861367225647, "learning_rate": 0.00011475219157291892, "loss": 0.2284, "step": 4480 }, { "epoch": 0.9069014369560817, "grad_norm": 0.29489073157310486, "learning_rate": 0.00011472073168152557, "loss": 0.2442, "step": 4481 }, { "epoch": 0.907103825136612, "grad_norm": 0.24275663495063782, "learning_rate": 0.00011468927030084307, "loss": 0.2096, "step": 4482 }, { "epoch": 0.9073062133171422, "grad_norm": 0.3213668763637543, "learning_rate": 0.00011465780743405432, "loss": 0.2298, "step": 4483 }, { "epoch": 0.9075086014976725, "grad_norm": 0.2886224687099457, "learning_rate": 0.00011462634308434245, "loss": 0.2327, "step": 4484 }, { "epoch": 0.9077109896782027, "grad_norm": 0.2937127649784088, "learning_rate": 0.00011459487725489065, "loss": 0.2579, "step": 
4485 }, { "epoch": 0.907913377858733, "grad_norm": 0.2543278932571411, "learning_rate": 0.00011456340994888229, "loss": 0.2356, "step": 4486 }, { "epoch": 0.9081157660392634, "grad_norm": 0.25066015124320984, "learning_rate": 0.00011453194116950093, "loss": 0.2489, "step": 4487 }, { "epoch": 0.9083181542197936, "grad_norm": 0.4157361388206482, "learning_rate": 0.00011450047091993024, "loss": 0.1968, "step": 4488 }, { "epoch": 0.9085205424003239, "grad_norm": 0.27913933992385864, "learning_rate": 0.00011446899920335405, "loss": 0.258, "step": 4489 }, { "epoch": 0.9087229305808541, "grad_norm": 0.33072513341903687, "learning_rate": 0.00011443752602295634, "loss": 0.2147, "step": 4490 }, { "epoch": 0.9089253187613844, "grad_norm": 0.2571675777435303, "learning_rate": 0.00011440605138192126, "loss": 0.2203, "step": 4491 }, { "epoch": 0.9091277069419146, "grad_norm": 0.22976812720298767, "learning_rate": 0.00011437457528343305, "loss": 0.2028, "step": 4492 }, { "epoch": 0.9093300951224449, "grad_norm": 0.3034539520740509, "learning_rate": 0.00011434309773067616, "loss": 0.2498, "step": 4493 }, { "epoch": 0.9095324833029751, "grad_norm": 0.32470160722732544, "learning_rate": 0.00011431161872683512, "loss": 0.2731, "step": 4494 }, { "epoch": 0.9097348714835054, "grad_norm": 0.2735867500305176, "learning_rate": 0.00011428013827509467, "loss": 0.2614, "step": 4495 }, { "epoch": 0.9099372596640356, "grad_norm": 0.37068819999694824, "learning_rate": 0.00011424865637863967, "loss": 0.2552, "step": 4496 }, { "epoch": 0.9101396478445659, "grad_norm": 0.3341186046600342, "learning_rate": 0.00011421717304065514, "loss": 0.2616, "step": 4497 }, { "epoch": 0.9103420360250961, "grad_norm": 0.253572553396225, "learning_rate": 0.0001141856882643262, "loss": 0.2391, "step": 4498 }, { "epoch": 0.9105444242056264, "grad_norm": 0.2911362946033478, "learning_rate": 0.00011415420205283818, "loss": 0.2249, "step": 4499 }, { "epoch": 0.9107468123861566, "grad_norm": 0.30564719438552856, 
"learning_rate": 0.00011412271440937652, "loss": 0.2529, "step": 4500 }, { "epoch": 0.9107468123861566, "eval_loss": 0.27335047721862793, "eval_runtime": 0.7406, "eval_samples_per_second": 6.752, "eval_steps_per_second": 1.35, "step": 4500 }, { "epoch": 0.9109492005666869, "grad_norm": 0.3103221356868744, "learning_rate": 0.0001140912253371268, "loss": 0.2481, "step": 4501 }, { "epoch": 0.9111515887472171, "grad_norm": 0.340580016374588, "learning_rate": 0.00011405973483927474, "loss": 0.2776, "step": 4502 }, { "epoch": 0.9113539769277474, "grad_norm": 0.274093896150589, "learning_rate": 0.00011402824291900627, "loss": 0.2332, "step": 4503 }, { "epoch": 0.9115563651082776, "grad_norm": 0.3204316794872284, "learning_rate": 0.00011399674957950735, "loss": 0.2167, "step": 4504 }, { "epoch": 0.911758753288808, "grad_norm": 0.2747783660888672, "learning_rate": 0.00011396525482396419, "loss": 0.251, "step": 4505 }, { "epoch": 0.9119611414693382, "grad_norm": 0.29710814356803894, "learning_rate": 0.00011393375865556309, "loss": 0.2347, "step": 4506 }, { "epoch": 0.9121635296498685, "grad_norm": 0.3104104697704315, "learning_rate": 0.00011390226107749049, "loss": 0.2371, "step": 4507 }, { "epoch": 0.9123659178303987, "grad_norm": 0.308168888092041, "learning_rate": 0.000113870762092933, "loss": 0.2713, "step": 4508 }, { "epoch": 0.912568306010929, "grad_norm": 0.3603154718875885, "learning_rate": 0.0001138392617050773, "loss": 0.2434, "step": 4509 }, { "epoch": 0.9127706941914592, "grad_norm": 0.31686174869537354, "learning_rate": 0.00011380775991711035, "loss": 0.2446, "step": 4510 }, { "epoch": 0.9129730823719895, "grad_norm": 0.2970934510231018, "learning_rate": 0.00011377625673221912, "loss": 0.2634, "step": 4511 }, { "epoch": 0.9131754705525197, "grad_norm": 0.36884772777557373, "learning_rate": 0.0001137447521535908, "loss": 0.2426, "step": 4512 }, { "epoch": 0.91337785873305, "grad_norm": 0.2537902891635895, "learning_rate": 0.00011371324618441269, "loss": 0.2337, 
"step": 4513 }, { "epoch": 0.9135802469135802, "grad_norm": 0.3862202763557434, "learning_rate": 0.00011368173882787218, "loss": 0.2466, "step": 4514 }, { "epoch": 0.9137826350941105, "grad_norm": 0.2834358215332031, "learning_rate": 0.00011365023008715691, "loss": 0.2304, "step": 4515 }, { "epoch": 0.9139850232746407, "grad_norm": 0.31917238235473633, "learning_rate": 0.00011361871996545461, "loss": 0.2698, "step": 4516 }, { "epoch": 0.914187411455171, "grad_norm": 0.2584877610206604, "learning_rate": 0.00011358720846595313, "loss": 0.2378, "step": 4517 }, { "epoch": 0.9143897996357013, "grad_norm": 0.29169657826423645, "learning_rate": 0.00011355569559184047, "loss": 0.2413, "step": 4518 }, { "epoch": 0.9145921878162315, "grad_norm": 0.23529942333698273, "learning_rate": 0.00011352418134630473, "loss": 0.243, "step": 4519 }, { "epoch": 0.9147945759967618, "grad_norm": 0.2689376175403595, "learning_rate": 0.00011349266573253423, "loss": 0.2198, "step": 4520 }, { "epoch": 0.914996964177292, "grad_norm": 0.27592605352401733, "learning_rate": 0.00011346114875371741, "loss": 0.2286, "step": 4521 }, { "epoch": 0.9151993523578223, "grad_norm": 0.3357422947883606, "learning_rate": 0.0001134296304130428, "loss": 0.2589, "step": 4522 }, { "epoch": 0.9154017405383525, "grad_norm": 0.44812870025634766, "learning_rate": 0.0001133981107136991, "loss": 0.2527, "step": 4523 }, { "epoch": 0.9156041287188829, "grad_norm": 0.2871791422367096, "learning_rate": 0.00011336658965887514, "loss": 0.2148, "step": 4524 }, { "epoch": 0.9158065168994131, "grad_norm": 0.3729591369628906, "learning_rate": 0.0001133350672517599, "loss": 0.1677, "step": 4525 }, { "epoch": 0.9160089050799434, "grad_norm": 0.41365382075309753, "learning_rate": 0.00011330354349554249, "loss": 0.2459, "step": 4526 }, { "epoch": 0.9162112932604736, "grad_norm": 0.31599223613739014, "learning_rate": 0.00011327201839341213, "loss": 0.2652, "step": 4527 }, { "epoch": 0.9164136814410039, "grad_norm": 0.2886614501476288, 
"learning_rate": 0.00011324049194855819, "loss": 0.225, "step": 4528 }, { "epoch": 0.9166160696215341, "grad_norm": 0.28534746170043945, "learning_rate": 0.00011320896416417026, "loss": 0.2253, "step": 4529 }, { "epoch": 0.9168184578020644, "grad_norm": 0.30376002192497253, "learning_rate": 0.0001131774350434379, "loss": 0.2323, "step": 4530 }, { "epoch": 0.9170208459825946, "grad_norm": 0.42857232689857483, "learning_rate": 0.00011314590458955092, "loss": 0.238, "step": 4531 }, { "epoch": 0.9172232341631249, "grad_norm": 0.2847810983657837, "learning_rate": 0.00011311437280569925, "loss": 0.2275, "step": 4532 }, { "epoch": 0.9174256223436551, "grad_norm": 0.25568756461143494, "learning_rate": 0.00011308283969507297, "loss": 0.204, "step": 4533 }, { "epoch": 0.9176280105241854, "grad_norm": 0.3276273012161255, "learning_rate": 0.00011305130526086223, "loss": 0.2499, "step": 4534 }, { "epoch": 0.9178303987047156, "grad_norm": 0.3064262270927429, "learning_rate": 0.00011301976950625739, "loss": 0.2641, "step": 4535 }, { "epoch": 0.9180327868852459, "grad_norm": 0.29613596200942993, "learning_rate": 0.00011298823243444887, "loss": 0.2236, "step": 4536 }, { "epoch": 0.9182351750657761, "grad_norm": 0.30147165060043335, "learning_rate": 0.00011295669404862728, "loss": 0.2362, "step": 4537 }, { "epoch": 0.9184375632463064, "grad_norm": 0.31944146752357483, "learning_rate": 0.00011292515435198332, "loss": 0.2462, "step": 4538 }, { "epoch": 0.9186399514268366, "grad_norm": 0.5171250104904175, "learning_rate": 0.0001128936133477079, "loss": 0.1999, "step": 4539 }, { "epoch": 0.9188423396073669, "grad_norm": 0.34059274196624756, "learning_rate": 0.00011286207103899195, "loss": 0.2169, "step": 4540 }, { "epoch": 0.9190447277878971, "grad_norm": 0.2513200640678406, "learning_rate": 0.00011283052742902664, "loss": 0.2063, "step": 4541 }, { "epoch": 0.9192471159684275, "grad_norm": 0.26981818675994873, "learning_rate": 0.00011279898252100316, "loss": 0.1965, "step": 4542 }, { 
"epoch": 0.9194495041489577, "grad_norm": 0.3300010561943054, "learning_rate": 0.00011276743631811295, "loss": 0.2539, "step": 4543 }, { "epoch": 0.919651892329488, "grad_norm": 0.30883342027664185, "learning_rate": 0.00011273588882354749, "loss": 0.1949, "step": 4544 }, { "epoch": 0.9198542805100182, "grad_norm": 0.2752622067928314, "learning_rate": 0.00011270434004049844, "loss": 0.2371, "step": 4545 }, { "epoch": 0.9200566686905485, "grad_norm": 0.24499693512916565, "learning_rate": 0.00011267278997215756, "loss": 0.21, "step": 4546 }, { "epoch": 0.9202590568710788, "grad_norm": 0.2723928987979889, "learning_rate": 0.00011264123862171675, "loss": 0.2453, "step": 4547 }, { "epoch": 0.920461445051609, "grad_norm": 0.276915043592453, "learning_rate": 0.00011260968599236807, "loss": 0.2224, "step": 4548 }, { "epoch": 0.9206638332321393, "grad_norm": 0.2836087644100189, "learning_rate": 0.00011257813208730368, "loss": 0.2228, "step": 4549 }, { "epoch": 0.9208662214126695, "grad_norm": 0.2985590100288391, "learning_rate": 0.00011254657690971586, "loss": 0.2548, "step": 4550 }, { "epoch": 0.9208662214126695, "eval_loss": 0.2719789445400238, "eval_runtime": 0.737, "eval_samples_per_second": 6.784, "eval_steps_per_second": 1.357, "step": 4550 }, { "epoch": 0.9210686095931998, "grad_norm": 0.24112625420093536, "learning_rate": 0.00011251502046279707, "loss": 0.2018, "step": 4551 }, { "epoch": 0.92127099777373, "grad_norm": 0.33927157521247864, "learning_rate": 0.0001124834627497398, "loss": 0.2299, "step": 4552 }, { "epoch": 0.9214733859542603, "grad_norm": 0.252463161945343, "learning_rate": 0.00011245190377373676, "loss": 0.2099, "step": 4553 }, { "epoch": 0.9216757741347905, "grad_norm": 0.3403078019618988, "learning_rate": 0.00011242034353798075, "loss": 0.2567, "step": 4554 }, { "epoch": 0.9218781623153208, "grad_norm": 0.2527310848236084, "learning_rate": 0.0001123887820456647, "loss": 0.2071, "step": 4555 }, { "epoch": 0.922080550495851, "grad_norm": 
0.27652308344841003, "learning_rate": 0.00011235721929998169, "loss": 0.2249, "step": 4556 }, { "epoch": 0.9222829386763813, "grad_norm": 0.29642534255981445, "learning_rate": 0.0001123256553041249, "loss": 0.2553, "step": 4557 }, { "epoch": 0.9224853268569115, "grad_norm": 0.2769574820995331, "learning_rate": 0.00011229409006128762, "loss": 0.2369, "step": 4558 }, { "epoch": 0.9226877150374418, "grad_norm": 0.27466651797294617, "learning_rate": 0.00011226252357466331, "loss": 0.2182, "step": 4559 }, { "epoch": 0.922890103217972, "grad_norm": 0.27808988094329834, "learning_rate": 0.00011223095584744553, "loss": 0.2398, "step": 4560 }, { "epoch": 0.9230924913985024, "grad_norm": 0.2918395400047302, "learning_rate": 0.00011219938688282798, "loss": 0.232, "step": 4561 }, { "epoch": 0.9232948795790326, "grad_norm": 0.3108648657798767, "learning_rate": 0.0001121678166840045, "loss": 0.2455, "step": 4562 }, { "epoch": 0.9234972677595629, "grad_norm": 0.27647170424461365, "learning_rate": 0.000112136245254169, "loss": 0.2309, "step": 4563 }, { "epoch": 0.9236996559400931, "grad_norm": 0.25153157114982605, "learning_rate": 0.00011210467259651552, "loss": 0.2444, "step": 4564 }, { "epoch": 0.9239020441206234, "grad_norm": 0.27187663316726685, "learning_rate": 0.00011207309871423828, "loss": 0.2296, "step": 4565 }, { "epoch": 0.9241044323011536, "grad_norm": 0.3199034035205841, "learning_rate": 0.0001120415236105316, "loss": 0.2469, "step": 4566 }, { "epoch": 0.9243068204816839, "grad_norm": 0.36292028427124023, "learning_rate": 0.00011200994728858991, "loss": 0.2094, "step": 4567 }, { "epoch": 0.9245092086622141, "grad_norm": 0.30743375420570374, "learning_rate": 0.00011197836975160778, "loss": 0.2465, "step": 4568 }, { "epoch": 0.9247115968427444, "grad_norm": 0.28878483176231384, "learning_rate": 0.00011194679100277987, "loss": 0.2228, "step": 4569 }, { "epoch": 0.9249139850232746, "grad_norm": 0.25312918424606323, "learning_rate": 0.00011191521104530103, "loss": 0.2491, 
"step": 4570 }, { "epoch": 0.9251163732038049, "grad_norm": 0.25213220715522766, "learning_rate": 0.00011188362988236614, "loss": 0.2279, "step": 4571 }, { "epoch": 0.9253187613843351, "grad_norm": 0.26037511229515076, "learning_rate": 0.00011185204751717029, "loss": 0.2412, "step": 4572 }, { "epoch": 0.9255211495648654, "grad_norm": 0.2789521813392639, "learning_rate": 0.00011182046395290861, "loss": 0.2533, "step": 4573 }, { "epoch": 0.9257235377453956, "grad_norm": 0.3250572085380554, "learning_rate": 0.00011178887919277642, "loss": 0.2609, "step": 4574 }, { "epoch": 0.9259259259259259, "grad_norm": 0.2678629755973816, "learning_rate": 0.00011175729323996915, "loss": 0.1953, "step": 4575 }, { "epoch": 0.9261283141064561, "grad_norm": 0.3143162429332733, "learning_rate": 0.00011172570609768231, "loss": 0.2449, "step": 4576 }, { "epoch": 0.9263307022869864, "grad_norm": 0.2648458778858185, "learning_rate": 0.00011169411776911157, "loss": 0.2152, "step": 4577 }, { "epoch": 0.9265330904675168, "grad_norm": 0.32966744899749756, "learning_rate": 0.00011166252825745269, "loss": 0.2684, "step": 4578 }, { "epoch": 0.926735478648047, "grad_norm": 0.26951298117637634, "learning_rate": 0.00011163093756590157, "loss": 0.2348, "step": 4579 }, { "epoch": 0.9269378668285773, "grad_norm": 0.3233502507209778, "learning_rate": 0.00011159934569765425, "loss": 0.2827, "step": 4580 }, { "epoch": 0.9271402550091075, "grad_norm": 0.35136401653289795, "learning_rate": 0.00011156775265590682, "loss": 0.2508, "step": 4581 }, { "epoch": 0.9273426431896378, "grad_norm": 0.298715740442276, "learning_rate": 0.00011153615844385557, "loss": 0.2525, "step": 4582 }, { "epoch": 0.927545031370168, "grad_norm": 0.314382404088974, "learning_rate": 0.00011150456306469686, "loss": 0.218, "step": 4583 }, { "epoch": 0.9277474195506983, "grad_norm": 0.29797565937042236, "learning_rate": 0.00011147296652162716, "loss": 0.2336, "step": 4584 }, { "epoch": 0.9279498077312285, "grad_norm": 0.26795753836631775, 
"learning_rate": 0.00011144136881784311, "loss": 0.2377, "step": 4585 }, { "epoch": 0.9281521959117588, "grad_norm": 0.29854822158813477, "learning_rate": 0.0001114097699565414, "loss": 0.2337, "step": 4586 }, { "epoch": 0.928354584092289, "grad_norm": 0.301384836435318, "learning_rate": 0.0001113781699409189, "loss": 0.2445, "step": 4587 }, { "epoch": 0.9285569722728193, "grad_norm": 0.2761942446231842, "learning_rate": 0.00011134656877417254, "loss": 0.2555, "step": 4588 }, { "epoch": 0.9287593604533495, "grad_norm": 0.283226877450943, "learning_rate": 0.00011131496645949941, "loss": 0.2316, "step": 4589 }, { "epoch": 0.9289617486338798, "grad_norm": 0.3124370574951172, "learning_rate": 0.00011128336300009672, "loss": 0.2384, "step": 4590 }, { "epoch": 0.92916413681441, "grad_norm": 0.29625093936920166, "learning_rate": 0.00011125175839916173, "loss": 0.227, "step": 4591 }, { "epoch": 0.9293665249949403, "grad_norm": 0.28010037541389465, "learning_rate": 0.0001112201526598919, "loss": 0.2786, "step": 4592 }, { "epoch": 0.9295689131754705, "grad_norm": 0.24734483659267426, "learning_rate": 0.00011118854578548477, "loss": 0.187, "step": 4593 }, { "epoch": 0.9297713013560008, "grad_norm": 0.2376982867717743, "learning_rate": 0.00011115693777913796, "loss": 0.1929, "step": 4594 }, { "epoch": 0.929973689536531, "grad_norm": 0.2752760648727417, "learning_rate": 0.00011112532864404925, "loss": 0.2548, "step": 4595 }, { "epoch": 0.9301760777170613, "grad_norm": 0.2565372586250305, "learning_rate": 0.00011109371838341653, "loss": 0.246, "step": 4596 }, { "epoch": 0.9303784658975915, "grad_norm": 0.2597411870956421, "learning_rate": 0.0001110621070004378, "loss": 0.2333, "step": 4597 }, { "epoch": 0.9305808540781219, "grad_norm": 0.25775232911109924, "learning_rate": 0.00011103049449831113, "loss": 0.2496, "step": 4598 }, { "epoch": 0.930783242258652, "grad_norm": 0.2996119558811188, "learning_rate": 0.0001109988808802348, "loss": 0.2434, "step": 4599 }, { "epoch": 
0.9309856304391824, "grad_norm": 0.3157181739807129, "learning_rate": 0.00011096726614940709, "loss": 0.2835, "step": 4600 }, { "epoch": 0.9309856304391824, "eval_loss": 0.266468346118927, "eval_runtime": 0.7401, "eval_samples_per_second": 6.755, "eval_steps_per_second": 1.351, "step": 4600 }, { "epoch": 0.9311880186197126, "grad_norm": 0.2493111789226532, "learning_rate": 0.00011093565030902648, "loss": 0.1943, "step": 4601 }, { "epoch": 0.9313904068002429, "grad_norm": 0.22633974254131317, "learning_rate": 0.00011090403336229152, "loss": 0.2091, "step": 4602 }, { "epoch": 0.9315927949807731, "grad_norm": 0.27421262860298157, "learning_rate": 0.00011087241531240086, "loss": 0.1961, "step": 4603 }, { "epoch": 0.9317951831613034, "grad_norm": 0.33170488476753235, "learning_rate": 0.00011084079616255334, "loss": 0.2481, "step": 4604 }, { "epoch": 0.9319975713418336, "grad_norm": 0.29805314540863037, "learning_rate": 0.0001108091759159478, "loss": 0.2388, "step": 4605 }, { "epoch": 0.9321999595223639, "grad_norm": 0.3398219645023346, "learning_rate": 0.00011077755457578325, "loss": 0.2388, "step": 4606 }, { "epoch": 0.9324023477028941, "grad_norm": 0.27871426939964294, "learning_rate": 0.00011074593214525883, "loss": 0.2726, "step": 4607 }, { "epoch": 0.9326047358834244, "grad_norm": 0.3019596040248871, "learning_rate": 0.00011071430862757374, "loss": 0.2641, "step": 4608 }, { "epoch": 0.9328071240639547, "grad_norm": 0.2957676649093628, "learning_rate": 0.00011068268402592733, "loss": 0.2861, "step": 4609 }, { "epoch": 0.9330095122444849, "grad_norm": 0.2980649471282959, "learning_rate": 0.00011065105834351903, "loss": 0.237, "step": 4610 }, { "epoch": 0.9332119004250152, "grad_norm": 0.25749891996383667, "learning_rate": 0.00011061943158354842, "loss": 0.2209, "step": 4611 }, { "epoch": 0.9334142886055454, "grad_norm": 0.2515775263309479, "learning_rate": 0.00011058780374921516, "loss": 0.2195, "step": 4612 }, { "epoch": 0.9336166767860757, "grad_norm": 
0.3047962486743927, "learning_rate": 0.00011055617484371899, "loss": 0.2506, "step": 4613 }, { "epoch": 0.9338190649666059, "grad_norm": 0.30205193161964417, "learning_rate": 0.00011052454487025983, "loss": 0.1969, "step": 4614 }, { "epoch": 0.9340214531471362, "grad_norm": 0.3510016202926636, "learning_rate": 0.00011049291383203764, "loss": 0.2523, "step": 4615 }, { "epoch": 0.9342238413276664, "grad_norm": 0.29446735978126526, "learning_rate": 0.00011046128173225254, "loss": 0.2316, "step": 4616 }, { "epoch": 0.9344262295081968, "grad_norm": 0.2585512101650238, "learning_rate": 0.00011042964857410471, "loss": 0.2221, "step": 4617 }, { "epoch": 0.934628617688727, "grad_norm": 0.24132204055786133, "learning_rate": 0.0001103980143607945, "loss": 0.1958, "step": 4618 }, { "epoch": 0.9348310058692573, "grad_norm": 0.3184613585472107, "learning_rate": 0.0001103663790955223, "loss": 0.2287, "step": 4619 }, { "epoch": 0.9350333940497875, "grad_norm": 0.2446582168340683, "learning_rate": 0.00011033474278148864, "loss": 0.243, "step": 4620 }, { "epoch": 0.9352357822303178, "grad_norm": 0.22792799770832062, "learning_rate": 0.0001103031054218941, "loss": 0.1865, "step": 4621 }, { "epoch": 0.935438170410848, "grad_norm": 0.26008936762809753, "learning_rate": 0.00011027146701993951, "loss": 0.2276, "step": 4622 }, { "epoch": 0.9356405585913783, "grad_norm": 0.2646524906158447, "learning_rate": 0.00011023982757882564, "loss": 0.2437, "step": 4623 }, { "epoch": 0.9358429467719085, "grad_norm": 0.23059087991714478, "learning_rate": 0.00011020818710175347, "loss": 0.1967, "step": 4624 }, { "epoch": 0.9360453349524388, "grad_norm": 0.2654322683811188, "learning_rate": 0.00011017654559192403, "loss": 0.2043, "step": 4625 }, { "epoch": 0.936247723132969, "grad_norm": 0.24038441479206085, "learning_rate": 0.00011014490305253849, "loss": 0.2069, "step": 4626 }, { "epoch": 0.9364501113134993, "grad_norm": 0.3121231198310852, "learning_rate": 0.00011011325948679812, "loss": 0.2627, 
"step": 4627 }, { "epoch": 0.9366524994940295, "grad_norm": 0.2508528232574463, "learning_rate": 0.00011008161489790425, "loss": 0.194, "step": 4628 }, { "epoch": 0.9368548876745598, "grad_norm": 0.3439335823059082, "learning_rate": 0.00011004996928905842, "loss": 0.2551, "step": 4629 }, { "epoch": 0.93705727585509, "grad_norm": 0.2643868327140808, "learning_rate": 0.00011001832266346213, "loss": 0.2419, "step": 4630 }, { "epoch": 0.9372596640356203, "grad_norm": 0.2533891201019287, "learning_rate": 0.00010998667502431706, "loss": 0.207, "step": 4631 }, { "epoch": 0.9374620522161505, "grad_norm": 0.27981844544410706, "learning_rate": 0.000109955026374825, "loss": 0.2601, "step": 4632 }, { "epoch": 0.9376644403966808, "grad_norm": 0.28298893570899963, "learning_rate": 0.00010992337671818782, "loss": 0.2526, "step": 4633 }, { "epoch": 0.937866828577211, "grad_norm": 0.288486510515213, "learning_rate": 0.00010989172605760752, "loss": 0.26, "step": 4634 }, { "epoch": 0.9380692167577414, "grad_norm": 0.3344356119632721, "learning_rate": 0.00010986007439628616, "loss": 0.2326, "step": 4635 }, { "epoch": 0.9382716049382716, "grad_norm": 0.2752384841442108, "learning_rate": 0.00010982842173742595, "loss": 0.2194, "step": 4636 }, { "epoch": 0.9384739931188019, "grad_norm": 0.3313218355178833, "learning_rate": 0.00010979676808422916, "loss": 0.2558, "step": 4637 }, { "epoch": 0.9386763812993322, "grad_norm": 0.2754729688167572, "learning_rate": 0.00010976511343989814, "loss": 0.2682, "step": 4638 }, { "epoch": 0.9388787694798624, "grad_norm": 0.2828863263130188, "learning_rate": 0.00010973345780763544, "loss": 0.2409, "step": 4639 }, { "epoch": 0.9390811576603927, "grad_norm": 0.2455834299325943, "learning_rate": 0.00010970180119064361, "loss": 0.208, "step": 4640 }, { "epoch": 0.9392835458409229, "grad_norm": 0.42734503746032715, "learning_rate": 0.00010967014359212533, "loss": 0.271, "step": 4641 }, { "epoch": 0.9394859340214532, "grad_norm": 0.2849595844745636, 
"learning_rate": 0.0001096384850152834, "loss": 0.2408, "step": 4642 }, { "epoch": 0.9396883222019834, "grad_norm": 0.37806615233421326, "learning_rate": 0.00010960682546332066, "loss": 0.2634, "step": 4643 }, { "epoch": 0.9398907103825137, "grad_norm": 0.253713995218277, "learning_rate": 0.00010957516493944014, "loss": 0.19, "step": 4644 }, { "epoch": 0.9400930985630439, "grad_norm": 0.27861085534095764, "learning_rate": 0.0001095435034468449, "loss": 0.204, "step": 4645 }, { "epoch": 0.9402954867435742, "grad_norm": 0.26259860396385193, "learning_rate": 0.00010951184098873812, "loss": 0.2234, "step": 4646 }, { "epoch": 0.9404978749241044, "grad_norm": 0.3342563509941101, "learning_rate": 0.00010948017756832307, "loss": 0.238, "step": 4647 }, { "epoch": 0.9407002631046347, "grad_norm": 0.30692070722579956, "learning_rate": 0.00010944851318880314, "loss": 0.2612, "step": 4648 }, { "epoch": 0.9409026512851649, "grad_norm": 0.2919248044490814, "learning_rate": 0.00010941684785338178, "loss": 0.237, "step": 4649 }, { "epoch": 0.9411050394656952, "grad_norm": 0.26972222328186035, "learning_rate": 0.00010938518156526256, "loss": 0.1923, "step": 4650 }, { "epoch": 0.9411050394656952, "eval_loss": 0.26375117897987366, "eval_runtime": 0.737, "eval_samples_per_second": 6.784, "eval_steps_per_second": 1.357, "step": 4650 }, { "epoch": 0.9413074276462254, "grad_norm": 0.2946036159992218, "learning_rate": 0.00010935351432764915, "loss": 0.2034, "step": 4651 }, { "epoch": 0.9415098158267557, "grad_norm": 0.25022605061531067, "learning_rate": 0.00010932184614374533, "loss": 0.2109, "step": 4652 }, { "epoch": 0.941712204007286, "grad_norm": 0.2673112750053406, "learning_rate": 0.0001092901770167549, "loss": 0.2268, "step": 4653 }, { "epoch": 0.9419145921878163, "grad_norm": 0.3491578698158264, "learning_rate": 0.00010925850694988184, "loss": 0.2692, "step": 4654 }, { "epoch": 0.9421169803683465, "grad_norm": 0.30046436190605164, "learning_rate": 0.00010922683594633021, "loss": 
0.2325, "step": 4655 }, { "epoch": 0.9423193685488768, "grad_norm": 0.2795022130012512, "learning_rate": 0.00010919516400930412, "loss": 0.2397, "step": 4656 }, { "epoch": 0.942521756729407, "grad_norm": 0.2704300880432129, "learning_rate": 0.00010916349114200784, "loss": 0.2095, "step": 4657 }, { "epoch": 0.9427241449099373, "grad_norm": 0.33030474185943604, "learning_rate": 0.00010913181734764566, "loss": 0.2485, "step": 4658 }, { "epoch": 0.9429265330904675, "grad_norm": 0.3205012381076813, "learning_rate": 0.00010910014262942204, "loss": 0.2446, "step": 4659 }, { "epoch": 0.9431289212709978, "grad_norm": 0.2922936677932739, "learning_rate": 0.00010906846699054144, "loss": 0.208, "step": 4660 }, { "epoch": 0.943331309451528, "grad_norm": 0.2377457320690155, "learning_rate": 0.00010903679043420854, "loss": 0.2016, "step": 4661 }, { "epoch": 0.9435336976320583, "grad_norm": 0.2981378436088562, "learning_rate": 0.00010900511296362801, "loss": 0.2158, "step": 4662 }, { "epoch": 0.9437360858125885, "grad_norm": 0.34946900606155396, "learning_rate": 0.00010897343458200462, "loss": 0.2385, "step": 4663 }, { "epoch": 0.9439384739931188, "grad_norm": 0.3810003101825714, "learning_rate": 0.00010894175529254327, "loss": 0.2299, "step": 4664 }, { "epoch": 0.944140862173649, "grad_norm": 0.2824034094810486, "learning_rate": 0.00010891007509844894, "loss": 0.2584, "step": 4665 }, { "epoch": 0.9443432503541793, "grad_norm": 0.3092564642429352, "learning_rate": 0.00010887839400292672, "loss": 0.2426, "step": 4666 }, { "epoch": 0.9445456385347095, "grad_norm": 0.25669777393341064, "learning_rate": 0.00010884671200918175, "loss": 0.2222, "step": 4667 }, { "epoch": 0.9447480267152398, "grad_norm": 0.3152638077735901, "learning_rate": 0.0001088150291204193, "loss": 0.2484, "step": 4668 }, { "epoch": 0.9449504148957701, "grad_norm": 0.4432920217514038, "learning_rate": 0.00010878334533984467, "loss": 0.1989, "step": 4669 }, { "epoch": 0.9451528030763003, "grad_norm": 
0.301040917634964, "learning_rate": 0.00010875166067066334, "loss": 0.2654, "step": 4670 }, { "epoch": 0.9453551912568307, "grad_norm": 0.28827717900276184, "learning_rate": 0.0001087199751160808, "loss": 0.2589, "step": 4671 }, { "epoch": 0.9455575794373609, "grad_norm": 0.2627612352371216, "learning_rate": 0.0001086882886793027, "loss": 0.2177, "step": 4672 }, { "epoch": 0.9457599676178912, "grad_norm": 0.2797980308532715, "learning_rate": 0.0001086566013635347, "loss": 0.224, "step": 4673 }, { "epoch": 0.9459623557984214, "grad_norm": 0.2875703275203705, "learning_rate": 0.0001086249131719826, "loss": 0.2278, "step": 4674 }, { "epoch": 0.9461647439789517, "grad_norm": 0.2887953221797943, "learning_rate": 0.0001085932241078523, "loss": 0.2355, "step": 4675 }, { "epoch": 0.9463671321594819, "grad_norm": 0.33487871289253235, "learning_rate": 0.00010856153417434975, "loss": 0.2275, "step": 4676 }, { "epoch": 0.9465695203400122, "grad_norm": 0.308789998292923, "learning_rate": 0.000108529843374681, "loss": 0.2606, "step": 4677 }, { "epoch": 0.9467719085205424, "grad_norm": 0.2678741216659546, "learning_rate": 0.0001084981517120522, "loss": 0.2206, "step": 4678 }, { "epoch": 0.9469742967010727, "grad_norm": 0.31572091579437256, "learning_rate": 0.00010846645918966958, "loss": 0.2686, "step": 4679 }, { "epoch": 0.9471766848816029, "grad_norm": 0.29302868247032166, "learning_rate": 0.00010843476581073946, "loss": 0.2397, "step": 4680 }, { "epoch": 0.9473790730621332, "grad_norm": 0.4068066477775574, "learning_rate": 0.00010840307157846825, "loss": 0.2529, "step": 4681 }, { "epoch": 0.9475814612426634, "grad_norm": 0.3153510093688965, "learning_rate": 0.00010837137649606242, "loss": 0.232, "step": 4682 }, { "epoch": 0.9477838494231937, "grad_norm": 0.4613901376724243, "learning_rate": 0.00010833968056672854, "loss": 0.2561, "step": 4683 }, { "epoch": 0.9479862376037239, "grad_norm": 0.27325960993766785, "learning_rate": 0.00010830798379367331, "loss": 0.2204, "step": 
4684 }, { "epoch": 0.9481886257842542, "grad_norm": 0.2553834319114685, "learning_rate": 0.00010827628618010348, "loss": 0.2109, "step": 4685 }, { "epoch": 0.9483910139647844, "grad_norm": 0.24329812824726105, "learning_rate": 0.0001082445877292258, "loss": 0.2188, "step": 4686 }, { "epoch": 0.9485934021453147, "grad_norm": 0.24125301837921143, "learning_rate": 0.00010821288844424729, "loss": 0.2392, "step": 4687 }, { "epoch": 0.9487957903258449, "grad_norm": 0.29916635155677795, "learning_rate": 0.00010818118832837487, "loss": 0.2382, "step": 4688 }, { "epoch": 0.9489981785063752, "grad_norm": 0.277773916721344, "learning_rate": 0.00010814948738481568, "loss": 0.2504, "step": 4689 }, { "epoch": 0.9492005666869054, "grad_norm": 0.2472153902053833, "learning_rate": 0.00010811778561677686, "loss": 0.2376, "step": 4690 }, { "epoch": 0.9494029548674358, "grad_norm": 0.3063032627105713, "learning_rate": 0.00010808608302746568, "loss": 0.2506, "step": 4691 }, { "epoch": 0.949605343047966, "grad_norm": 0.3086594045162201, "learning_rate": 0.00010805437962008944, "loss": 0.2558, "step": 4692 }, { "epoch": 0.9498077312284963, "grad_norm": 0.2349616289138794, "learning_rate": 0.0001080226753978556, "loss": 0.2006, "step": 4693 }, { "epoch": 0.9500101194090265, "grad_norm": 0.3396564722061157, "learning_rate": 0.00010799097036397166, "loss": 0.231, "step": 4694 }, { "epoch": 0.9502125075895568, "grad_norm": 0.2693391740322113, "learning_rate": 0.00010795926452164515, "loss": 0.219, "step": 4695 }, { "epoch": 0.950414895770087, "grad_norm": 0.39639660716056824, "learning_rate": 0.00010792755787408381, "loss": 0.2265, "step": 4696 }, { "epoch": 0.9506172839506173, "grad_norm": 0.2864764630794525, "learning_rate": 0.00010789585042449532, "loss": 0.2152, "step": 4697 }, { "epoch": 0.9508196721311475, "grad_norm": 0.26854822039604187, "learning_rate": 0.00010786414217608756, "loss": 0.2353, "step": 4698 }, { "epoch": 0.9510220603116778, "grad_norm": 0.2807072401046753, 
"learning_rate": 0.00010783243313206839, "loss": 0.2415, "step": 4699 }, { "epoch": 0.9512244484922081, "grad_norm": 0.44308915734291077, "learning_rate": 0.00010780072329564583, "loss": 0.3097, "step": 4700 }, { "epoch": 0.9512244484922081, "eval_loss": 0.2648102939128876, "eval_runtime": 0.7419, "eval_samples_per_second": 6.74, "eval_steps_per_second": 1.348, "step": 4700 }, { "epoch": 0.9514268366727383, "grad_norm": 0.290475994348526, "learning_rate": 0.00010776901267002793, "loss": 0.2212, "step": 4701 }, { "epoch": 0.9516292248532686, "grad_norm": 0.2510857582092285, "learning_rate": 0.00010773730125842283, "loss": 0.212, "step": 4702 }, { "epoch": 0.9518316130337988, "grad_norm": 0.26925188302993774, "learning_rate": 0.0001077055890640388, "loss": 0.2465, "step": 4703 }, { "epoch": 0.9520340012143291, "grad_norm": 0.30950313806533813, "learning_rate": 0.00010767387609008413, "loss": 0.2292, "step": 4704 }, { "epoch": 0.9522363893948593, "grad_norm": 0.35539141297340393, "learning_rate": 0.00010764216233976718, "loss": 0.2516, "step": 4705 }, { "epoch": 0.9524387775753896, "grad_norm": 0.27495214343070984, "learning_rate": 0.00010761044781629644, "loss": 0.2038, "step": 4706 }, { "epoch": 0.9526411657559198, "grad_norm": 0.3153650462627411, "learning_rate": 0.00010757873252288047, "loss": 0.2235, "step": 4707 }, { "epoch": 0.9528435539364501, "grad_norm": 0.26855596899986267, "learning_rate": 0.00010754701646272782, "loss": 0.2216, "step": 4708 }, { "epoch": 0.9530459421169803, "grad_norm": 0.24935470521450043, "learning_rate": 0.00010751529963904727, "loss": 0.2277, "step": 4709 }, { "epoch": 0.9532483302975107, "grad_norm": 0.2733069062232971, "learning_rate": 0.00010748358205504754, "loss": 0.2274, "step": 4710 }, { "epoch": 0.9534507184780409, "grad_norm": 0.3918766677379608, "learning_rate": 0.00010745186371393751, "loss": 0.2387, "step": 4711 }, { "epoch": 0.9536531066585712, "grad_norm": 0.29222676157951355, "learning_rate": 0.00010742014461892611, 
"loss": 0.2373, "step": 4712 }, { "epoch": 0.9538554948391014, "grad_norm": 0.2915707230567932, "learning_rate": 0.0001073884247732223, "loss": 0.2251, "step": 4713 }, { "epoch": 0.9540578830196317, "grad_norm": 0.2980474829673767, "learning_rate": 0.00010735670418003523, "loss": 0.2419, "step": 4714 }, { "epoch": 0.9542602712001619, "grad_norm": 0.2788250148296356, "learning_rate": 0.00010732498284257401, "loss": 0.2131, "step": 4715 }, { "epoch": 0.9544626593806922, "grad_norm": 0.25359243154525757, "learning_rate": 0.0001072932607640479, "loss": 0.2342, "step": 4716 }, { "epoch": 0.9546650475612224, "grad_norm": 0.41266191005706787, "learning_rate": 0.00010726153794766618, "loss": 0.2305, "step": 4717 }, { "epoch": 0.9548674357417527, "grad_norm": 0.29730692505836487, "learning_rate": 0.00010722981439663829, "loss": 0.216, "step": 4718 }, { "epoch": 0.9550698239222829, "grad_norm": 0.29037201404571533, "learning_rate": 0.00010719809011417358, "loss": 0.2067, "step": 4719 }, { "epoch": 0.9552722121028132, "grad_norm": 0.2740785777568817, "learning_rate": 0.00010716636510348168, "loss": 0.2179, "step": 4720 }, { "epoch": 0.9554746002833434, "grad_norm": 0.30382829904556274, "learning_rate": 0.00010713463936777213, "loss": 0.2624, "step": 4721 }, { "epoch": 0.9556769884638737, "grad_norm": 0.2392117828130722, "learning_rate": 0.00010710291291025465, "loss": 0.202, "step": 4722 }, { "epoch": 0.9558793766444039, "grad_norm": 0.4092789590358734, "learning_rate": 0.00010707118573413894, "loss": 0.223, "step": 4723 }, { "epoch": 0.9560817648249342, "grad_norm": 0.3075420558452606, "learning_rate": 0.00010703945784263489, "loss": 0.2257, "step": 4724 }, { "epoch": 0.9562841530054644, "grad_norm": 0.25516483187675476, "learning_rate": 0.00010700772923895235, "loss": 0.2206, "step": 4725 }, { "epoch": 0.9564865411859947, "grad_norm": 0.30991610884666443, "learning_rate": 0.00010697599992630128, "loss": 0.2224, "step": 4726 }, { "epoch": 0.9566889293665249, "grad_norm": 
0.3171761631965637, "learning_rate": 0.00010694426990789174, "loss": 0.2478, "step": 4727 }, { "epoch": 0.9568913175470553, "grad_norm": 0.25520816445350647, "learning_rate": 0.00010691253918693385, "loss": 0.2256, "step": 4728 }, { "epoch": 0.9570937057275856, "grad_norm": 0.2540293335914612, "learning_rate": 0.00010688080776663778, "loss": 0.2074, "step": 4729 }, { "epoch": 0.9572960939081158, "grad_norm": 0.27108830213546753, "learning_rate": 0.00010684907565021376, "loss": 0.2256, "step": 4730 }, { "epoch": 0.9574984820886461, "grad_norm": 0.24713459610939026, "learning_rate": 0.00010681734284087215, "loss": 0.1832, "step": 4731 }, { "epoch": 0.9577008702691763, "grad_norm": 0.2772236466407776, "learning_rate": 0.00010678560934182331, "loss": 0.2324, "step": 4732 }, { "epoch": 0.9579032584497066, "grad_norm": 0.261482834815979, "learning_rate": 0.00010675387515627773, "loss": 0.2349, "step": 4733 }, { "epoch": 0.9581056466302368, "grad_norm": 0.2800223231315613, "learning_rate": 0.00010672214028744591, "loss": 0.2354, "step": 4734 }, { "epoch": 0.9583080348107671, "grad_norm": 0.2289680540561676, "learning_rate": 0.00010669040473853848, "loss": 0.1974, "step": 4735 }, { "epoch": 0.9585104229912973, "grad_norm": 0.2535719871520996, "learning_rate": 0.00010665866851276611, "loss": 0.2096, "step": 4736 }, { "epoch": 0.9587128111718276, "grad_norm": 0.25151848793029785, "learning_rate": 0.00010662693161333954, "loss": 0.2093, "step": 4737 }, { "epoch": 0.9589151993523578, "grad_norm": 0.3600228726863861, "learning_rate": 0.00010659519404346954, "loss": 0.2494, "step": 4738 }, { "epoch": 0.9591175875328881, "grad_norm": 0.26792627573013306, "learning_rate": 0.00010656345580636702, "loss": 0.2552, "step": 4739 }, { "epoch": 0.9593199757134183, "grad_norm": 0.29325777292251587, "learning_rate": 0.00010653171690524293, "loss": 0.2888, "step": 4740 }, { "epoch": 0.9595223638939486, "grad_norm": 0.2313537895679474, "learning_rate": 0.00010649997734330824, "loss": 0.2209, 
"step": 4741 }, { "epoch": 0.9597247520744788, "grad_norm": 0.28957894444465637, "learning_rate": 0.00010646823712377405, "loss": 0.2527, "step": 4742 }, { "epoch": 0.9599271402550091, "grad_norm": 0.242612823843956, "learning_rate": 0.00010643649624985148, "loss": 0.2044, "step": 4743 }, { "epoch": 0.9601295284355393, "grad_norm": 0.27495434880256653, "learning_rate": 0.00010640475472475178, "loss": 0.2192, "step": 4744 }, { "epoch": 0.9603319166160696, "grad_norm": 0.22725163400173187, "learning_rate": 0.00010637301255168619, "loss": 0.1944, "step": 4745 }, { "epoch": 0.9605343047965998, "grad_norm": 0.28637436032295227, "learning_rate": 0.00010634126973386607, "loss": 0.2644, "step": 4746 }, { "epoch": 0.9607366929771302, "grad_norm": 0.2781379818916321, "learning_rate": 0.00010630952627450279, "loss": 0.2483, "step": 4747 }, { "epoch": 0.9609390811576604, "grad_norm": 0.34088853001594543, "learning_rate": 0.00010627778217680786, "loss": 0.2224, "step": 4748 }, { "epoch": 0.9611414693381907, "grad_norm": 0.3108992576599121, "learning_rate": 0.00010624603744399282, "loss": 0.216, "step": 4749 }, { "epoch": 0.9613438575187209, "grad_norm": 0.30344733595848083, "learning_rate": 0.00010621429207926923, "loss": 0.2161, "step": 4750 }, { "epoch": 0.9613438575187209, "eval_loss": 0.27021822333335876, "eval_runtime": 0.7381, "eval_samples_per_second": 6.774, "eval_steps_per_second": 1.355, "step": 4750 }, { "epoch": 0.9615462456992512, "grad_norm": 0.30304837226867676, "learning_rate": 0.00010618254608584879, "loss": 0.2572, "step": 4751 }, { "epoch": 0.9617486338797814, "grad_norm": 0.2323007434606552, "learning_rate": 0.0001061507994669432, "loss": 0.2226, "step": 4752 }, { "epoch": 0.9619510220603117, "grad_norm": 0.24936886131763458, "learning_rate": 0.00010611905222576426, "loss": 0.1989, "step": 4753 }, { "epoch": 0.9621534102408419, "grad_norm": 0.2739543318748474, "learning_rate": 0.00010608730436552381, "loss": 0.2544, "step": 4754 }, { "epoch": 
0.9623557984213722, "grad_norm": 0.2710932791233063, "learning_rate": 0.00010605555588943378, "loss": 0.2378, "step": 4755 }, { "epoch": 0.9625581866019024, "grad_norm": 0.284679114818573, "learning_rate": 0.00010602380680070616, "loss": 0.2244, "step": 4756 }, { "epoch": 0.9627605747824327, "grad_norm": 0.33875322341918945, "learning_rate": 0.00010599205710255298, "loss": 0.2296, "step": 4757 }, { "epoch": 0.9629629629629629, "grad_norm": 0.3009338974952698, "learning_rate": 0.00010596030679818631, "loss": 0.2089, "step": 4758 }, { "epoch": 0.9631653511434932, "grad_norm": 0.30808284878730774, "learning_rate": 0.00010592855589081838, "loss": 0.2563, "step": 4759 }, { "epoch": 0.9633677393240235, "grad_norm": 0.22831407189369202, "learning_rate": 0.00010589680438366134, "loss": 0.1959, "step": 4760 }, { "epoch": 0.9635701275045537, "grad_norm": 0.23412199318408966, "learning_rate": 0.00010586505227992752, "loss": 0.2208, "step": 4761 }, { "epoch": 0.963772515685084, "grad_norm": 0.35633623600006104, "learning_rate": 0.00010583329958282926, "loss": 0.2848, "step": 4762 }, { "epoch": 0.9639749038656142, "grad_norm": 0.2718070447444916, "learning_rate": 0.00010580154629557895, "loss": 0.2401, "step": 4763 }, { "epoch": 0.9641772920461446, "grad_norm": 0.29783880710601807, "learning_rate": 0.00010576979242138904, "loss": 0.2431, "step": 4764 }, { "epoch": 0.9643796802266748, "grad_norm": 0.24454525113105774, "learning_rate": 0.0001057380379634721, "loss": 0.1961, "step": 4765 }, { "epoch": 0.9645820684072051, "grad_norm": 0.287728875875473, "learning_rate": 0.00010570628292504068, "loss": 0.2301, "step": 4766 }, { "epoch": 0.9647844565877353, "grad_norm": 0.28134074807167053, "learning_rate": 0.00010567452730930743, "loss": 0.2446, "step": 4767 }, { "epoch": 0.9649868447682656, "grad_norm": 0.3506639897823334, "learning_rate": 0.00010564277111948501, "loss": 0.2137, "step": 4768 }, { "epoch": 0.9651892329487958, "grad_norm": 0.364789217710495, "learning_rate": 
0.00010561101435878627, "loss": 0.2426, "step": 4769 }, { "epoch": 0.9653916211293261, "grad_norm": 0.3204108476638794, "learning_rate": 0.00010557925703042395, "loss": 0.2397, "step": 4770 }, { "epoch": 0.9655940093098563, "grad_norm": 0.25606632232666016, "learning_rate": 0.00010554749913761095, "loss": 0.1992, "step": 4771 }, { "epoch": 0.9657963974903866, "grad_norm": 0.28512629866600037, "learning_rate": 0.0001055157406835602, "loss": 0.2361, "step": 4772 }, { "epoch": 0.9659987856709168, "grad_norm": 0.2572082281112671, "learning_rate": 0.00010548398167148468, "loss": 0.2004, "step": 4773 }, { "epoch": 0.9662011738514471, "grad_norm": 0.2586156129837036, "learning_rate": 0.00010545222210459744, "loss": 0.2051, "step": 4774 }, { "epoch": 0.9664035620319773, "grad_norm": 0.2859581410884857, "learning_rate": 0.0001054204619861116, "loss": 0.2254, "step": 4775 }, { "epoch": 0.9666059502125076, "grad_norm": 0.23509328067302704, "learning_rate": 0.00010538870131924026, "loss": 0.2183, "step": 4776 }, { "epoch": 0.9668083383930378, "grad_norm": 0.27621352672576904, "learning_rate": 0.00010535694010719665, "loss": 0.2309, "step": 4777 }, { "epoch": 0.9670107265735681, "grad_norm": 0.2531593143939972, "learning_rate": 0.00010532517835319407, "loss": 0.1967, "step": 4778 }, { "epoch": 0.9672131147540983, "grad_norm": 0.33856093883514404, "learning_rate": 0.0001052934160604458, "loss": 0.2425, "step": 4779 }, { "epoch": 0.9674155029346286, "grad_norm": 0.27292317152023315, "learning_rate": 0.00010526165323216525, "loss": 0.2316, "step": 4780 }, { "epoch": 0.9676178911151588, "grad_norm": 0.23614521324634552, "learning_rate": 0.00010522988987156586, "loss": 0.1961, "step": 4781 }, { "epoch": 0.9678202792956891, "grad_norm": 0.2543821930885315, "learning_rate": 0.00010519812598186107, "loss": 0.215, "step": 4782 }, { "epoch": 0.9680226674762193, "grad_norm": 0.3500335216522217, "learning_rate": 0.00010516636156626445, "loss": 0.2445, "step": 4783 }, { "epoch": 
0.9682250556567497, "grad_norm": 0.2474358230829239, "learning_rate": 0.00010513459662798954, "loss": 0.2219, "step": 4784 }, { "epoch": 0.9684274438372799, "grad_norm": 0.26944953203201294, "learning_rate": 0.00010510283117025008, "loss": 0.2411, "step": 4785 }, { "epoch": 0.9686298320178102, "grad_norm": 0.37569373846054077, "learning_rate": 0.00010507106519625967, "loss": 0.2564, "step": 4786 }, { "epoch": 0.9688322201983404, "grad_norm": 0.2393825501203537, "learning_rate": 0.00010503929870923208, "loss": 0.2335, "step": 4787 }, { "epoch": 0.9690346083788707, "grad_norm": 0.23142491281032562, "learning_rate": 0.00010500753171238116, "loss": 0.2066, "step": 4788 }, { "epoch": 0.9692369965594009, "grad_norm": 0.29840072989463806, "learning_rate": 0.0001049757642089207, "loss": 0.2315, "step": 4789 }, { "epoch": 0.9694393847399312, "grad_norm": 0.27222388982772827, "learning_rate": 0.00010494399620206464, "loss": 0.2222, "step": 4790 }, { "epoch": 0.9696417729204615, "grad_norm": 0.24776479601860046, "learning_rate": 0.00010491222769502688, "loss": 0.2159, "step": 4791 }, { "epoch": 0.9698441611009917, "grad_norm": 0.362596720457077, "learning_rate": 0.0001048804586910215, "loss": 0.2469, "step": 4792 }, { "epoch": 0.970046549281522, "grad_norm": 0.2624041736125946, "learning_rate": 0.00010484868919326251, "loss": 0.2227, "step": 4793 }, { "epoch": 0.9702489374620522, "grad_norm": 0.3750174343585968, "learning_rate": 0.00010481691920496404, "loss": 0.2398, "step": 4794 }, { "epoch": 0.9704513256425825, "grad_norm": 0.2520590126514435, "learning_rate": 0.00010478514872934023, "loss": 0.2154, "step": 4795 }, { "epoch": 0.9706537138231127, "grad_norm": 0.21335478127002716, "learning_rate": 0.00010475337776960528, "loss": 0.2068, "step": 4796 }, { "epoch": 0.970856102003643, "grad_norm": 0.3522154986858368, "learning_rate": 0.00010472160632897343, "loss": 0.2391, "step": 4797 }, { "epoch": 0.9710584901841732, "grad_norm": 0.4050356149673462, "learning_rate": 
0.000104689834410659, "loss": 0.2603, "step": 4798 }, { "epoch": 0.9712608783647035, "grad_norm": 0.2647132873535156, "learning_rate": 0.00010465806201787634, "loss": 0.2358, "step": 4799 }, { "epoch": 0.9714632665452337, "grad_norm": 0.6313262581825256, "learning_rate": 0.00010462628915383983, "loss": 0.2435, "step": 4800 }, { "epoch": 0.9714632665452337, "eval_loss": 0.2704029083251953, "eval_runtime": 0.7409, "eval_samples_per_second": 6.749, "eval_steps_per_second": 1.35, "step": 4800 }, { "epoch": 0.971665654725764, "grad_norm": 0.3022230863571167, "learning_rate": 0.00010459451582176392, "loss": 0.2649, "step": 4801 }, { "epoch": 0.9718680429062942, "grad_norm": 0.2528679370880127, "learning_rate": 0.00010456274202486314, "loss": 0.2116, "step": 4802 }, { "epoch": 0.9720704310868246, "grad_norm": 0.6243422627449036, "learning_rate": 0.00010453096776635196, "loss": 0.2509, "step": 4803 }, { "epoch": 0.9722728192673548, "grad_norm": 0.32224878668785095, "learning_rate": 0.00010449919304944502, "loss": 0.2208, "step": 4804 }, { "epoch": 0.9724752074478851, "grad_norm": 0.3156091272830963, "learning_rate": 0.00010446741787735695, "loss": 0.2714, "step": 4805 }, { "epoch": 0.9726775956284153, "grad_norm": 0.37287795543670654, "learning_rate": 0.0001044356422533024, "loss": 0.2404, "step": 4806 }, { "epoch": 0.9728799838089456, "grad_norm": 0.26560112833976746, "learning_rate": 0.00010440386618049611, "loss": 0.2373, "step": 4807 }, { "epoch": 0.9730823719894758, "grad_norm": 0.332040399312973, "learning_rate": 0.00010437208966215286, "loss": 0.2552, "step": 4808 }, { "epoch": 0.9732847601700061, "grad_norm": 0.326384574174881, "learning_rate": 0.00010434031270148743, "loss": 0.2473, "step": 4809 }, { "epoch": 0.9734871483505363, "grad_norm": 0.26876839995384216, "learning_rate": 0.00010430853530171472, "loss": 0.2343, "step": 4810 }, { "epoch": 0.9736895365310666, "grad_norm": 0.3743020296096802, "learning_rate": 0.00010427675746604962, "loss": 0.2167, "step": 
4811 }, { "epoch": 0.9738919247115968, "grad_norm": 0.30613642930984497, "learning_rate": 0.00010424497919770708, "loss": 0.1993, "step": 4812 }, { "epoch": 0.9740943128921271, "grad_norm": 0.2499884068965912, "learning_rate": 0.00010421320049990207, "loss": 0.2167, "step": 4813 }, { "epoch": 0.9742967010726573, "grad_norm": 0.2823628783226013, "learning_rate": 0.00010418142137584966, "loss": 0.2131, "step": 4814 }, { "epoch": 0.9744990892531876, "grad_norm": 0.27555301785469055, "learning_rate": 0.0001041496418287649, "loss": 0.2106, "step": 4815 }, { "epoch": 0.9747014774337178, "grad_norm": 0.29797542095184326, "learning_rate": 0.00010411786186186292, "loss": 0.2279, "step": 4816 }, { "epoch": 0.9749038656142481, "grad_norm": 0.24461086094379425, "learning_rate": 0.00010408608147835888, "loss": 0.2245, "step": 4817 }, { "epoch": 0.9751062537947783, "grad_norm": 0.2839052677154541, "learning_rate": 0.00010405430068146802, "loss": 0.2658, "step": 4818 }, { "epoch": 0.9753086419753086, "grad_norm": 0.2975747585296631, "learning_rate": 0.00010402251947440554, "loss": 0.2203, "step": 4819 }, { "epoch": 0.975511030155839, "grad_norm": 0.4260922372341156, "learning_rate": 0.00010399073786038673, "loss": 0.2286, "step": 4820 }, { "epoch": 0.9757134183363692, "grad_norm": 0.4903205633163452, "learning_rate": 0.00010395895584262696, "loss": 0.2026, "step": 4821 }, { "epoch": 0.9759158065168995, "grad_norm": 0.2526836395263672, "learning_rate": 0.00010392717342434157, "loss": 0.2056, "step": 4822 }, { "epoch": 0.9761181946974297, "grad_norm": 0.2706577777862549, "learning_rate": 0.00010389539060874598, "loss": 0.2054, "step": 4823 }, { "epoch": 0.97632058287796, "grad_norm": 0.37094250321388245, "learning_rate": 0.00010386360739905564, "loss": 0.2334, "step": 4824 }, { "epoch": 0.9765229710584902, "grad_norm": 0.4301499128341675, "learning_rate": 0.00010383182379848607, "loss": 0.2602, "step": 4825 }, { "epoch": 0.9767253592390205, "grad_norm": 0.34220004081726074, 
"learning_rate": 0.00010380003981025273, "loss": 0.2503, "step": 4826 }, { "epoch": 0.9769277474195507, "grad_norm": 0.2496861219406128, "learning_rate": 0.00010376825543757127, "loss": 0.1857, "step": 4827 }, { "epoch": 0.977130135600081, "grad_norm": 0.31023451685905457, "learning_rate": 0.00010373647068365724, "loss": 0.2146, "step": 4828 }, { "epoch": 0.9773325237806112, "grad_norm": 0.27009859681129456, "learning_rate": 0.00010370468555172632, "loss": 0.2307, "step": 4829 }, { "epoch": 0.9775349119611415, "grad_norm": 0.2698228061199188, "learning_rate": 0.00010367290004499419, "loss": 0.2293, "step": 4830 }, { "epoch": 0.9777373001416717, "grad_norm": 0.29861339926719666, "learning_rate": 0.00010364111416667659, "loss": 0.2261, "step": 4831 }, { "epoch": 0.977939688322202, "grad_norm": 0.28584665060043335, "learning_rate": 0.00010360932791998925, "loss": 0.2301, "step": 4832 }, { "epoch": 0.9781420765027322, "grad_norm": 0.303874135017395, "learning_rate": 0.00010357754130814798, "loss": 0.2446, "step": 4833 }, { "epoch": 0.9783444646832625, "grad_norm": 0.31630566716194153, "learning_rate": 0.00010354575433436862, "loss": 0.2516, "step": 4834 }, { "epoch": 0.9785468528637927, "grad_norm": 0.29177409410476685, "learning_rate": 0.00010351396700186705, "loss": 0.2596, "step": 4835 }, { "epoch": 0.978749241044323, "grad_norm": 0.2889035940170288, "learning_rate": 0.00010348217931385915, "loss": 0.2329, "step": 4836 }, { "epoch": 0.9789516292248532, "grad_norm": 0.3667570948600769, "learning_rate": 0.00010345039127356091, "loss": 0.2476, "step": 4837 }, { "epoch": 0.9791540174053835, "grad_norm": 0.280477911233902, "learning_rate": 0.00010341860288418827, "loss": 0.2146, "step": 4838 }, { "epoch": 0.9793564055859137, "grad_norm": 0.27934882044792175, "learning_rate": 0.00010338681414895725, "loss": 0.2283, "step": 4839 }, { "epoch": 0.9795587937664441, "grad_norm": 0.3026507496833801, "learning_rate": 0.00010335502507108396, "loss": 0.2529, "step": 4840 }, { 
"epoch": 0.9797611819469743, "grad_norm": 0.24515746533870697, "learning_rate": 0.00010332323565378441, "loss": 0.2388, "step": 4841 }, { "epoch": 0.9799635701275046, "grad_norm": 0.2657843232154846, "learning_rate": 0.00010329144590027474, "loss": 0.2515, "step": 4842 }, { "epoch": 0.9801659583080348, "grad_norm": 0.3656042516231537, "learning_rate": 0.00010325965581377111, "loss": 0.2654, "step": 4843 }, { "epoch": 0.9803683464885651, "grad_norm": 0.3170849680900574, "learning_rate": 0.00010322786539748972, "loss": 0.2549, "step": 4844 }, { "epoch": 0.9805707346690953, "grad_norm": 0.30992385745048523, "learning_rate": 0.00010319607465464676, "loss": 0.246, "step": 4845 }, { "epoch": 0.9807731228496256, "grad_norm": 0.26733100414276123, "learning_rate": 0.0001031642835884585, "loss": 0.2385, "step": 4846 }, { "epoch": 0.9809755110301558, "grad_norm": 0.34407472610473633, "learning_rate": 0.00010313249220214126, "loss": 0.2372, "step": 4847 }, { "epoch": 0.9811778992106861, "grad_norm": 0.35374516248703003, "learning_rate": 0.00010310070049891129, "loss": 0.2311, "step": 4848 }, { "epoch": 0.9813802873912163, "grad_norm": 0.2595251500606537, "learning_rate": 0.000103068908481985, "loss": 0.1729, "step": 4849 }, { "epoch": 0.9815826755717466, "grad_norm": 0.2830277383327484, "learning_rate": 0.00010303711615457876, "loss": 0.2336, "step": 4850 }, { "epoch": 0.9815826755717466, "eval_loss": 0.26844725012779236, "eval_runtime": 0.7388, "eval_samples_per_second": 6.768, "eval_steps_per_second": 1.354, "step": 4850 }, { "epoch": 0.9817850637522769, "grad_norm": 0.2586294114589691, "learning_rate": 0.00010300532351990899, "loss": 0.2333, "step": 4851 }, { "epoch": 0.9819874519328071, "grad_norm": 0.3211621344089508, "learning_rate": 0.00010297353058119208, "loss": 0.2247, "step": 4852 }, { "epoch": 0.9821898401133374, "grad_norm": 0.2804816961288452, "learning_rate": 0.00010294173734164456, "loss": 0.2364, "step": 4853 }, { "epoch": 0.9823922282938676, "grad_norm": 
0.516215980052948, "learning_rate": 0.00010290994380448293, "loss": 0.217, "step": 4854 }, { "epoch": 0.9825946164743979, "grad_norm": 0.43761691451072693, "learning_rate": 0.00010287814997292369, "loss": 0.2468, "step": 4855 }, { "epoch": 0.9827970046549281, "grad_norm": 0.2564006745815277, "learning_rate": 0.00010284635585018348, "loss": 0.2195, "step": 4856 }, { "epoch": 0.9829993928354585, "grad_norm": 0.23958547413349152, "learning_rate": 0.0001028145614394788, "loss": 0.1811, "step": 4857 }, { "epoch": 0.9832017810159887, "grad_norm": 0.28001734614372253, "learning_rate": 0.00010278276674402638, "loss": 0.2325, "step": 4858 }, { "epoch": 0.983404169196519, "grad_norm": 0.30932843685150146, "learning_rate": 0.00010275097176704277, "loss": 0.2612, "step": 4859 }, { "epoch": 0.9836065573770492, "grad_norm": 0.26031824946403503, "learning_rate": 0.00010271917651174475, "loss": 0.2253, "step": 4860 }, { "epoch": 0.9838089455575795, "grad_norm": 0.35603246092796326, "learning_rate": 0.00010268738098134895, "loss": 0.2432, "step": 4861 }, { "epoch": 0.9840113337381097, "grad_norm": 0.3286120891571045, "learning_rate": 0.00010265558517907216, "loss": 0.2778, "step": 4862 }, { "epoch": 0.98421372191864, "grad_norm": 0.38171565532684326, "learning_rate": 0.00010262378910813116, "loss": 0.2569, "step": 4863 }, { "epoch": 0.9844161100991702, "grad_norm": 0.27509549260139465, "learning_rate": 0.00010259199277174266, "loss": 0.2217, "step": 4864 }, { "epoch": 0.9846184982797005, "grad_norm": 0.32143381237983704, "learning_rate": 0.00010256019617312353, "loss": 0.2705, "step": 4865 }, { "epoch": 0.9848208864602307, "grad_norm": 0.28621792793273926, "learning_rate": 0.00010252839931549063, "loss": 0.2186, "step": 4866 }, { "epoch": 0.985023274640761, "grad_norm": 0.26840049028396606, "learning_rate": 0.0001024966022020608, "loss": 0.2089, "step": 4867 }, { "epoch": 0.9852256628212912, "grad_norm": 0.3102000057697296, "learning_rate": 0.00010246480483605097, "loss": 0.2216, 
"step": 4868 }, { "epoch": 0.9854280510018215, "grad_norm": 0.28217270970344543, "learning_rate": 0.00010243300722067806, "loss": 0.2367, "step": 4869 }, { "epoch": 0.9856304391823517, "grad_norm": 0.2826128602027893, "learning_rate": 0.00010240120935915898, "loss": 0.2139, "step": 4870 }, { "epoch": 0.985832827362882, "grad_norm": 0.3096916675567627, "learning_rate": 0.00010236941125471076, "loss": 0.2775, "step": 4871 }, { "epoch": 0.9860352155434122, "grad_norm": 0.27654576301574707, "learning_rate": 0.00010233761291055035, "loss": 0.2395, "step": 4872 }, { "epoch": 0.9862376037239425, "grad_norm": 0.2547740340232849, "learning_rate": 0.0001023058143298948, "loss": 0.22, "step": 4873 }, { "epoch": 0.9864399919044727, "grad_norm": 0.4360198974609375, "learning_rate": 0.00010227401551596116, "loss": 0.2752, "step": 4874 }, { "epoch": 0.986642380085003, "grad_norm": 0.30171704292297363, "learning_rate": 0.0001022422164719665, "loss": 0.2812, "step": 4875 }, { "epoch": 0.9868447682655332, "grad_norm": 0.3533839285373688, "learning_rate": 0.00010221041720112789, "loss": 0.2314, "step": 4876 }, { "epoch": 0.9870471564460636, "grad_norm": 0.24221685528755188, "learning_rate": 0.00010217861770666246, "loss": 0.2042, "step": 4877 }, { "epoch": 0.9872495446265938, "grad_norm": 0.35867074131965637, "learning_rate": 0.00010214681799178736, "loss": 0.2592, "step": 4878 }, { "epoch": 0.9874519328071241, "grad_norm": 0.24381330609321594, "learning_rate": 0.00010211501805971973, "loss": 0.2275, "step": 4879 }, { "epoch": 0.9876543209876543, "grad_norm": 0.2809392809867859, "learning_rate": 0.00010208321791367676, "loss": 0.2372, "step": 4880 }, { "epoch": 0.9878567091681846, "grad_norm": 0.3226775825023651, "learning_rate": 0.00010205141755687566, "loss": 0.2624, "step": 4881 }, { "epoch": 0.9880590973487149, "grad_norm": 0.27383941411972046, "learning_rate": 0.00010201961699253366, "loss": 0.237, "step": 4882 }, { "epoch": 0.9882614855292451, "grad_norm": 0.35411977767944336, 
"learning_rate": 0.00010198781622386802, "loss": 0.2432, "step": 4883 }, { "epoch": 0.9884638737097754, "grad_norm": 0.27126365900039673, "learning_rate": 0.00010195601525409594, "loss": 0.219, "step": 4884 }, { "epoch": 0.9886662618903056, "grad_norm": 0.24518761038780212, "learning_rate": 0.00010192421408643484, "loss": 0.2377, "step": 4885 }, { "epoch": 0.9888686500708359, "grad_norm": 0.2346014827489853, "learning_rate": 0.0001018924127241019, "loss": 0.1761, "step": 4886 }, { "epoch": 0.9890710382513661, "grad_norm": 0.29043862223625183, "learning_rate": 0.00010186061117031452, "loss": 0.2313, "step": 4887 }, { "epoch": 0.9892734264318964, "grad_norm": 0.2963557839393616, "learning_rate": 0.00010182880942829001, "loss": 0.2298, "step": 4888 }, { "epoch": 0.9894758146124266, "grad_norm": 0.261079341173172, "learning_rate": 0.00010179700750124576, "loss": 0.233, "step": 4889 }, { "epoch": 0.9896782027929569, "grad_norm": 0.30412137508392334, "learning_rate": 0.00010176520539239913, "loss": 0.2331, "step": 4890 }, { "epoch": 0.9898805909734871, "grad_norm": 0.24819315969944, "learning_rate": 0.00010173340310496757, "loss": 0.2242, "step": 4891 }, { "epoch": 0.9900829791540174, "grad_norm": 0.2694309651851654, "learning_rate": 0.00010170160064216844, "loss": 0.2094, "step": 4892 }, { "epoch": 0.9902853673345476, "grad_norm": 0.3118360936641693, "learning_rate": 0.00010166979800721923, "loss": 0.2264, "step": 4893 }, { "epoch": 0.990487755515078, "grad_norm": 0.3211439251899719, "learning_rate": 0.00010163799520333739, "loss": 0.2413, "step": 4894 }, { "epoch": 0.9906901436956081, "grad_norm": 0.3218715786933899, "learning_rate": 0.00010160619223374035, "loss": 0.2514, "step": 4895 }, { "epoch": 0.9908925318761385, "grad_norm": 0.25901928544044495, "learning_rate": 0.00010157438910164568, "loss": 0.195, "step": 4896 }, { "epoch": 0.9910949200566687, "grad_norm": 0.2728163003921509, "learning_rate": 0.0001015425858102708, "loss": 0.2343, "step": 4897 }, { "epoch": 
0.991297308237199, "grad_norm": 0.2993209660053253, "learning_rate": 0.00010151078236283331, "loss": 0.2104, "step": 4898 }, { "epoch": 0.9914996964177292, "grad_norm": 0.2572130560874939, "learning_rate": 0.00010147897876255068, "loss": 0.2327, "step": 4899 }, { "epoch": 0.9917020845982595, "grad_norm": 0.2839822471141815, "learning_rate": 0.00010144717501264052, "loss": 0.2537, "step": 4900 }, { "epoch": 0.9917020845982595, "eval_loss": 0.2662275433540344, "eval_runtime": 0.7372, "eval_samples_per_second": 6.782, "eval_steps_per_second": 1.356, "step": 4900 }, { "epoch": 0.9919044727787897, "grad_norm": 0.3022128939628601, "learning_rate": 0.00010141537111632036, "loss": 0.2642, "step": 4901 }, { "epoch": 0.99210686095932, "grad_norm": 0.34551921486854553, "learning_rate": 0.00010138356707680778, "loss": 0.2149, "step": 4902 }, { "epoch": 0.9923092491398502, "grad_norm": 0.27679723501205444, "learning_rate": 0.00010135176289732044, "loss": 0.216, "step": 4903 }, { "epoch": 0.9925116373203805, "grad_norm": 0.2436773031949997, "learning_rate": 0.00010131995858107591, "loss": 0.2195, "step": 4904 }, { "epoch": 0.9927140255009107, "grad_norm": 0.2481018751859665, "learning_rate": 0.0001012881541312918, "loss": 0.2017, "step": 4905 }, { "epoch": 0.992916413681441, "grad_norm": 0.26413610577583313, "learning_rate": 0.00010125634955118579, "loss": 0.207, "step": 4906 }, { "epoch": 0.9931188018619712, "grad_norm": 0.2637857496738434, "learning_rate": 0.0001012245448439755, "loss": 0.2297, "step": 4907 }, { "epoch": 0.9933211900425015, "grad_norm": 0.264993816614151, "learning_rate": 0.00010119274001287861, "loss": 0.2246, "step": 4908 }, { "epoch": 0.9935235782230317, "grad_norm": 0.3293372690677643, "learning_rate": 0.00010116093506111282, "loss": 0.2485, "step": 4909 }, { "epoch": 0.993725966403562, "grad_norm": 0.6976776719093323, "learning_rate": 0.00010112912999189579, "loss": 0.2681, "step": 4910 }, { "epoch": 0.9939283545840923, "grad_norm": 0.2869125008583069, 
"learning_rate": 0.00010109732480844525, "loss": 0.1983, "step": 4911 }, { "epoch": 0.9941307427646225, "grad_norm": 0.2749207019805908, "learning_rate": 0.00010106551951397887, "loss": 0.2203, "step": 4912 }, { "epoch": 0.9943331309451529, "grad_norm": 0.3956700265407562, "learning_rate": 0.00010103371411171443, "loss": 0.2762, "step": 4913 }, { "epoch": 0.994535519125683, "grad_norm": 0.29219114780426025, "learning_rate": 0.00010100190860486964, "loss": 0.2497, "step": 4914 }, { "epoch": 0.9947379073062134, "grad_norm": 0.3515431582927704, "learning_rate": 0.00010097010299666226, "loss": 0.2287, "step": 4915 }, { "epoch": 0.9949402954867436, "grad_norm": 0.3369796872138977, "learning_rate": 0.00010093829729031002, "loss": 0.2184, "step": 4916 }, { "epoch": 0.9951426836672739, "grad_norm": 0.2912400960922241, "learning_rate": 0.00010090649148903071, "loss": 0.1893, "step": 4917 }, { "epoch": 0.9953450718478041, "grad_norm": 0.24918058514595032, "learning_rate": 0.00010087468559604212, "loss": 0.2059, "step": 4918 }, { "epoch": 0.9955474600283344, "grad_norm": 0.35527414083480835, "learning_rate": 0.000100842879614562, "loss": 0.2088, "step": 4919 }, { "epoch": 0.9957498482088646, "grad_norm": 0.2517155408859253, "learning_rate": 0.00010081107354780816, "loss": 0.2061, "step": 4920 }, { "epoch": 0.9959522363893949, "grad_norm": 0.3073742985725403, "learning_rate": 0.00010077926739899842, "loss": 0.2299, "step": 4921 }, { "epoch": 0.9961546245699251, "grad_norm": 0.2773192226886749, "learning_rate": 0.00010074746117135057, "loss": 0.2428, "step": 4922 }, { "epoch": 0.9963570127504554, "grad_norm": 0.33836686611175537, "learning_rate": 0.00010071565486808245, "loss": 0.2654, "step": 4923 }, { "epoch": 0.9965594009309856, "grad_norm": 0.2811589539051056, "learning_rate": 0.00010068384849241188, "loss": 0.2649, "step": 4924 }, { "epoch": 0.9967617891115159, "grad_norm": 0.2710583508014679, "learning_rate": 0.00010065204204755669, "loss": 0.2459, "step": 4925 }, { 
"epoch": 0.9969641772920461, "grad_norm": 0.3035294711589813, "learning_rate": 0.00010062023553673474, "loss": 0.2342, "step": 4926 }, { "epoch": 0.9971665654725764, "grad_norm": 0.28561583161354065, "learning_rate": 0.00010058842896316385, "loss": 0.2556, "step": 4927 }, { "epoch": 0.9973689536531066, "grad_norm": 0.26396119594573975, "learning_rate": 0.00010055662233006192, "loss": 0.2304, "step": 4928 }, { "epoch": 0.9975713418336369, "grad_norm": 0.2924419343471527, "learning_rate": 0.00010052481564064678, "loss": 0.2298, "step": 4929 }, { "epoch": 0.9977737300141671, "grad_norm": 0.27983298897743225, "learning_rate": 0.00010049300889813627, "loss": 0.2354, "step": 4930 }, { "epoch": 0.9979761181946974, "grad_norm": 0.23018218576908112, "learning_rate": 0.00010046120210574827, "loss": 0.202, "step": 4931 }, { "epoch": 0.9981785063752276, "grad_norm": 0.22712524235248566, "learning_rate": 0.00010042939526670071, "loss": 0.2301, "step": 4932 }, { "epoch": 0.998380894555758, "grad_norm": 0.26420795917510986, "learning_rate": 0.00010039758838421148, "loss": 0.2161, "step": 4933 }, { "epoch": 0.9985832827362882, "grad_norm": 0.31919482350349426, "learning_rate": 0.00010036578146149838, "loss": 0.2308, "step": 4934 }, { "epoch": 0.9987856709168185, "grad_norm": 0.28096866607666016, "learning_rate": 0.00010033397450177936, "loss": 0.226, "step": 4935 }, { "epoch": 0.9989880590973487, "grad_norm": 0.26480206847190857, "learning_rate": 0.00010030216750827232, "loss": 0.2491, "step": 4936 }, { "epoch": 0.999190447277879, "grad_norm": 0.42638319730758667, "learning_rate": 0.00010027036048419513, "loss": 0.2322, "step": 4937 }, { "epoch": 0.9993928354584092, "grad_norm": 0.3904888927936554, "learning_rate": 0.00010023855343276572, "loss": 0.2139, "step": 4938 }, { "epoch": 0.9995952236389395, "grad_norm": 0.27836623787879944, "learning_rate": 0.00010020674635720195, "loss": 0.222, "step": 4939 }, { "epoch": 0.9997976118194697, "grad_norm": 0.2650602459907532, 
"learning_rate": 0.00010017493926072179, "loss": 0.1917, "step": 4940 }, { "epoch": 1.0, "grad_norm": 0.33457136154174805, "learning_rate": 0.00010014313214654309, "loss": 0.2885, "step": 4941 }, { "epoch": 1.0002023881805302, "grad_norm": 0.47695091366767883, "learning_rate": 0.00010011132501788379, "loss": 0.1844, "step": 4942 }, { "epoch": 1.0004047763610606, "grad_norm": 0.28054702281951904, "learning_rate": 0.00010007951787796178, "loss": 0.2, "step": 4943 }, { "epoch": 1.0006071645415908, "grad_norm": 0.215849831700325, "learning_rate": 0.000100047710729995, "loss": 0.1708, "step": 4944 }, { "epoch": 1.000809552722121, "grad_norm": 0.48324742913246155, "learning_rate": 0.00010001590357720133, "loss": 0.1951, "step": 4945 }, { "epoch": 1.0010119409026512, "grad_norm": 0.24173052608966827, "learning_rate": 9.99840964227987e-05, "loss": 0.2146, "step": 4946 }, { "epoch": 1.0012143290831816, "grad_norm": 0.2449575513601303, "learning_rate": 9.995228927000504e-05, "loss": 0.1778, "step": 4947 }, { "epoch": 1.0014167172637118, "grad_norm": 0.3132277727127075, "learning_rate": 9.992048212203823e-05, "loss": 0.207, "step": 4948 }, { "epoch": 1.001619105444242, "grad_norm": 0.5420759916305542, "learning_rate": 9.988867498211624e-05, "loss": 0.2027, "step": 4949 }, { "epoch": 1.0018214936247722, "grad_norm": 0.22170519828796387, "learning_rate": 9.985686785345693e-05, "loss": 0.18, "step": 4950 }, { "epoch": 1.0018214936247722, "eval_loss": 0.26427409052848816, "eval_runtime": 0.7388, "eval_samples_per_second": 6.768, "eval_steps_per_second": 1.354, "step": 4950 }, { "epoch": 1.0020238818053027, "grad_norm": 0.277170866727829, "learning_rate": 9.982506073927822e-05, "loss": 0.1889, "step": 4951 }, { "epoch": 1.0022262699858329, "grad_norm": 0.24262042343616486, "learning_rate": 9.979325364279803e-05, "loss": 0.1763, "step": 4952 }, { "epoch": 1.002428658166363, "grad_norm": 0.2713542580604553, "learning_rate": 9.976144656723429e-05, "loss": 0.2073, "step": 4953 }, { 
"epoch": 1.0026310463468933, "grad_norm": 0.27338194847106934, "learning_rate": 9.972963951580486e-05, "loss": 0.1863, "step": 4954 }, { "epoch": 1.0028334345274237, "grad_norm": 0.29679569602012634, "learning_rate": 9.969783249172767e-05, "loss": 0.2305, "step": 4955 }, { "epoch": 1.0030358227079539, "grad_norm": 0.28192949295043945, "learning_rate": 9.966602549822063e-05, "loss": 0.1952, "step": 4956 }, { "epoch": 1.003238210888484, "grad_norm": 0.2806640863418579, "learning_rate": 9.963421853850163e-05, "loss": 0.2047, "step": 4957 }, { "epoch": 1.0034405990690143, "grad_norm": 0.32658886909484863, "learning_rate": 9.960241161578855e-05, "loss": 0.2308, "step": 4958 }, { "epoch": 1.0036429872495447, "grad_norm": 0.4036096930503845, "learning_rate": 9.95706047332993e-05, "loss": 0.2399, "step": 4959 }, { "epoch": 1.003845375430075, "grad_norm": 0.2438739687204361, "learning_rate": 9.953879789425174e-05, "loss": 0.1893, "step": 4960 }, { "epoch": 1.004047763610605, "grad_norm": 0.39129194617271423, "learning_rate": 9.950699110186378e-05, "loss": 0.2041, "step": 4961 }, { "epoch": 1.0042501517911353, "grad_norm": 0.25112760066986084, "learning_rate": 9.947518435935328e-05, "loss": 0.182, "step": 4962 }, { "epoch": 1.0044525399716657, "grad_norm": 0.4767323434352875, "learning_rate": 9.944337766993812e-05, "loss": 0.2276, "step": 4963 }, { "epoch": 1.004654928152196, "grad_norm": 0.28244414925575256, "learning_rate": 9.941157103683617e-05, "loss": 0.195, "step": 4964 }, { "epoch": 1.0048573163327261, "grad_norm": 0.3557490408420563, "learning_rate": 9.937976446326529e-05, "loss": 0.2082, "step": 4965 }, { "epoch": 1.0050597045132563, "grad_norm": 0.3751201927661896, "learning_rate": 9.934795795244333e-05, "loss": 0.2105, "step": 4966 }, { "epoch": 1.0052620926937867, "grad_norm": 0.3247845470905304, "learning_rate": 9.931615150758814e-05, "loss": 0.1828, "step": 4967 }, { "epoch": 1.005464480874317, "grad_norm": 0.3247244358062744, "learning_rate": 
9.928434513191757e-05, "loss": 0.1867, "step": 4968 }, { "epoch": 1.0056668690548471, "grad_norm": 0.2909950911998749, "learning_rate": 9.925253882864944e-05, "loss": 0.2101, "step": 4969 }, { "epoch": 1.0058692572353773, "grad_norm": 0.29122984409332275, "learning_rate": 9.922073260100161e-05, "loss": 0.1965, "step": 4970 }, { "epoch": 1.0060716454159078, "grad_norm": 0.36983293294906616, "learning_rate": 9.918892645219187e-05, "loss": 0.2161, "step": 4971 }, { "epoch": 1.006274033596438, "grad_norm": 0.2427636981010437, "learning_rate": 9.915712038543803e-05, "loss": 0.1583, "step": 4972 }, { "epoch": 1.0064764217769682, "grad_norm": 0.30179157853126526, "learning_rate": 9.912531440395792e-05, "loss": 0.1955, "step": 4973 }, { "epoch": 1.0066788099574986, "grad_norm": 0.25663846731185913, "learning_rate": 9.90935085109693e-05, "loss": 0.2038, "step": 4974 }, { "epoch": 1.0068811981380288, "grad_norm": 0.2936505079269409, "learning_rate": 9.906170270968999e-05, "loss": 0.1888, "step": 4975 }, { "epoch": 1.007083586318559, "grad_norm": 0.24913620948791504, "learning_rate": 9.902989700333775e-05, "loss": 0.1602, "step": 4976 }, { "epoch": 1.0072859744990892, "grad_norm": 0.25665563344955444, "learning_rate": 9.899809139513037e-05, "loss": 0.189, "step": 4977 }, { "epoch": 1.0074883626796196, "grad_norm": 0.2616618573665619, "learning_rate": 9.896628588828557e-05, "loss": 0.1727, "step": 4978 }, { "epoch": 1.0076907508601498, "grad_norm": 0.35549241304397583, "learning_rate": 9.893448048602114e-05, "loss": 0.2156, "step": 4979 }, { "epoch": 1.00789313904068, "grad_norm": 0.30542701482772827, "learning_rate": 9.890267519155479e-05, "loss": 0.1956, "step": 4980 }, { "epoch": 1.0080955272212102, "grad_norm": 0.7851333618164062, "learning_rate": 9.887087000810424e-05, "loss": 0.2317, "step": 4981 }, { "epoch": 1.0082979154017406, "grad_norm": 0.2573649287223816, "learning_rate": 9.88390649388872e-05, "loss": 0.1782, "step": 4982 }, { "epoch": 1.0085003035822708, 
"grad_norm": 0.2208670824766159, "learning_rate": 9.880725998712141e-05, "loss": 0.1836, "step": 4983 }, { "epoch": 1.008702691762801, "grad_norm": 0.29855644702911377, "learning_rate": 9.877545515602453e-05, "loss": 0.2128, "step": 4984 }, { "epoch": 1.0089050799433312, "grad_norm": 0.24554845690727234, "learning_rate": 9.874365044881424e-05, "loss": 0.2037, "step": 4985 }, { "epoch": 1.0091074681238617, "grad_norm": 0.2785384953022003, "learning_rate": 9.871184586870822e-05, "loss": 0.2101, "step": 4986 }, { "epoch": 1.0093098563043919, "grad_norm": 0.45173802971839905, "learning_rate": 9.868004141892411e-05, "loss": 0.1869, "step": 4987 }, { "epoch": 1.009512244484922, "grad_norm": 0.2669700086116791, "learning_rate": 9.864823710267958e-05, "loss": 0.2041, "step": 4988 }, { "epoch": 1.0097146326654522, "grad_norm": 0.23057915270328522, "learning_rate": 9.861643292319223e-05, "loss": 0.1516, "step": 4989 }, { "epoch": 1.0099170208459827, "grad_norm": 0.67122882604599, "learning_rate": 9.858462888367967e-05, "loss": 0.1867, "step": 4990 }, { "epoch": 1.0101194090265129, "grad_norm": 0.4083704948425293, "learning_rate": 9.855282498735952e-05, "loss": 0.194, "step": 4991 }, { "epoch": 1.010321797207043, "grad_norm": 0.26877662539482117, "learning_rate": 9.852102123744934e-05, "loss": 0.2156, "step": 4992 }, { "epoch": 1.0105241853875733, "grad_norm": 0.27300938963890076, "learning_rate": 9.848921763716672e-05, "loss": 0.2129, "step": 4993 }, { "epoch": 1.0107265735681037, "grad_norm": 0.30437150597572327, "learning_rate": 9.845741418972921e-05, "loss": 0.2152, "step": 4994 }, { "epoch": 1.010928961748634, "grad_norm": 0.3953990340232849, "learning_rate": 9.842561089835433e-05, "loss": 0.2392, "step": 4995 }, { "epoch": 1.011131349929164, "grad_norm": 0.2798847556114197, "learning_rate": 9.839380776625963e-05, "loss": 0.1938, "step": 4996 }, { "epoch": 1.0113337381096943, "grad_norm": 0.3126393258571625, "learning_rate": 9.836200479666262e-05, "loss": 0.2221, "step": 
4997 }, { "epoch": 1.0115361262902247, "grad_norm": 0.2901754081249237, "learning_rate": 9.833020199278075e-05, "loss": 0.2089, "step": 4998 }, { "epoch": 1.011738514470755, "grad_norm": 0.6395556330680847, "learning_rate": 9.829839935783155e-05, "loss": 0.213, "step": 4999 }, { "epoch": 1.0119409026512851, "grad_norm": 0.3289041519165039, "learning_rate": 9.826659689503244e-05, "loss": 0.1602, "step": 5000 }, { "epoch": 1.0119409026512851, "eval_loss": 0.2699425518512726, "eval_runtime": 0.7372, "eval_samples_per_second": 6.783, "eval_steps_per_second": 1.357, "step": 5000 }, { "epoch": 1.0121432908318153, "grad_norm": 0.2821512222290039, "learning_rate": 9.823479460760085e-05, "loss": 0.2042, "step": 5001 }, { "epoch": 1.0123456790123457, "grad_norm": 0.3883262276649475, "learning_rate": 9.820299249875429e-05, "loss": 0.194, "step": 5002 }, { "epoch": 1.012548067192876, "grad_norm": 0.30730289220809937, "learning_rate": 9.817119057171003e-05, "loss": 0.1453, "step": 5003 }, { "epoch": 1.0127504553734061, "grad_norm": 0.26515355706214905, "learning_rate": 9.813938882968552e-05, "loss": 0.1912, "step": 5004 }, { "epoch": 1.0129528435539366, "grad_norm": 0.2594118118286133, "learning_rate": 9.810758727589813e-05, "loss": 0.1755, "step": 5005 }, { "epoch": 1.0131552317344668, "grad_norm": 0.27748867869377136, "learning_rate": 9.80757859135652e-05, "loss": 0.2065, "step": 5006 }, { "epoch": 1.013357619914997, "grad_norm": 0.29483193159103394, "learning_rate": 9.804398474590407e-05, "loss": 0.2022, "step": 5007 }, { "epoch": 1.0135600080955272, "grad_norm": 0.2991485297679901, "learning_rate": 9.8012183776132e-05, "loss": 0.2201, "step": 5008 }, { "epoch": 1.0137623962760576, "grad_norm": 0.3728349506855011, "learning_rate": 9.798038300746635e-05, "loss": 0.1912, "step": 5009 }, { "epoch": 1.0139647844565878, "grad_norm": 0.32748380303382874, "learning_rate": 9.794858244312436e-05, "loss": 0.1714, "step": 5010 }, { "epoch": 1.014167172637118, "grad_norm": 
0.2615242898464203, "learning_rate": 9.791678208632326e-05, "loss": 0.199, "step": 5011 }, { "epoch": 1.0143695608176482, "grad_norm": 0.28860002756118774, "learning_rate": 9.788498194028031e-05, "loss": 0.1711, "step": 5012 }, { "epoch": 1.0145719489981786, "grad_norm": 0.2724457383155823, "learning_rate": 9.785318200821267e-05, "loss": 0.1964, "step": 5013 }, { "epoch": 1.0147743371787088, "grad_norm": 0.28530627489089966, "learning_rate": 9.782138229333755e-05, "loss": 0.1776, "step": 5014 }, { "epoch": 1.014976725359239, "grad_norm": 0.2859584093093872, "learning_rate": 9.778958279887213e-05, "loss": 0.1822, "step": 5015 }, { "epoch": 1.0151791135397692, "grad_norm": 0.33056363463401794, "learning_rate": 9.775778352803352e-05, "loss": 0.1731, "step": 5016 }, { "epoch": 1.0153815017202996, "grad_norm": 0.3300340175628662, "learning_rate": 9.772598448403885e-05, "loss": 0.1751, "step": 5017 }, { "epoch": 1.0155838899008298, "grad_norm": 0.3166051506996155, "learning_rate": 9.76941856701052e-05, "loss": 0.2012, "step": 5018 }, { "epoch": 1.01578627808136, "grad_norm": 0.26906871795654297, "learning_rate": 9.766238708944965e-05, "loss": 0.1994, "step": 5019 }, { "epoch": 1.0159886662618902, "grad_norm": 0.274949848651886, "learning_rate": 9.763058874528925e-05, "loss": 0.212, "step": 5020 }, { "epoch": 1.0161910544424206, "grad_norm": 0.31172770261764526, "learning_rate": 9.759879064084102e-05, "loss": 0.2387, "step": 5021 }, { "epoch": 1.0163934426229508, "grad_norm": 0.3426840603351593, "learning_rate": 9.756699277932195e-05, "loss": 0.2097, "step": 5022 }, { "epoch": 1.016595830803481, "grad_norm": 0.48346537351608276, "learning_rate": 9.753519516394903e-05, "loss": 0.1998, "step": 5023 }, { "epoch": 1.0167982189840112, "grad_norm": 0.3097973167896271, "learning_rate": 9.750339779793923e-05, "loss": 0.2075, "step": 5024 }, { "epoch": 1.0170006071645417, "grad_norm": 0.2547002136707306, "learning_rate": 9.74716006845094e-05, "loss": 0.187, "step": 5025 }, { 
"epoch": 1.0172029953450719, "grad_norm": 0.26773279905319214, "learning_rate": 9.743980382687651e-05, "loss": 0.1875, "step": 5026 }, { "epoch": 1.017405383525602, "grad_norm": 0.25491225719451904, "learning_rate": 9.74080072282574e-05, "loss": 0.1675, "step": 5027 }, { "epoch": 1.0176077717061323, "grad_norm": 0.35699644684791565, "learning_rate": 9.73762108918689e-05, "loss": 0.1824, "step": 5028 }, { "epoch": 1.0178101598866627, "grad_norm": 0.29440072178840637, "learning_rate": 9.734441482092786e-05, "loss": 0.1621, "step": 5029 }, { "epoch": 1.0180125480671929, "grad_norm": 0.3017171323299408, "learning_rate": 9.731261901865107e-05, "loss": 0.1913, "step": 5030 }, { "epoch": 1.018214936247723, "grad_norm": 0.3333732783794403, "learning_rate": 9.728082348825526e-05, "loss": 0.2166, "step": 5031 }, { "epoch": 1.0184173244282535, "grad_norm": 0.3623688519001007, "learning_rate": 9.724902823295724e-05, "loss": 0.2236, "step": 5032 }, { "epoch": 1.0186197126087837, "grad_norm": 0.2425580769777298, "learning_rate": 9.721723325597365e-05, "loss": 0.191, "step": 5033 }, { "epoch": 1.018822100789314, "grad_norm": 0.23994003236293793, "learning_rate": 9.71854385605212e-05, "loss": 0.217, "step": 5034 }, { "epoch": 1.019024488969844, "grad_norm": 0.26265624165534973, "learning_rate": 9.715364414981656e-05, "loss": 0.2137, "step": 5035 }, { "epoch": 1.0192268771503745, "grad_norm": 0.26903507113456726, "learning_rate": 9.712185002707634e-05, "loss": 0.1842, "step": 5036 }, { "epoch": 1.0194292653309047, "grad_norm": 0.30898502469062805, "learning_rate": 9.709005619551709e-05, "loss": 0.1837, "step": 5037 }, { "epoch": 1.019631653511435, "grad_norm": 0.3141460418701172, "learning_rate": 9.705826265835547e-05, "loss": 0.1913, "step": 5038 }, { "epoch": 1.0198340416919651, "grad_norm": 0.29945555329322815, "learning_rate": 9.702646941880794e-05, "loss": 0.2066, "step": 5039 }, { "epoch": 1.0200364298724955, "grad_norm": 0.2980521023273468, "learning_rate": 
9.699467648009105e-05, "loss": 0.1909, "step": 5040 }, { "epoch": 1.0202388180530257, "grad_norm": 0.31508907675743103, "learning_rate": 9.696288384542125e-05, "loss": 0.2089, "step": 5041 }, { "epoch": 1.020441206233556, "grad_norm": 0.29253244400024414, "learning_rate": 9.693109151801499e-05, "loss": 0.1968, "step": 5042 }, { "epoch": 1.0206435944140861, "grad_norm": 0.2746085226535797, "learning_rate": 9.68992995010887e-05, "loss": 0.1936, "step": 5043 }, { "epoch": 1.0208459825946166, "grad_norm": 0.3697469234466553, "learning_rate": 9.686750779785875e-05, "loss": 0.193, "step": 5044 }, { "epoch": 1.0210483707751468, "grad_norm": 0.3167676031589508, "learning_rate": 9.683571641154149e-05, "loss": 0.1672, "step": 5045 }, { "epoch": 1.021250758955677, "grad_norm": 0.3159421384334564, "learning_rate": 9.680392534535328e-05, "loss": 0.2296, "step": 5046 }, { "epoch": 1.0214531471362072, "grad_norm": 0.2636842429637909, "learning_rate": 9.677213460251033e-05, "loss": 0.1887, "step": 5047 }, { "epoch": 1.0216555353167376, "grad_norm": 0.2759999930858612, "learning_rate": 9.674034418622894e-05, "loss": 0.1876, "step": 5048 }, { "epoch": 1.0218579234972678, "grad_norm": 0.3974986970424652, "learning_rate": 9.67085540997253e-05, "loss": 0.2078, "step": 5049 }, { "epoch": 1.022060311677798, "grad_norm": 0.336736798286438, "learning_rate": 9.667676434621564e-05, "loss": 0.1857, "step": 5050 }, { "epoch": 1.022060311677798, "eval_loss": 0.27779608964920044, "eval_runtime": 0.7405, "eval_samples_per_second": 6.753, "eval_steps_per_second": 1.351, "step": 5050 }, { "epoch": 1.0222626998583282, "grad_norm": 0.27973130345344543, "learning_rate": 9.664497492891607e-05, "loss": 0.2119, "step": 5051 }, { "epoch": 1.0224650880388586, "grad_norm": 0.3350655734539032, "learning_rate": 9.661318585104276e-05, "loss": 0.1885, "step": 5052 }, { "epoch": 1.0226674762193888, "grad_norm": 0.22865338623523712, "learning_rate": 9.658139711581175e-05, "loss": 0.1665, "step": 5053 }, { 
"epoch": 1.022869864399919, "grad_norm": 0.3354731500148773, "learning_rate": 9.654960872643913e-05, "loss": 0.2064, "step": 5054 }, { "epoch": 1.0230722525804492, "grad_norm": 0.24740594625473022, "learning_rate": 9.651782068614087e-05, "loss": 0.1743, "step": 5055 }, { "epoch": 1.0232746407609796, "grad_norm": 0.2912404537200928, "learning_rate": 9.648603299813298e-05, "loss": 0.1991, "step": 5056 }, { "epoch": 1.0234770289415098, "grad_norm": 0.3296900689601898, "learning_rate": 9.64542456656314e-05, "loss": 0.2185, "step": 5057 }, { "epoch": 1.02367941712204, "grad_norm": 0.2634105682373047, "learning_rate": 9.642245869185204e-05, "loss": 0.1859, "step": 5058 }, { "epoch": 1.0238818053025702, "grad_norm": 0.311507910490036, "learning_rate": 9.639067208001077e-05, "loss": 0.2318, "step": 5059 }, { "epoch": 1.0240841934831006, "grad_norm": 0.3055361211299896, "learning_rate": 9.635888583332344e-05, "loss": 0.2004, "step": 5060 }, { "epoch": 1.0242865816636308, "grad_norm": 0.276246577501297, "learning_rate": 9.632709995500583e-05, "loss": 0.2175, "step": 5061 }, { "epoch": 1.024488969844161, "grad_norm": 0.2705673277378082, "learning_rate": 9.629531444827369e-05, "loss": 0.1903, "step": 5062 }, { "epoch": 1.0246913580246915, "grad_norm": 0.322270005941391, "learning_rate": 9.626352931634279e-05, "loss": 0.2274, "step": 5063 }, { "epoch": 1.0248937462052217, "grad_norm": 0.2642096281051636, "learning_rate": 9.623174456242875e-05, "loss": 0.1844, "step": 5064 }, { "epoch": 1.0250961343857519, "grad_norm": 0.285813570022583, "learning_rate": 9.619996018974728e-05, "loss": 0.1688, "step": 5065 }, { "epoch": 1.025298522566282, "grad_norm": 0.2848520874977112, "learning_rate": 9.616817620151394e-05, "loss": 0.2199, "step": 5066 }, { "epoch": 1.0255009107468125, "grad_norm": 0.3013867139816284, "learning_rate": 9.613639260094436e-05, "loss": 0.197, "step": 5067 }, { "epoch": 1.0257032989273427, "grad_norm": 0.30691197514533997, "learning_rate": 9.610460939125407e-05, 
"loss": 0.1734, "step": 5068 }, { "epoch": 1.0259056871078729, "grad_norm": 0.26753026247024536, "learning_rate": 9.607282657565848e-05, "loss": 0.1984, "step": 5069 }, { "epoch": 1.026108075288403, "grad_norm": 0.300430029630661, "learning_rate": 9.604104415737308e-05, "loss": 0.2048, "step": 5070 }, { "epoch": 1.0263104634689335, "grad_norm": 0.3072032034397125, "learning_rate": 9.60092621396133e-05, "loss": 0.2279, "step": 5071 }, { "epoch": 1.0265128516494637, "grad_norm": 0.2822709083557129, "learning_rate": 9.597748052559451e-05, "loss": 0.2083, "step": 5072 }, { "epoch": 1.026715239829994, "grad_norm": 0.29260656237602234, "learning_rate": 9.594569931853203e-05, "loss": 0.2037, "step": 5073 }, { "epoch": 1.026917628010524, "grad_norm": 0.291266530752182, "learning_rate": 9.591391852164114e-05, "loss": 0.1882, "step": 5074 }, { "epoch": 1.0271200161910545, "grad_norm": 0.3024618327617645, "learning_rate": 9.58821381381371e-05, "loss": 0.2165, "step": 5075 }, { "epoch": 1.0273224043715847, "grad_norm": 0.2564701437950134, "learning_rate": 9.585035817123513e-05, "loss": 0.1867, "step": 5076 }, { "epoch": 1.027524792552115, "grad_norm": 0.28993088006973267, "learning_rate": 9.581857862415037e-05, "loss": 0.22, "step": 5077 }, { "epoch": 1.0277271807326451, "grad_norm": 0.386116087436676, "learning_rate": 9.578679950009794e-05, "loss": 0.2035, "step": 5078 }, { "epoch": 1.0279295689131756, "grad_norm": 1.0133898258209229, "learning_rate": 9.575502080229295e-05, "loss": 0.2139, "step": 5079 }, { "epoch": 1.0281319570937058, "grad_norm": 0.2750903069972992, "learning_rate": 9.57232425339504e-05, "loss": 0.1953, "step": 5080 }, { "epoch": 1.028334345274236, "grad_norm": 0.3563201129436493, "learning_rate": 9.56914646982853e-05, "loss": 0.1939, "step": 5081 }, { "epoch": 1.0285367334547661, "grad_norm": 0.2231059968471527, "learning_rate": 9.565968729851258e-05, "loss": 0.1811, "step": 5082 }, { "epoch": 1.0287391216352966, "grad_norm": 0.2834724187850952, 
"learning_rate": 9.562791033784718e-05, "loss": 0.2136, "step": 5083 }, { "epoch": 1.0289415098158268, "grad_norm": 0.3485587239265442, "learning_rate": 9.559613381950391e-05, "loss": 0.2326, "step": 5084 }, { "epoch": 1.029143897996357, "grad_norm": 0.31048068404197693, "learning_rate": 9.556435774669763e-05, "loss": 0.2059, "step": 5085 }, { "epoch": 1.0293462861768872, "grad_norm": 0.24974344670772552, "learning_rate": 9.553258212264308e-05, "loss": 0.1914, "step": 5086 }, { "epoch": 1.0295486743574176, "grad_norm": 0.2568418085575104, "learning_rate": 9.5500806950555e-05, "loss": 0.205, "step": 5087 }, { "epoch": 1.0297510625379478, "grad_norm": 0.36947062611579895, "learning_rate": 9.546903223364806e-05, "loss": 0.2097, "step": 5088 }, { "epoch": 1.029953450718478, "grad_norm": 0.3109140694141388, "learning_rate": 9.54372579751369e-05, "loss": 0.1712, "step": 5089 }, { "epoch": 1.0301558388990082, "grad_norm": 0.2524207532405853, "learning_rate": 9.540548417823609e-05, "loss": 0.2061, "step": 5090 }, { "epoch": 1.0303582270795386, "grad_norm": 0.6381499171257019, "learning_rate": 9.537371084616021e-05, "loss": 0.1702, "step": 5091 }, { "epoch": 1.0305606152600688, "grad_norm": 0.29690369963645935, "learning_rate": 9.53419379821237e-05, "loss": 0.1703, "step": 5092 }, { "epoch": 1.030763003440599, "grad_norm": 0.312467485666275, "learning_rate": 9.531016558934103e-05, "loss": 0.2054, "step": 5093 }, { "epoch": 1.0309653916211294, "grad_norm": 0.30818724632263184, "learning_rate": 9.527839367102661e-05, "loss": 0.1933, "step": 5094 }, { "epoch": 1.0311677798016596, "grad_norm": 0.2714020013809204, "learning_rate": 9.524662223039476e-05, "loss": 0.1909, "step": 5095 }, { "epoch": 1.0313701679821898, "grad_norm": 0.2716444730758667, "learning_rate": 9.52148512706598e-05, "loss": 0.1573, "step": 5096 }, { "epoch": 1.03157255616272, "grad_norm": 0.25801146030426025, "learning_rate": 9.518308079503599e-05, "loss": 0.1843, "step": 5097 }, { "epoch": 
1.0317749443432505, "grad_norm": 0.3397352993488312, "learning_rate": 9.515131080673751e-05, "loss": 0.2089, "step": 5098 }, { "epoch": 1.0319773325237807, "grad_norm": 0.31652987003326416, "learning_rate": 9.511954130897851e-05, "loss": 0.1899, "step": 5099 }, { "epoch": 1.0321797207043109, "grad_norm": 0.7190924286842346, "learning_rate": 9.508777230497313e-05, "loss": 0.2048, "step": 5100 }, { "epoch": 1.0321797207043109, "eval_loss": 0.2794642746448517, "eval_runtime": 0.7373, "eval_samples_per_second": 6.781, "eval_steps_per_second": 1.356, "step": 5100 }, { "epoch": 1.032382108884841, "grad_norm": 0.3107394874095917, "learning_rate": 9.50560037979354e-05, "loss": 0.2168, "step": 5101 }, { "epoch": 1.0325844970653715, "grad_norm": 0.28215718269348145, "learning_rate": 9.502423579107933e-05, "loss": 0.19, "step": 5102 }, { "epoch": 1.0327868852459017, "grad_norm": 0.2528868317604065, "learning_rate": 9.499246828761887e-05, "loss": 0.1703, "step": 5103 }, { "epoch": 1.0329892734264319, "grad_norm": 0.3218041956424713, "learning_rate": 9.496070129076793e-05, "loss": 0.2025, "step": 5104 }, { "epoch": 1.033191661606962, "grad_norm": 0.3038148880004883, "learning_rate": 9.492893480374035e-05, "loss": 0.1984, "step": 5105 }, { "epoch": 1.0333940497874925, "grad_norm": 0.9580700993537903, "learning_rate": 9.489716882974994e-05, "loss": 0.1923, "step": 5106 }, { "epoch": 1.0335964379680227, "grad_norm": 0.4019380807876587, "learning_rate": 9.486540337201046e-05, "loss": 0.1793, "step": 5107 }, { "epoch": 1.033798826148553, "grad_norm": 0.3231680989265442, "learning_rate": 9.483363843373556e-05, "loss": 0.2016, "step": 5108 }, { "epoch": 1.034001214329083, "grad_norm": 0.30073419213294983, "learning_rate": 9.480187401813893e-05, "loss": 0.173, "step": 5109 }, { "epoch": 1.0342036025096135, "grad_norm": 0.31227320432662964, "learning_rate": 9.477011012843414e-05, "loss": 0.1934, "step": 5110 }, { "epoch": 1.0344059906901437, "grad_norm": 0.3224979043006897, 
"learning_rate": 9.473834676783473e-05, "loss": 0.1954, "step": 5111 }, { "epoch": 1.034608378870674, "grad_norm": 0.2588309347629547, "learning_rate": 9.470658393955419e-05, "loss": 0.201, "step": 5112 }, { "epoch": 1.0348107670512041, "grad_norm": 0.44500279426574707, "learning_rate": 9.467482164680597e-05, "loss": 0.2162, "step": 5113 }, { "epoch": 1.0350131552317345, "grad_norm": 0.2538471817970276, "learning_rate": 9.464305989280337e-05, "loss": 0.1749, "step": 5114 }, { "epoch": 1.0352155434122647, "grad_norm": 0.41678211092948914, "learning_rate": 9.461129868075979e-05, "loss": 0.1989, "step": 5115 }, { "epoch": 1.035417931592795, "grad_norm": 0.3123410940170288, "learning_rate": 9.457953801388846e-05, "loss": 0.2083, "step": 5116 }, { "epoch": 1.0356203197733251, "grad_norm": 0.3037175238132477, "learning_rate": 9.45477778954026e-05, "loss": 0.2191, "step": 5117 }, { "epoch": 1.0358227079538556, "grad_norm": 0.2959064245223999, "learning_rate": 9.451601832851534e-05, "loss": 0.2312, "step": 5118 }, { "epoch": 1.0360250961343858, "grad_norm": 0.292764276266098, "learning_rate": 9.448425931643982e-05, "loss": 0.2208, "step": 5119 }, { "epoch": 1.036227484314916, "grad_norm": 0.3166069984436035, "learning_rate": 9.445250086238908e-05, "loss": 0.1858, "step": 5120 }, { "epoch": 1.0364298724954462, "grad_norm": 0.2685317099094391, "learning_rate": 9.442074296957607e-05, "loss": 0.2162, "step": 5121 }, { "epoch": 1.0366322606759766, "grad_norm": 0.30619198083877563, "learning_rate": 9.438898564121375e-05, "loss": 0.1973, "step": 5122 }, { "epoch": 1.0368346488565068, "grad_norm": 0.33436110615730286, "learning_rate": 9.4357228880515e-05, "loss": 0.2431, "step": 5123 }, { "epoch": 1.037037037037037, "grad_norm": 0.31361180543899536, "learning_rate": 9.432547269069261e-05, "loss": 0.2134, "step": 5124 }, { "epoch": 1.0372394252175674, "grad_norm": 0.3122353255748749, "learning_rate": 9.429371707495935e-05, "loss": 0.1999, "step": 5125 }, { "epoch": 
1.0374418133980976, "grad_norm": 0.2919948697090149, "learning_rate": 9.426196203652793e-05, "loss": 0.194, "step": 5126 }, { "epoch": 1.0376442015786278, "grad_norm": 0.33420529961586, "learning_rate": 9.423020757861097e-05, "loss": 0.2183, "step": 5127 }, { "epoch": 1.037846589759158, "grad_norm": 0.2902718186378479, "learning_rate": 9.419845370442107e-05, "loss": 0.2048, "step": 5128 }, { "epoch": 1.0380489779396884, "grad_norm": 0.3235926032066345, "learning_rate": 9.416670041717076e-05, "loss": 0.1843, "step": 5129 }, { "epoch": 1.0382513661202186, "grad_norm": 0.2709651589393616, "learning_rate": 9.413494772007248e-05, "loss": 0.1803, "step": 5130 }, { "epoch": 1.0384537543007488, "grad_norm": 0.290262371301651, "learning_rate": 9.410319561633866e-05, "loss": 0.1983, "step": 5131 }, { "epoch": 1.038656142481279, "grad_norm": 0.2669159173965454, "learning_rate": 9.407144410918163e-05, "loss": 0.2022, "step": 5132 }, { "epoch": 1.0388585306618094, "grad_norm": 0.3024842143058777, "learning_rate": 9.403969320181367e-05, "loss": 0.2085, "step": 5133 }, { "epoch": 1.0390609188423396, "grad_norm": 0.30416321754455566, "learning_rate": 9.400794289744702e-05, "loss": 0.2032, "step": 5134 }, { "epoch": 1.0392633070228698, "grad_norm": 0.270829439163208, "learning_rate": 9.397619319929385e-05, "loss": 0.2162, "step": 5135 }, { "epoch": 1.0394656952034, "grad_norm": 0.27344194054603577, "learning_rate": 9.394444411056623e-05, "loss": 0.176, "step": 5136 }, { "epoch": 1.0396680833839305, "grad_norm": 0.33522239327430725, "learning_rate": 9.391269563447622e-05, "loss": 0.2199, "step": 5137 }, { "epoch": 1.0398704715644607, "grad_norm": 0.2570640742778778, "learning_rate": 9.388094777423578e-05, "loss": 0.2067, "step": 5138 }, { "epoch": 1.0400728597449909, "grad_norm": 0.26825082302093506, "learning_rate": 9.384920053305682e-05, "loss": 0.2067, "step": 5139 }, { "epoch": 1.040275247925521, "grad_norm": 0.2695053815841675, "learning_rate": 9.381745391415125e-05, "loss": 
0.2057, "step": 5140 }, { "epoch": 1.0404776361060515, "grad_norm": 0.35252243280410767, "learning_rate": 9.37857079207308e-05, "loss": 0.1945, "step": 5141 }, { "epoch": 1.0406800242865817, "grad_norm": 0.3360033333301544, "learning_rate": 9.37539625560072e-05, "loss": 0.2287, "step": 5142 }, { "epoch": 1.0408824124671119, "grad_norm": 0.34644338488578796, "learning_rate": 9.372221782319215e-05, "loss": 0.1944, "step": 5143 }, { "epoch": 1.041084800647642, "grad_norm": 0.2628285884857178, "learning_rate": 9.369047372549723e-05, "loss": 0.1871, "step": 5144 }, { "epoch": 1.0412871888281725, "grad_norm": 0.26992303133010864, "learning_rate": 9.365873026613397e-05, "loss": 0.1871, "step": 5145 }, { "epoch": 1.0414895770087027, "grad_norm": 0.2952551245689392, "learning_rate": 9.362698744831385e-05, "loss": 0.2167, "step": 5146 }, { "epoch": 1.041691965189233, "grad_norm": 0.23648761212825775, "learning_rate": 9.359524527524825e-05, "loss": 0.1759, "step": 5147 }, { "epoch": 1.041894353369763, "grad_norm": 0.2490091472864151, "learning_rate": 9.356350375014854e-05, "loss": 0.1999, "step": 5148 }, { "epoch": 1.0420967415502935, "grad_norm": 0.25964388251304626, "learning_rate": 9.353176287622599e-05, "loss": 0.1793, "step": 5149 }, { "epoch": 1.0422991297308237, "grad_norm": 0.28901317715644836, "learning_rate": 9.350002265669179e-05, "loss": 0.204, "step": 5150 }, { "epoch": 1.0422991297308237, "eval_loss": 0.2759149372577667, "eval_runtime": 0.7382, "eval_samples_per_second": 6.774, "eval_steps_per_second": 1.355, "step": 5150 }, { "epoch": 1.042501517911354, "grad_norm": 0.27219051122665405, "learning_rate": 9.346828309475709e-05, "loss": 0.1853, "step": 5151 }, { "epoch": 1.0427039060918841, "grad_norm": 0.31162458658218384, "learning_rate": 9.343654419363298e-05, "loss": 0.2339, "step": 5152 }, { "epoch": 1.0429062942724145, "grad_norm": 0.27195677161216736, "learning_rate": 9.340480595653047e-05, "loss": 0.2015, "step": 5153 }, { "epoch": 1.0431086824529447, 
"grad_norm": 0.26143282651901245, "learning_rate": 9.337306838666047e-05, "loss": 0.1737, "step": 5154 }, { "epoch": 1.043311070633475, "grad_norm": 0.36458563804626465, "learning_rate": 9.334133148723387e-05, "loss": 0.2331, "step": 5155 }, { "epoch": 1.0435134588140054, "grad_norm": 0.30646729469299316, "learning_rate": 9.33095952614615e-05, "loss": 0.2053, "step": 5156 }, { "epoch": 1.0437158469945356, "grad_norm": 0.3230549991130829, "learning_rate": 9.327785971255413e-05, "loss": 0.2252, "step": 5157 }, { "epoch": 1.0439182351750658, "grad_norm": 0.24788984656333923, "learning_rate": 9.324612484372231e-05, "loss": 0.1804, "step": 5158 }, { "epoch": 1.044120623355596, "grad_norm": 0.2826426327228546, "learning_rate": 9.321439065817673e-05, "loss": 0.1773, "step": 5159 }, { "epoch": 1.0443230115361264, "grad_norm": 0.2371816188097, "learning_rate": 9.318265715912791e-05, "loss": 0.22, "step": 5160 }, { "epoch": 1.0445253997166566, "grad_norm": 0.3433065116405487, "learning_rate": 9.315092434978626e-05, "loss": 0.1837, "step": 5161 }, { "epoch": 1.0447277878971868, "grad_norm": 0.29012125730514526, "learning_rate": 9.311919223336225e-05, "loss": 0.2242, "step": 5162 }, { "epoch": 1.044930176077717, "grad_norm": 0.406108021736145, "learning_rate": 9.308746081306617e-05, "loss": 0.2046, "step": 5163 }, { "epoch": 1.0451325642582474, "grad_norm": 0.3668637275695801, "learning_rate": 9.305573009210827e-05, "loss": 0.1964, "step": 5164 }, { "epoch": 1.0453349524387776, "grad_norm": 0.3359313905239105, "learning_rate": 9.302400007369873e-05, "loss": 0.2094, "step": 5165 }, { "epoch": 1.0455373406193078, "grad_norm": 0.4924279451370239, "learning_rate": 9.299227076104769e-05, "loss": 0.2138, "step": 5166 }, { "epoch": 1.045739728799838, "grad_norm": 0.40203621983528137, "learning_rate": 9.296054215736514e-05, "loss": 0.1813, "step": 5167 }, { "epoch": 1.0459421169803684, "grad_norm": 0.3456156551837921, "learning_rate": 9.292881426586108e-05, "loss": 0.2265, "step": 
5168 }, { "epoch": 1.0461445051608986, "grad_norm": 0.26270124316215515, "learning_rate": 9.289708708974538e-05, "loss": 0.186, "step": 5169 }, { "epoch": 1.0463468933414288, "grad_norm": 0.28756922483444214, "learning_rate": 9.28653606322279e-05, "loss": 0.205, "step": 5170 }, { "epoch": 1.046549281521959, "grad_norm": 0.3029641807079315, "learning_rate": 9.283363489651834e-05, "loss": 0.1955, "step": 5171 }, { "epoch": 1.0467516697024895, "grad_norm": 0.3574798107147217, "learning_rate": 9.280190988582643e-05, "loss": 0.2166, "step": 5172 }, { "epoch": 1.0469540578830197, "grad_norm": 0.3494110107421875, "learning_rate": 9.277018560336174e-05, "loss": 0.1967, "step": 5173 }, { "epoch": 1.0471564460635499, "grad_norm": 0.29691949486732483, "learning_rate": 9.27384620523338e-05, "loss": 0.2357, "step": 5174 }, { "epoch": 1.04735883424408, "grad_norm": 0.19874915480613708, "learning_rate": 9.27067392359521e-05, "loss": 0.1364, "step": 5175 }, { "epoch": 1.0475612224246105, "grad_norm": 0.30270639061927795, "learning_rate": 9.267501715742598e-05, "loss": 0.1933, "step": 5176 }, { "epoch": 1.0477636106051407, "grad_norm": 0.2565094828605652, "learning_rate": 9.264329581996476e-05, "loss": 0.1948, "step": 5177 }, { "epoch": 1.0479659987856709, "grad_norm": 0.321664035320282, "learning_rate": 9.261157522677768e-05, "loss": 0.1872, "step": 5178 }, { "epoch": 1.048168386966201, "grad_norm": 0.3833097517490387, "learning_rate": 9.25798553810739e-05, "loss": 0.1946, "step": 5179 }, { "epoch": 1.0483707751467315, "grad_norm": 0.31259262561798096, "learning_rate": 9.254813628606254e-05, "loss": 0.2, "step": 5180 }, { "epoch": 1.0485731633272617, "grad_norm": 0.31232592463493347, "learning_rate": 9.251641794495251e-05, "loss": 0.2189, "step": 5181 }, { "epoch": 1.048775551507792, "grad_norm": 0.26915574073791504, "learning_rate": 9.248470036095278e-05, "loss": 0.1807, "step": 5182 }, { "epoch": 1.048977939688322, "grad_norm": 0.2748313546180725, "learning_rate": 
9.24529835372722e-05, "loss": 0.1838, "step": 5183 }, { "epoch": 1.0491803278688525, "grad_norm": 0.28373873233795166, "learning_rate": 9.242126747711958e-05, "loss": 0.2122, "step": 5184 }, { "epoch": 1.0493827160493827, "grad_norm": 0.2703567445278168, "learning_rate": 9.238955218370359e-05, "loss": 0.1792, "step": 5185 }, { "epoch": 1.049585104229913, "grad_norm": 0.30673250555992126, "learning_rate": 9.235783766023285e-05, "loss": 0.2202, "step": 5186 }, { "epoch": 1.0497874924104433, "grad_norm": 0.27138593792915344, "learning_rate": 9.232612390991591e-05, "loss": 0.2394, "step": 5187 }, { "epoch": 1.0499898805909735, "grad_norm": 0.3554273247718811, "learning_rate": 9.229441093596122e-05, "loss": 0.2146, "step": 5188 }, { "epoch": 1.0501922687715037, "grad_norm": 0.2618618309497833, "learning_rate": 9.226269874157719e-05, "loss": 0.1822, "step": 5189 }, { "epoch": 1.050394656952034, "grad_norm": 0.27396267652511597, "learning_rate": 9.223098732997208e-05, "loss": 0.1736, "step": 5190 }, { "epoch": 1.0505970451325644, "grad_norm": 0.23274348676204681, "learning_rate": 9.21992767043542e-05, "loss": 0.1799, "step": 5191 }, { "epoch": 1.0507994333130946, "grad_norm": 0.26986899971961975, "learning_rate": 9.216756686793164e-05, "loss": 0.1664, "step": 5192 }, { "epoch": 1.0510018214936248, "grad_norm": 0.2661428451538086, "learning_rate": 9.213585782391246e-05, "loss": 0.206, "step": 5193 }, { "epoch": 1.051204209674155, "grad_norm": 0.25959786772727966, "learning_rate": 9.210414957550469e-05, "loss": 0.2015, "step": 5194 }, { "epoch": 1.0514065978546854, "grad_norm": 0.250900536775589, "learning_rate": 9.207244212591621e-05, "loss": 0.1823, "step": 5195 }, { "epoch": 1.0516089860352156, "grad_norm": 0.2858547270298004, "learning_rate": 9.204073547835485e-05, "loss": 0.196, "step": 5196 }, { "epoch": 1.0518113742157458, "grad_norm": 0.28630852699279785, "learning_rate": 9.200902963602835e-05, "loss": 0.2221, "step": 5197 }, { "epoch": 1.052013762396276, 
"grad_norm": 0.3132813274860382, "learning_rate": 9.19773246021444e-05, "loss": 0.1934, "step": 5198 }, { "epoch": 1.0522161505768064, "grad_norm": 0.32697784900665283, "learning_rate": 9.194562037991056e-05, "loss": 0.2026, "step": 5199 }, { "epoch": 1.0524185387573366, "grad_norm": 0.30263441801071167, "learning_rate": 9.191391697253433e-05, "loss": 0.2074, "step": 5200 }, { "epoch": 1.0524185387573366, "eval_loss": 0.2719886004924774, "eval_runtime": 0.7381, "eval_samples_per_second": 6.774, "eval_steps_per_second": 1.355, "step": 5200 }, { "epoch": 1.0526209269378668, "grad_norm": 0.2719072103500366, "learning_rate": 9.188221438322314e-05, "loss": 0.1895, "step": 5201 }, { "epoch": 1.052823315118397, "grad_norm": 0.2846873700618744, "learning_rate": 9.185051261518436e-05, "loss": 0.2252, "step": 5202 }, { "epoch": 1.0530257032989274, "grad_norm": 0.267671674489975, "learning_rate": 9.181881167162516e-05, "loss": 0.1791, "step": 5203 }, { "epoch": 1.0532280914794576, "grad_norm": 0.33255940675735474, "learning_rate": 9.178711155575276e-05, "loss": 0.2234, "step": 5204 }, { "epoch": 1.0534304796599878, "grad_norm": 0.28519174456596375, "learning_rate": 9.175541227077422e-05, "loss": 0.1832, "step": 5205 }, { "epoch": 1.053632867840518, "grad_norm": 0.26935869455337524, "learning_rate": 9.172371381989657e-05, "loss": 0.1925, "step": 5206 }, { "epoch": 1.0538352560210484, "grad_norm": 0.28792282938957214, "learning_rate": 9.16920162063267e-05, "loss": 0.2105, "step": 5207 }, { "epoch": 1.0540376442015786, "grad_norm": 0.291089802980423, "learning_rate": 9.166031943327147e-05, "loss": 0.2215, "step": 5208 }, { "epoch": 1.0542400323821088, "grad_norm": 0.4079270362854004, "learning_rate": 9.16286235039376e-05, "loss": 0.1799, "step": 5209 }, { "epoch": 1.054442420562639, "grad_norm": 0.39470669627189636, "learning_rate": 9.159692842153177e-05, "loss": 0.1637, "step": 5210 }, { "epoch": 1.0546448087431695, "grad_norm": 0.3241277039051056, "learning_rate": 
9.156523418926055e-05, "loss": 0.2344, "step": 5211 }, { "epoch": 1.0548471969236997, "grad_norm": 0.26196640729904175, "learning_rate": 9.153354081033043e-05, "loss": 0.1965, "step": 5212 }, { "epoch": 1.0550495851042299, "grad_norm": 0.2696246802806854, "learning_rate": 9.150184828794782e-05, "loss": 0.1988, "step": 5213 }, { "epoch": 1.05525197328476, "grad_norm": 0.30082598328590393, "learning_rate": 9.147015662531903e-05, "loss": 0.2245, "step": 5214 }, { "epoch": 1.0554543614652905, "grad_norm": 0.2307879775762558, "learning_rate": 9.143846582565027e-05, "loss": 0.162, "step": 5215 }, { "epoch": 1.0556567496458207, "grad_norm": 0.28145650029182434, "learning_rate": 9.140677589214772e-05, "loss": 0.1562, "step": 5216 }, { "epoch": 1.0558591378263509, "grad_norm": 0.341371089220047, "learning_rate": 9.137508682801742e-05, "loss": 0.2095, "step": 5217 }, { "epoch": 1.0560615260068813, "grad_norm": 0.2670517861843109, "learning_rate": 9.134339863646533e-05, "loss": 0.1889, "step": 5218 }, { "epoch": 1.0562639141874115, "grad_norm": 0.3274596631526947, "learning_rate": 9.13117113206973e-05, "loss": 0.212, "step": 5219 }, { "epoch": 1.0564663023679417, "grad_norm": 0.2960319221019745, "learning_rate": 9.128002488391919e-05, "loss": 0.2007, "step": 5220 }, { "epoch": 1.056668690548472, "grad_norm": 0.40096405148506165, "learning_rate": 9.124833932933665e-05, "loss": 0.1928, "step": 5221 }, { "epoch": 1.0568710787290023, "grad_norm": 0.28344669938087463, "learning_rate": 9.121665466015533e-05, "loss": 0.1538, "step": 5222 }, { "epoch": 1.0570734669095325, "grad_norm": 0.2522047758102417, "learning_rate": 9.118497087958071e-05, "loss": 0.1903, "step": 5223 }, { "epoch": 1.0572758550900627, "grad_norm": 0.2611517906188965, "learning_rate": 9.11532879908183e-05, "loss": 0.1681, "step": 5224 }, { "epoch": 1.057478243270593, "grad_norm": 0.2570279538631439, "learning_rate": 9.112160599707332e-05, "loss": 0.1745, "step": 5225 }, { "epoch": 1.0576806314511233, "grad_norm": 
0.29664525389671326, "learning_rate": 9.10899249015511e-05, "loss": 0.2311, "step": 5226 }, { "epoch": 1.0578830196316535, "grad_norm": 0.2817818820476532, "learning_rate": 9.105824470745678e-05, "loss": 0.186, "step": 5227 }, { "epoch": 1.0580854078121837, "grad_norm": 0.2797265350818634, "learning_rate": 9.102656541799543e-05, "loss": 0.1979, "step": 5228 }, { "epoch": 1.058287795992714, "grad_norm": 0.3641483187675476, "learning_rate": 9.099488703637205e-05, "loss": 0.1944, "step": 5229 }, { "epoch": 1.0584901841732444, "grad_norm": 0.3078802227973938, "learning_rate": 9.09632095657915e-05, "loss": 0.1997, "step": 5230 }, { "epoch": 1.0586925723537746, "grad_norm": 0.3237381875514984, "learning_rate": 9.093153300945858e-05, "loss": 0.1899, "step": 5231 }, { "epoch": 1.0588949605343048, "grad_norm": 0.2978192865848541, "learning_rate": 9.0899857370578e-05, "loss": 0.1889, "step": 5232 }, { "epoch": 1.059097348714835, "grad_norm": 0.28448426723480225, "learning_rate": 9.086818265235437e-05, "loss": 0.2004, "step": 5233 }, { "epoch": 1.0592997368953654, "grad_norm": 0.34776198863983154, "learning_rate": 9.083650885799218e-05, "loss": 0.2632, "step": 5234 }, { "epoch": 1.0595021250758956, "grad_norm": 0.22447577118873596, "learning_rate": 9.080483599069589e-05, "loss": 0.1518, "step": 5235 }, { "epoch": 1.0597045132564258, "grad_norm": 0.2891514003276825, "learning_rate": 9.077316405366981e-05, "loss": 0.2074, "step": 5236 }, { "epoch": 1.059906901436956, "grad_norm": 0.26560258865356445, "learning_rate": 9.074149305011818e-05, "loss": 0.1851, "step": 5237 }, { "epoch": 1.0601092896174864, "grad_norm": 0.3098355531692505, "learning_rate": 9.070982298324513e-05, "loss": 0.1936, "step": 5238 }, { "epoch": 1.0603116777980166, "grad_norm": 0.2618989050388336, "learning_rate": 9.067815385625471e-05, "loss": 0.2084, "step": 5239 }, { "epoch": 1.0605140659785468, "grad_norm": 0.2665063738822937, "learning_rate": 9.064648567235087e-05, "loss": 0.189, "step": 5240 }, { 
"epoch": 1.060716454159077, "grad_norm": 0.3155873715877533, "learning_rate": 9.061481843473746e-05, "loss": 0.2045, "step": 5241 }, { "epoch": 1.0609188423396074, "grad_norm": 0.2748834490776062, "learning_rate": 9.058315214661824e-05, "loss": 0.2126, "step": 5242 }, { "epoch": 1.0611212305201376, "grad_norm": 0.2678951025009155, "learning_rate": 9.055148681119688e-05, "loss": 0.1881, "step": 5243 }, { "epoch": 1.0613236187006678, "grad_norm": 0.2735554873943329, "learning_rate": 9.051982243167695e-05, "loss": 0.2194, "step": 5244 }, { "epoch": 1.061526006881198, "grad_norm": 0.36983054876327515, "learning_rate": 9.04881590112619e-05, "loss": 0.1863, "step": 5245 }, { "epoch": 1.0617283950617284, "grad_norm": 0.3115123510360718, "learning_rate": 9.045649655315515e-05, "loss": 0.2507, "step": 5246 }, { "epoch": 1.0619307832422586, "grad_norm": 0.33893606066703796, "learning_rate": 9.04248350605599e-05, "loss": 0.2123, "step": 5247 }, { "epoch": 1.0621331714227888, "grad_norm": 0.2772587835788727, "learning_rate": 9.039317453667938e-05, "loss": 0.2104, "step": 5248 }, { "epoch": 1.0623355596033193, "grad_norm": 0.3373522460460663, "learning_rate": 9.036151498471665e-05, "loss": 0.2273, "step": 5249 }, { "epoch": 1.0625379477838495, "grad_norm": 0.3072078227996826, "learning_rate": 9.03298564078747e-05, "loss": 0.2511, "step": 5250 }, { "epoch": 1.0625379477838495, "eval_loss": 0.2735154926776886, "eval_runtime": 0.7369, "eval_samples_per_second": 6.785, "eval_steps_per_second": 1.357, "step": 5250 }, { "epoch": 1.0627403359643797, "grad_norm": 0.2639048397541046, "learning_rate": 9.029819880935642e-05, "loss": 0.173, "step": 5251 }, { "epoch": 1.0629427241449099, "grad_norm": 0.2672668695449829, "learning_rate": 9.026654219236458e-05, "loss": 0.1915, "step": 5252 }, { "epoch": 1.0631451123254403, "grad_norm": 0.28084275126457214, "learning_rate": 9.023488656010188e-05, "loss": 0.2238, "step": 5253 }, { "epoch": 1.0633475005059705, "grad_norm": 0.2745071053504944, 
"learning_rate": 9.020323191577087e-05, "loss": 0.1724, "step": 5254 }, { "epoch": 1.0635498886865007, "grad_norm": 0.26804065704345703, "learning_rate": 9.017157826257407e-05, "loss": 0.1646, "step": 5255 }, { "epoch": 1.0637522768670309, "grad_norm": 0.3335314691066742, "learning_rate": 9.013992560371385e-05, "loss": 0.1889, "step": 5256 }, { "epoch": 1.0639546650475613, "grad_norm": 0.2998206913471222, "learning_rate": 9.010827394239249e-05, "loss": 0.2184, "step": 5257 }, { "epoch": 1.0641570532280915, "grad_norm": 0.26737990975379944, "learning_rate": 9.00766232818122e-05, "loss": 0.2071, "step": 5258 }, { "epoch": 1.0643594414086217, "grad_norm": 0.3056652545928955, "learning_rate": 9.004497362517504e-05, "loss": 0.204, "step": 5259 }, { "epoch": 1.064561829589152, "grad_norm": 0.26440155506134033, "learning_rate": 9.001332497568298e-05, "loss": 0.163, "step": 5260 }, { "epoch": 1.0647642177696823, "grad_norm": 0.25069424510002136, "learning_rate": 8.998167733653791e-05, "loss": 0.1881, "step": 5261 }, { "epoch": 1.0649666059502125, "grad_norm": 0.2964242696762085, "learning_rate": 8.99500307109416e-05, "loss": 0.2169, "step": 5262 }, { "epoch": 1.0651689941307427, "grad_norm": 0.26181793212890625, "learning_rate": 8.991838510209575e-05, "loss": 0.1768, "step": 5263 }, { "epoch": 1.065371382311273, "grad_norm": 0.26684558391571045, "learning_rate": 8.988674051320189e-05, "loss": 0.1906, "step": 5264 }, { "epoch": 1.0655737704918034, "grad_norm": 0.23875440657138824, "learning_rate": 8.985509694746152e-05, "loss": 0.1528, "step": 5265 }, { "epoch": 1.0657761586723336, "grad_norm": 0.22852183878421783, "learning_rate": 8.982345440807598e-05, "loss": 0.1609, "step": 5266 }, { "epoch": 1.0659785468528638, "grad_norm": 0.28481584787368774, "learning_rate": 8.979181289824655e-05, "loss": 0.2069, "step": 5267 }, { "epoch": 1.066180935033394, "grad_norm": 0.26229703426361084, "learning_rate": 8.976017242117438e-05, "loss": 0.2085, "step": 5268 }, { "epoch": 
1.0663833232139244, "grad_norm": 0.2593900263309479, "learning_rate": 8.972853298006054e-05, "loss": 0.1716, "step": 5269 }, { "epoch": 1.0665857113944546, "grad_norm": 0.30314525961875916, "learning_rate": 8.969689457810593e-05, "loss": 0.2148, "step": 5270 }, { "epoch": 1.0667880995749848, "grad_norm": 0.27090978622436523, "learning_rate": 8.966525721851143e-05, "loss": 0.1875, "step": 5271 }, { "epoch": 1.0669904877555152, "grad_norm": 0.2738669514656067, "learning_rate": 8.963362090447775e-05, "loss": 0.1887, "step": 5272 }, { "epoch": 1.0671928759360454, "grad_norm": 0.2838186025619507, "learning_rate": 8.960198563920553e-05, "loss": 0.185, "step": 5273 }, { "epoch": 1.0673952641165756, "grad_norm": 0.24561458826065063, "learning_rate": 8.95703514258953e-05, "loss": 0.1436, "step": 5274 }, { "epoch": 1.0675976522971058, "grad_norm": 0.28942060470581055, "learning_rate": 8.95387182677475e-05, "loss": 0.1846, "step": 5275 }, { "epoch": 1.067800040477636, "grad_norm": 0.30993714928627014, "learning_rate": 8.950708616796238e-05, "loss": 0.21, "step": 5276 }, { "epoch": 1.0680024286581664, "grad_norm": 0.3060511350631714, "learning_rate": 8.947545512974019e-05, "loss": 0.1867, "step": 5277 }, { "epoch": 1.0682048168386966, "grad_norm": 0.23082970082759857, "learning_rate": 8.944382515628104e-05, "loss": 0.1672, "step": 5278 }, { "epoch": 1.0684072050192268, "grad_norm": 0.3195563554763794, "learning_rate": 8.941219625078487e-05, "loss": 0.237, "step": 5279 }, { "epoch": 1.0686095931997572, "grad_norm": 0.2811078131198883, "learning_rate": 8.93805684164516e-05, "loss": 0.1512, "step": 5280 }, { "epoch": 1.0688119813802874, "grad_norm": 0.37472841143608093, "learning_rate": 8.9348941656481e-05, "loss": 0.2245, "step": 5281 }, { "epoch": 1.0690143695608176, "grad_norm": 0.24966472387313843, "learning_rate": 8.931731597407268e-05, "loss": 0.1916, "step": 5282 }, { "epoch": 1.0692167577413478, "grad_norm": 0.30108532309532166, "learning_rate": 8.928569137242628e-05, 
"loss": 0.1838, "step": 5283 }, { "epoch": 1.0694191459218783, "grad_norm": 0.30620646476745605, "learning_rate": 8.925406785474119e-05, "loss": 0.1921, "step": 5284 }, { "epoch": 1.0696215341024085, "grad_norm": 0.2843058407306671, "learning_rate": 8.922244542421676e-05, "loss": 0.2085, "step": 5285 }, { "epoch": 1.0698239222829387, "grad_norm": 0.27315425872802734, "learning_rate": 8.919082408405221e-05, "loss": 0.1745, "step": 5286 }, { "epoch": 1.0700263104634689, "grad_norm": 0.2726389169692993, "learning_rate": 8.915920383744667e-05, "loss": 0.2096, "step": 5287 }, { "epoch": 1.0702286986439993, "grad_norm": 0.276395320892334, "learning_rate": 8.912758468759912e-05, "loss": 0.202, "step": 5288 }, { "epoch": 1.0704310868245295, "grad_norm": 0.30593031644821167, "learning_rate": 8.909596663770849e-05, "loss": 0.2287, "step": 5289 }, { "epoch": 1.0706334750050597, "grad_norm": 0.4061230719089508, "learning_rate": 8.906434969097351e-05, "loss": 0.1944, "step": 5290 }, { "epoch": 1.0708358631855899, "grad_norm": 0.3001694679260254, "learning_rate": 8.903273385059293e-05, "loss": 0.2023, "step": 5291 }, { "epoch": 1.0710382513661203, "grad_norm": 0.24743273854255676, "learning_rate": 8.900111911976524e-05, "loss": 0.1624, "step": 5292 }, { "epoch": 1.0712406395466505, "grad_norm": 0.28860440850257874, "learning_rate": 8.896950550168888e-05, "loss": 0.2338, "step": 5293 }, { "epoch": 1.0714430277271807, "grad_norm": 0.30887338519096375, "learning_rate": 8.893789299956223e-05, "loss": 0.2008, "step": 5294 }, { "epoch": 1.071645415907711, "grad_norm": 0.3015650808811188, "learning_rate": 8.890628161658349e-05, "loss": 0.2064, "step": 5295 }, { "epoch": 1.0718478040882413, "grad_norm": 0.24830302596092224, "learning_rate": 8.887467135595078e-05, "loss": 0.1907, "step": 5296 }, { "epoch": 1.0720501922687715, "grad_norm": 0.27147096395492554, "learning_rate": 8.884306222086208e-05, "loss": 0.1608, "step": 5297 }, { "epoch": 1.0722525804493017, "grad_norm": 
0.4186892807483673, "learning_rate": 8.881145421451527e-05, "loss": 0.2069, "step": 5298 }, { "epoch": 1.072454968629832, "grad_norm": 0.28978657722473145, "learning_rate": 8.877984734010812e-05, "loss": 0.1996, "step": 5299 }, { "epoch": 1.0726573568103623, "grad_norm": 0.25510936975479126, "learning_rate": 8.874824160083829e-05, "loss": 0.1706, "step": 5300 }, { "epoch": 1.0726573568103623, "eval_loss": 0.27265626192092896, "eval_runtime": 0.7384, "eval_samples_per_second": 6.772, "eval_steps_per_second": 1.354, "step": 5300 }, { "epoch": 1.0728597449908925, "grad_norm": 0.2712719440460205, "learning_rate": 8.871663699990331e-05, "loss": 0.1873, "step": 5301 }, { "epoch": 1.0730621331714227, "grad_norm": 0.27994102239608765, "learning_rate": 8.86850335405006e-05, "loss": 0.2115, "step": 5302 }, { "epoch": 1.0732645213519532, "grad_norm": 0.294344037771225, "learning_rate": 8.865343122582749e-05, "loss": 0.2024, "step": 5303 }, { "epoch": 1.0734669095324834, "grad_norm": 0.30034956336021423, "learning_rate": 8.862183005908114e-05, "loss": 0.1942, "step": 5304 }, { "epoch": 1.0736692977130136, "grad_norm": 0.28707626461982727, "learning_rate": 8.859023004345862e-05, "loss": 0.2016, "step": 5305 }, { "epoch": 1.0738716858935438, "grad_norm": 0.2606486976146698, "learning_rate": 8.855863118215692e-05, "loss": 0.2008, "step": 5306 }, { "epoch": 1.074074074074074, "grad_norm": 0.2927470803260803, "learning_rate": 8.852703347837285e-05, "loss": 0.1894, "step": 5307 }, { "epoch": 1.0742764622546044, "grad_norm": 0.2737131714820862, "learning_rate": 8.849543693530315e-05, "loss": 0.2034, "step": 5308 }, { "epoch": 1.0744788504351346, "grad_norm": 0.25730380415916443, "learning_rate": 8.846384155614443e-05, "loss": 0.1926, "step": 5309 }, { "epoch": 1.0746812386156648, "grad_norm": 0.3297762870788574, "learning_rate": 8.843224734409317e-05, "loss": 0.1842, "step": 5310 }, { "epoch": 1.0748836267961952, "grad_norm": 0.2969495356082916, "learning_rate": 
8.840065430234576e-05, "loss": 0.2472, "step": 5311 }, { "epoch": 1.0750860149767254, "grad_norm": 0.25306811928749084, "learning_rate": 8.836906243409843e-05, "loss": 0.1767, "step": 5312 }, { "epoch": 1.0752884031572556, "grad_norm": 0.25566038489341736, "learning_rate": 8.833747174254736e-05, "loss": 0.1653, "step": 5313 }, { "epoch": 1.0754907913377858, "grad_norm": 0.27874088287353516, "learning_rate": 8.830588223088846e-05, "loss": 0.2032, "step": 5314 }, { "epoch": 1.0756931795183162, "grad_norm": 0.251594215631485, "learning_rate": 8.82742939023177e-05, "loss": 0.1993, "step": 5315 }, { "epoch": 1.0758955676988464, "grad_norm": 0.3001585900783539, "learning_rate": 8.824270676003087e-05, "loss": 0.196, "step": 5316 }, { "epoch": 1.0760979558793766, "grad_norm": 0.23913423717021942, "learning_rate": 8.821112080722359e-05, "loss": 0.1715, "step": 5317 }, { "epoch": 1.0763003440599068, "grad_norm": 0.3131200075149536, "learning_rate": 8.817953604709141e-05, "loss": 0.1991, "step": 5318 }, { "epoch": 1.0765027322404372, "grad_norm": 0.30624499917030334, "learning_rate": 8.814795248282974e-05, "loss": 0.2247, "step": 5319 }, { "epoch": 1.0767051204209674, "grad_norm": 0.30333131551742554, "learning_rate": 8.811637011763388e-05, "loss": 0.2114, "step": 5320 }, { "epoch": 1.0769075086014976, "grad_norm": 0.25488388538360596, "learning_rate": 8.8084788954699e-05, "loss": 0.2014, "step": 5321 }, { "epoch": 1.0771098967820278, "grad_norm": 0.2921813428401947, "learning_rate": 8.805320899722014e-05, "loss": 0.2073, "step": 5322 }, { "epoch": 1.0773122849625583, "grad_norm": 0.3513261675834656, "learning_rate": 8.802163024839224e-05, "loss": 0.1927, "step": 5323 }, { "epoch": 1.0775146731430885, "grad_norm": 0.29600194096565247, "learning_rate": 8.799005271141011e-05, "loss": 0.1911, "step": 5324 }, { "epoch": 1.0777170613236187, "grad_norm": 0.33089059591293335, "learning_rate": 8.795847638946841e-05, "loss": 0.1839, "step": 5325 }, { "epoch": 1.0779194495041489, 
"grad_norm": 0.35707876086235046, "learning_rate": 8.792690128576175e-05, "loss": 0.2012, "step": 5326 }, { "epoch": 1.0781218376846793, "grad_norm": 0.2894890308380127, "learning_rate": 8.78953274034845e-05, "loss": 0.2285, "step": 5327 }, { "epoch": 1.0783242258652095, "grad_norm": 0.2733427882194519, "learning_rate": 8.786375474583104e-05, "loss": 0.1767, "step": 5328 }, { "epoch": 1.0785266140457397, "grad_norm": 0.29151982069015503, "learning_rate": 8.783218331599552e-05, "loss": 0.2081, "step": 5329 }, { "epoch": 1.0787290022262699, "grad_norm": 0.2583785653114319, "learning_rate": 8.7800613117172e-05, "loss": 0.2054, "step": 5330 }, { "epoch": 1.0789313904068003, "grad_norm": 0.24573227763175964, "learning_rate": 8.776904415255448e-05, "loss": 0.153, "step": 5331 }, { "epoch": 1.0791337785873305, "grad_norm": 0.30387353897094727, "learning_rate": 8.77374764253367e-05, "loss": 0.1989, "step": 5332 }, { "epoch": 1.0793361667678607, "grad_norm": 0.27814245223999023, "learning_rate": 8.770590993871238e-05, "loss": 0.1832, "step": 5333 }, { "epoch": 1.0795385549483911, "grad_norm": 0.2869229018688202, "learning_rate": 8.767434469587512e-05, "loss": 0.1993, "step": 5334 }, { "epoch": 1.0797409431289213, "grad_norm": 0.28225669264793396, "learning_rate": 8.764278070001836e-05, "loss": 0.2109, "step": 5335 }, { "epoch": 1.0799433313094515, "grad_norm": 0.2703625559806824, "learning_rate": 8.761121795433534e-05, "loss": 0.1905, "step": 5336 }, { "epoch": 1.0801457194899817, "grad_norm": 0.2503441870212555, "learning_rate": 8.75796564620193e-05, "loss": 0.1854, "step": 5337 }, { "epoch": 1.080348107670512, "grad_norm": 0.29169875383377075, "learning_rate": 8.754809622626328e-05, "loss": 0.1762, "step": 5338 }, { "epoch": 1.0805504958510423, "grad_norm": 0.37799257040023804, "learning_rate": 8.751653725026025e-05, "loss": 0.216, "step": 5339 }, { "epoch": 1.0807528840315725, "grad_norm": 0.2747899293899536, "learning_rate": 8.748497953720298e-05, "loss": 0.1783, 
"step": 5340 }, { "epoch": 1.0809552722121027, "grad_norm": 0.3033110201358795, "learning_rate": 8.745342309028417e-05, "loss": 0.2141, "step": 5341 }, { "epoch": 1.0811576603926332, "grad_norm": 0.29245656728744507, "learning_rate": 8.742186791269636e-05, "loss": 0.2019, "step": 5342 }, { "epoch": 1.0813600485731634, "grad_norm": 0.25983068346977234, "learning_rate": 8.739031400763194e-05, "loss": 0.1859, "step": 5343 }, { "epoch": 1.0815624367536936, "grad_norm": 0.2650619149208069, "learning_rate": 8.735876137828327e-05, "loss": 0.2064, "step": 5344 }, { "epoch": 1.0817648249342238, "grad_norm": 0.2759478688240051, "learning_rate": 8.732721002784247e-05, "loss": 0.2229, "step": 5345 }, { "epoch": 1.0819672131147542, "grad_norm": 0.30478212237358093, "learning_rate": 8.72956599595016e-05, "loss": 0.2038, "step": 5346 }, { "epoch": 1.0821696012952844, "grad_norm": 0.349378764629364, "learning_rate": 8.726411117645255e-05, "loss": 0.2141, "step": 5347 }, { "epoch": 1.0823719894758146, "grad_norm": 0.3019539713859558, "learning_rate": 8.723256368188708e-05, "loss": 0.222, "step": 5348 }, { "epoch": 1.0825743776563448, "grad_norm": 0.2606692612171173, "learning_rate": 8.720101747899685e-05, "loss": 0.172, "step": 5349 }, { "epoch": 1.0827767658368752, "grad_norm": 0.29608097672462463, "learning_rate": 8.716947257097339e-05, "loss": 0.1961, "step": 5350 }, { "epoch": 1.0827767658368752, "eval_loss": 0.26948341727256775, "eval_runtime": 0.741, "eval_samples_per_second": 6.748, "eval_steps_per_second": 1.35, "step": 5350 }, { "epoch": 1.0829791540174054, "grad_norm": 0.2929864823818207, "learning_rate": 8.713792896100806e-05, "loss": 0.2005, "step": 5351 }, { "epoch": 1.0831815421979356, "grad_norm": 0.3245900869369507, "learning_rate": 8.710638665229211e-05, "loss": 0.2013, "step": 5352 }, { "epoch": 1.0833839303784658, "grad_norm": 0.2754800021648407, "learning_rate": 8.707484564801667e-05, "loss": 0.1602, "step": 5353 }, { "epoch": 1.0835863185589962, "grad_norm": 
0.3620850741863251, "learning_rate": 8.704330595137273e-05, "loss": 0.2033, "step": 5354 }, { "epoch": 1.0837887067395264, "grad_norm": 0.31015318632125854, "learning_rate": 8.701176756555114e-05, "loss": 0.2109, "step": 5355 }, { "epoch": 1.0839910949200566, "grad_norm": 0.2776755094528198, "learning_rate": 8.698023049374262e-05, "loss": 0.2029, "step": 5356 }, { "epoch": 1.0841934831005868, "grad_norm": 0.2785882353782654, "learning_rate": 8.694869473913775e-05, "loss": 0.1942, "step": 5357 }, { "epoch": 1.0843958712811173, "grad_norm": 0.23001353442668915, "learning_rate": 8.691716030492707e-05, "loss": 0.1692, "step": 5358 }, { "epoch": 1.0845982594616475, "grad_norm": 0.23214296996593475, "learning_rate": 8.688562719430077e-05, "loss": 0.1735, "step": 5359 }, { "epoch": 1.0848006476421777, "grad_norm": 0.3181362450122833, "learning_rate": 8.685409541044912e-05, "loss": 0.1579, "step": 5360 }, { "epoch": 1.0850030358227079, "grad_norm": 0.25155559182167053, "learning_rate": 8.682256495656215e-05, "loss": 0.1945, "step": 5361 }, { "epoch": 1.0852054240032383, "grad_norm": 0.2986045181751251, "learning_rate": 8.679103583582979e-05, "loss": 0.1929, "step": 5362 }, { "epoch": 1.0854078121837685, "grad_norm": 0.2835460603237152, "learning_rate": 8.675950805144183e-05, "loss": 0.1953, "step": 5363 }, { "epoch": 1.0856102003642987, "grad_norm": 0.30347853899002075, "learning_rate": 8.672798160658791e-05, "loss": 0.221, "step": 5364 }, { "epoch": 1.085812588544829, "grad_norm": 0.31518036127090454, "learning_rate": 8.669645650445755e-05, "loss": 0.2007, "step": 5365 }, { "epoch": 1.0860149767253593, "grad_norm": 0.2516017556190491, "learning_rate": 8.666493274824012e-05, "loss": 0.1734, "step": 5366 }, { "epoch": 1.0862173649058895, "grad_norm": 0.255487322807312, "learning_rate": 8.663341034112487e-05, "loss": 0.1795, "step": 5367 }, { "epoch": 1.0864197530864197, "grad_norm": 0.2932477593421936, "learning_rate": 8.660188928630092e-05, "loss": 0.2065, "step": 5368 }, 
{ "epoch": 1.08662214126695, "grad_norm": 0.3519365191459656, "learning_rate": 8.657036958695721e-05, "loss": 0.1865, "step": 5369 }, { "epoch": 1.0868245294474803, "grad_norm": 0.2964744567871094, "learning_rate": 8.65388512462826e-05, "loss": 0.2013, "step": 5370 }, { "epoch": 1.0870269176280105, "grad_norm": 0.3263915181159973, "learning_rate": 8.650733426746579e-05, "loss": 0.2065, "step": 5371 }, { "epoch": 1.0872293058085407, "grad_norm": 0.26859644055366516, "learning_rate": 8.647581865369529e-05, "loss": 0.1972, "step": 5372 }, { "epoch": 1.0874316939890711, "grad_norm": 0.31503501534461975, "learning_rate": 8.644430440815956e-05, "loss": 0.2107, "step": 5373 }, { "epoch": 1.0876340821696013, "grad_norm": 0.2556743025779724, "learning_rate": 8.641279153404688e-05, "loss": 0.1937, "step": 5374 }, { "epoch": 1.0878364703501315, "grad_norm": 0.2752053439617157, "learning_rate": 8.638128003454538e-05, "loss": 0.1779, "step": 5375 }, { "epoch": 1.0880388585306617, "grad_norm": 0.25501495599746704, "learning_rate": 8.634976991284307e-05, "loss": 0.1801, "step": 5376 }, { "epoch": 1.0882412467111922, "grad_norm": 0.31966909766197205, "learning_rate": 8.631826117212781e-05, "loss": 0.2226, "step": 5377 }, { "epoch": 1.0884436348917224, "grad_norm": 0.3165377974510193, "learning_rate": 8.628675381558732e-05, "loss": 0.2003, "step": 5378 }, { "epoch": 1.0886460230722526, "grad_norm": 0.27411338686943054, "learning_rate": 8.62552478464092e-05, "loss": 0.207, "step": 5379 }, { "epoch": 1.0888484112527828, "grad_norm": 0.30073752999305725, "learning_rate": 8.62237432677809e-05, "loss": 0.2094, "step": 5380 }, { "epoch": 1.0890507994333132, "grad_norm": 0.2878848612308502, "learning_rate": 8.619224008288969e-05, "loss": 0.1836, "step": 5381 }, { "epoch": 1.0892531876138434, "grad_norm": 0.3357694447040558, "learning_rate": 8.616073829492273e-05, "loss": 0.226, "step": 5382 }, { "epoch": 1.0894555757943736, "grad_norm": 0.28499335050582886, "learning_rate": 
8.612923790706707e-05, "loss": 0.2015, "step": 5383 }, { "epoch": 1.0896579639749038, "grad_norm": 0.3176371455192566, "learning_rate": 8.609773892250955e-05, "loss": 0.1925, "step": 5384 }, { "epoch": 1.0898603521554342, "grad_norm": 0.3828350901603699, "learning_rate": 8.606624134443695e-05, "loss": 0.2082, "step": 5385 }, { "epoch": 1.0900627403359644, "grad_norm": 0.340249240398407, "learning_rate": 8.603474517603584e-05, "loss": 0.2003, "step": 5386 }, { "epoch": 1.0902651285164946, "grad_norm": 0.3102370798587799, "learning_rate": 8.600325042049266e-05, "loss": 0.2237, "step": 5387 }, { "epoch": 1.0904675166970248, "grad_norm": 0.25209981203079224, "learning_rate": 8.597175708099377e-05, "loss": 0.1913, "step": 5388 }, { "epoch": 1.0906699048775552, "grad_norm": 0.30471372604370117, "learning_rate": 8.594026516072528e-05, "loss": 0.2305, "step": 5389 }, { "epoch": 1.0908722930580854, "grad_norm": 0.35013729333877563, "learning_rate": 8.590877466287323e-05, "loss": 0.228, "step": 5390 }, { "epoch": 1.0910746812386156, "grad_norm": 0.277339369058609, "learning_rate": 8.587728559062352e-05, "loss": 0.2104, "step": 5391 }, { "epoch": 1.0912770694191458, "grad_norm": 0.2655515968799591, "learning_rate": 8.584579794716184e-05, "loss": 0.1828, "step": 5392 }, { "epoch": 1.0914794575996762, "grad_norm": 0.2481672465801239, "learning_rate": 8.581431173567383e-05, "loss": 0.1907, "step": 5393 }, { "epoch": 1.0916818457802064, "grad_norm": 0.42780938744544983, "learning_rate": 8.57828269593449e-05, "loss": 0.2154, "step": 5394 }, { "epoch": 1.0918842339607366, "grad_norm": 0.2682367265224457, "learning_rate": 8.575134362136034e-05, "loss": 0.2119, "step": 5395 }, { "epoch": 1.092086622141267, "grad_norm": 0.2849738299846649, "learning_rate": 8.571986172490536e-05, "loss": 0.2048, "step": 5396 }, { "epoch": 1.0922890103217973, "grad_norm": 0.29217204451560974, "learning_rate": 8.56883812731649e-05, "loss": 0.2057, "step": 5397 }, { "epoch": 1.0924913985023275, 
"grad_norm": 0.3060377538204193, "learning_rate": 8.565690226932388e-05, "loss": 0.2257, "step": 5398 }, { "epoch": 1.0926937866828577, "grad_norm": 0.317339152097702, "learning_rate": 8.562542471656698e-05, "loss": 0.1812, "step": 5399 }, { "epoch": 1.092896174863388, "grad_norm": 0.4128323197364807, "learning_rate": 8.559394861807876e-05, "loss": 0.2057, "step": 5400 }, { "epoch": 1.092896174863388, "eval_loss": 0.26553845405578613, "eval_runtime": 0.7388, "eval_samples_per_second": 6.767, "eval_steps_per_second": 1.353, "step": 5400 }, { "epoch": 1.0930985630439183, "grad_norm": 0.29070258140563965, "learning_rate": 8.556247397704364e-05, "loss": 0.1732, "step": 5401 }, { "epoch": 1.0933009512244485, "grad_norm": 0.2655215859413147, "learning_rate": 8.553100079664598e-05, "loss": 0.196, "step": 5402 }, { "epoch": 1.0935033394049787, "grad_norm": 0.2547283172607422, "learning_rate": 8.549952908006981e-05, "loss": 0.1807, "step": 5403 }, { "epoch": 1.093705727585509, "grad_norm": 0.26383882761001587, "learning_rate": 8.546805883049912e-05, "loss": 0.1459, "step": 5404 }, { "epoch": 1.0939081157660393, "grad_norm": 0.2940872311592102, "learning_rate": 8.543659005111776e-05, "loss": 0.1863, "step": 5405 }, { "epoch": 1.0941105039465695, "grad_norm": 0.33460089564323425, "learning_rate": 8.54051227451094e-05, "loss": 0.2038, "step": 5406 }, { "epoch": 1.0943128921270997, "grad_norm": 0.3572755455970764, "learning_rate": 8.53736569156576e-05, "loss": 0.2042, "step": 5407 }, { "epoch": 1.0945152803076301, "grad_norm": 0.3915098011493683, "learning_rate": 8.534219256594569e-05, "loss": 0.1748, "step": 5408 }, { "epoch": 1.0947176684881603, "grad_norm": 0.31204935908317566, "learning_rate": 8.531072969915696e-05, "loss": 0.232, "step": 5409 }, { "epoch": 1.0949200566686905, "grad_norm": 0.2606232464313507, "learning_rate": 8.527926831847445e-05, "loss": 0.1788, "step": 5410 }, { "epoch": 1.0951224448492207, "grad_norm": 0.27161234617233276, "learning_rate": 
8.524780842708112e-05, "loss": 0.1911, "step": 5411 }, { "epoch": 1.0953248330297511, "grad_norm": 0.382242351770401, "learning_rate": 8.521635002815973e-05, "loss": 0.2008, "step": 5412 }, { "epoch": 1.0955272212102813, "grad_norm": 0.31286850571632385, "learning_rate": 8.518489312489293e-05, "loss": 0.2099, "step": 5413 }, { "epoch": 1.0957296093908115, "grad_norm": 0.28045061230659485, "learning_rate": 8.515343772046318e-05, "loss": 0.2031, "step": 5414 }, { "epoch": 1.0959319975713417, "grad_norm": 0.32816460728645325, "learning_rate": 8.512198381805282e-05, "loss": 0.2097, "step": 5415 }, { "epoch": 1.0961343857518722, "grad_norm": 0.27461788058280945, "learning_rate": 8.509053142084402e-05, "loss": 0.1847, "step": 5416 }, { "epoch": 1.0963367739324024, "grad_norm": 0.2543027997016907, "learning_rate": 8.505908053201882e-05, "loss": 0.2017, "step": 5417 }, { "epoch": 1.0965391621129326, "grad_norm": 0.29116228222846985, "learning_rate": 8.502763115475908e-05, "loss": 0.1862, "step": 5418 }, { "epoch": 1.0967415502934628, "grad_norm": 0.5247349143028259, "learning_rate": 8.499618329224652e-05, "loss": 0.2204, "step": 5419 }, { "epoch": 1.0969439384739932, "grad_norm": 0.27782142162323, "learning_rate": 8.496473694766268e-05, "loss": 0.1908, "step": 5420 }, { "epoch": 1.0971463266545234, "grad_norm": 0.27946728467941284, "learning_rate": 8.4933292124189e-05, "loss": 0.2034, "step": 5421 }, { "epoch": 1.0973487148350536, "grad_norm": 0.26838362216949463, "learning_rate": 8.490184882500674e-05, "loss": 0.2023, "step": 5422 }, { "epoch": 1.0975511030155838, "grad_norm": 0.2386351376771927, "learning_rate": 8.487040705329699e-05, "loss": 0.182, "step": 5423 }, { "epoch": 1.0977534911961142, "grad_norm": 0.31640326976776123, "learning_rate": 8.483896681224072e-05, "loss": 0.1941, "step": 5424 }, { "epoch": 1.0979558793766444, "grad_norm": 0.3387574553489685, "learning_rate": 8.480752810501868e-05, "loss": 0.221, "step": 5425 }, { "epoch": 1.0981582675571746, 
"grad_norm": 0.25791817903518677, "learning_rate": 8.477609093481154e-05, "loss": 0.1801, "step": 5426 }, { "epoch": 1.098360655737705, "grad_norm": 0.3015289902687073, "learning_rate": 8.474465530479976e-05, "loss": 0.2132, "step": 5427 }, { "epoch": 1.0985630439182352, "grad_norm": 0.3095368444919586, "learning_rate": 8.47132212181637e-05, "loss": 0.1777, "step": 5428 }, { "epoch": 1.0987654320987654, "grad_norm": 0.2534477412700653, "learning_rate": 8.468178867808352e-05, "loss": 0.2022, "step": 5429 }, { "epoch": 1.0989678202792956, "grad_norm": 0.3415544927120209, "learning_rate": 8.465035768773921e-05, "loss": 0.1755, "step": 5430 }, { "epoch": 1.099170208459826, "grad_norm": 0.2486085444688797, "learning_rate": 8.461892825031066e-05, "loss": 0.2079, "step": 5431 }, { "epoch": 1.0993725966403562, "grad_norm": 0.3206421136856079, "learning_rate": 8.458750036897756e-05, "loss": 0.2082, "step": 5432 }, { "epoch": 1.0995749848208864, "grad_norm": 0.2875009775161743, "learning_rate": 8.455607404691944e-05, "loss": 0.1765, "step": 5433 }, { "epoch": 1.0997773730014166, "grad_norm": 0.3375379741191864, "learning_rate": 8.45246492873157e-05, "loss": 0.2048, "step": 5434 }, { "epoch": 1.099979761181947, "grad_norm": 0.3194500207901001, "learning_rate": 8.449322609334558e-05, "loss": 0.1952, "step": 5435 }, { "epoch": 1.1001821493624773, "grad_norm": 0.308965802192688, "learning_rate": 8.446180446818813e-05, "loss": 0.243, "step": 5436 }, { "epoch": 1.1003845375430075, "grad_norm": 0.3235142230987549, "learning_rate": 8.443038441502225e-05, "loss": 0.2185, "step": 5437 }, { "epoch": 1.1005869257235377, "grad_norm": 0.23256078362464905, "learning_rate": 8.43989659370267e-05, "loss": 0.1578, "step": 5438 }, { "epoch": 1.100789313904068, "grad_norm": 0.5195769667625427, "learning_rate": 8.43675490373801e-05, "loss": 0.2077, "step": 5439 }, { "epoch": 1.1009917020845983, "grad_norm": 0.42865368723869324, "learning_rate": 8.433613371926083e-05, "loss": 0.21, "step": 5440 }, 
{ "epoch": 1.1011940902651285, "grad_norm": 0.30132371187210083, "learning_rate": 8.430471998584721e-05, "loss": 0.2243, "step": 5441 }, { "epoch": 1.1013964784456587, "grad_norm": 0.26939135789871216, "learning_rate": 8.427330784031732e-05, "loss": 0.1697, "step": 5442 }, { "epoch": 1.101598866626189, "grad_norm": 0.320512980222702, "learning_rate": 8.424189728584915e-05, "loss": 0.2202, "step": 5443 }, { "epoch": 1.1018012548067193, "grad_norm": 0.2802683711051941, "learning_rate": 8.421048832562044e-05, "loss": 0.196, "step": 5444 }, { "epoch": 1.1020036429872495, "grad_norm": 0.33519646525382996, "learning_rate": 8.417908096280885e-05, "loss": 0.2054, "step": 5445 }, { "epoch": 1.1022060311677797, "grad_norm": 0.3033265769481659, "learning_rate": 8.414767520059184e-05, "loss": 0.2043, "step": 5446 }, { "epoch": 1.1024084193483101, "grad_norm": 0.24827246367931366, "learning_rate": 8.411627104214674e-05, "loss": 0.2004, "step": 5447 }, { "epoch": 1.1026108075288403, "grad_norm": 0.34880825877189636, "learning_rate": 8.408486849065066e-05, "loss": 0.2216, "step": 5448 }, { "epoch": 1.1028131957093705, "grad_norm": 0.3037523031234741, "learning_rate": 8.405346754928057e-05, "loss": 0.2109, "step": 5449 }, { "epoch": 1.1030155838899007, "grad_norm": 0.25012820959091187, "learning_rate": 8.402206822121332e-05, "loss": 0.1732, "step": 5450 }, { "epoch": 1.1030155838899007, "eval_loss": 0.26805707812309265, "eval_runtime": 0.7385, "eval_samples_per_second": 6.77, "eval_steps_per_second": 1.354, "step": 5450 }, { "epoch": 1.1032179720704312, "grad_norm": 0.2542113959789276, "learning_rate": 8.399067050962555e-05, "loss": 0.2117, "step": 5451 }, { "epoch": 1.1034203602509614, "grad_norm": 0.32532787322998047, "learning_rate": 8.395927441769376e-05, "loss": 0.1961, "step": 5452 }, { "epoch": 1.1036227484314916, "grad_norm": 0.2972103953361511, "learning_rate": 8.392787994859427e-05, "loss": 0.2223, "step": 5453 }, { "epoch": 1.1038251366120218, "grad_norm": 
0.312966912984848, "learning_rate": 8.389648710550324e-05, "loss": 0.2092, "step": 5454 }, { "epoch": 1.1040275247925522, "grad_norm": 0.2706752121448517, "learning_rate": 8.386509589159666e-05, "loss": 0.2111, "step": 5455 }, { "epoch": 1.1042299129730824, "grad_norm": 0.3043646216392517, "learning_rate": 8.38337063100504e-05, "loss": 0.1995, "step": 5456 }, { "epoch": 1.1044323011536126, "grad_norm": 0.24441584944725037, "learning_rate": 8.38023183640401e-05, "loss": 0.1391, "step": 5457 }, { "epoch": 1.104634689334143, "grad_norm": 0.243491068482399, "learning_rate": 8.377093205674124e-05, "loss": 0.1765, "step": 5458 }, { "epoch": 1.1048370775146732, "grad_norm": 0.2956346273422241, "learning_rate": 8.373954739132922e-05, "loss": 0.2106, "step": 5459 }, { "epoch": 1.1050394656952034, "grad_norm": 0.2790488302707672, "learning_rate": 8.370816437097915e-05, "loss": 0.1816, "step": 5460 }, { "epoch": 1.1052418538757336, "grad_norm": 0.25901350378990173, "learning_rate": 8.367678299886608e-05, "loss": 0.1998, "step": 5461 }, { "epoch": 1.105444242056264, "grad_norm": 0.5830068588256836, "learning_rate": 8.364540327816483e-05, "loss": 0.2226, "step": 5462 }, { "epoch": 1.1056466302367942, "grad_norm": 0.2617352604866028, "learning_rate": 8.361402521205005e-05, "loss": 0.1773, "step": 5463 }, { "epoch": 1.1058490184173244, "grad_norm": 0.30582985281944275, "learning_rate": 8.358264880369629e-05, "loss": 0.1983, "step": 5464 }, { "epoch": 1.1060514065978546, "grad_norm": 0.2862445116043091, "learning_rate": 8.355127405627783e-05, "loss": 0.1742, "step": 5465 }, { "epoch": 1.106253794778385, "grad_norm": 0.26939576864242554, "learning_rate": 8.351990097296888e-05, "loss": 0.1604, "step": 5466 }, { "epoch": 1.1064561829589152, "grad_norm": 0.42266708612442017, "learning_rate": 8.348852955694342e-05, "loss": 0.198, "step": 5467 }, { "epoch": 1.1066585711394454, "grad_norm": 0.2866937816143036, "learning_rate": 8.34571598113753e-05, "loss": 0.2114, "step": 5468 }, { 
"epoch": 1.1068609593199756, "grad_norm": 0.30896636843681335, "learning_rate": 8.342579173943818e-05, "loss": 0.2039, "step": 5469 }, { "epoch": 1.107063347500506, "grad_norm": 0.25059643387794495, "learning_rate": 8.339442534430552e-05, "loss": 0.1659, "step": 5470 }, { "epoch": 1.1072657356810363, "grad_norm": 0.3479415774345398, "learning_rate": 8.336306062915066e-05, "loss": 0.1989, "step": 5471 }, { "epoch": 1.1074681238615665, "grad_norm": 0.34983018040657043, "learning_rate": 8.333169759714676e-05, "loss": 0.196, "step": 5472 }, { "epoch": 1.1076705120420967, "grad_norm": 0.33293071389198303, "learning_rate": 8.33003362514668e-05, "loss": 0.2463, "step": 5473 }, { "epoch": 1.107872900222627, "grad_norm": 0.3174991309642792, "learning_rate": 8.32689765952836e-05, "loss": 0.1945, "step": 5474 }, { "epoch": 1.1080752884031573, "grad_norm": 0.3204096853733063, "learning_rate": 8.323761863176977e-05, "loss": 0.1979, "step": 5475 }, { "epoch": 1.1082776765836875, "grad_norm": 0.3265916705131531, "learning_rate": 8.32062623640978e-05, "loss": 0.1879, "step": 5476 }, { "epoch": 1.1084800647642177, "grad_norm": 0.2553459703922272, "learning_rate": 8.317490779544e-05, "loss": 0.1804, "step": 5477 }, { "epoch": 1.108682452944748, "grad_norm": 0.26617133617401123, "learning_rate": 8.314355492896849e-05, "loss": 0.1855, "step": 5478 }, { "epoch": 1.1088848411252783, "grad_norm": 0.2975890040397644, "learning_rate": 8.311220376785521e-05, "loss": 0.1883, "step": 5479 }, { "epoch": 1.1090872293058085, "grad_norm": 0.27596038579940796, "learning_rate": 8.308085431527197e-05, "loss": 0.2003, "step": 5480 }, { "epoch": 1.1092896174863387, "grad_norm": 0.32845911383628845, "learning_rate": 8.304950657439033e-05, "loss": 0.2082, "step": 5481 }, { "epoch": 1.1094920056668691, "grad_norm": 0.30826571583747864, "learning_rate": 8.301816054838178e-05, "loss": 0.2075, "step": 5482 }, { "epoch": 1.1096943938473993, "grad_norm": 0.33803990483283997, "learning_rate": 
8.298681624041755e-05, "loss": 0.1956, "step": 5483 }, { "epoch": 1.1098967820279295, "grad_norm": 0.312499463558197, "learning_rate": 8.295547365366873e-05, "loss": 0.1909, "step": 5484 }, { "epoch": 1.1100991702084597, "grad_norm": 0.24040253460407257, "learning_rate": 8.292413279130624e-05, "loss": 0.1785, "step": 5485 }, { "epoch": 1.1103015583889901, "grad_norm": 0.30531415343284607, "learning_rate": 8.289279365650084e-05, "loss": 0.211, "step": 5486 }, { "epoch": 1.1105039465695203, "grad_norm": 0.2846146523952484, "learning_rate": 8.286145625242305e-05, "loss": 0.1942, "step": 5487 }, { "epoch": 1.1107063347500505, "grad_norm": 0.3238934278488159, "learning_rate": 8.283012058224329e-05, "loss": 0.2314, "step": 5488 }, { "epoch": 1.110908722930581, "grad_norm": 0.26682621240615845, "learning_rate": 8.279878664913177e-05, "loss": 0.1915, "step": 5489 }, { "epoch": 1.1111111111111112, "grad_norm": 0.28587329387664795, "learning_rate": 8.276745445625852e-05, "loss": 0.2053, "step": 5490 }, { "epoch": 1.1113134992916414, "grad_norm": 0.25702354311943054, "learning_rate": 8.273612400679346e-05, "loss": 0.1832, "step": 5491 }, { "epoch": 1.1115158874721716, "grad_norm": 0.39831334352493286, "learning_rate": 8.270479530390617e-05, "loss": 0.1789, "step": 5492 }, { "epoch": 1.111718275652702, "grad_norm": 0.28756701946258545, "learning_rate": 8.267346835076624e-05, "loss": 0.2076, "step": 5493 }, { "epoch": 1.1119206638332322, "grad_norm": 0.2621087431907654, "learning_rate": 8.264214315054295e-05, "loss": 0.2131, "step": 5494 }, { "epoch": 1.1121230520137624, "grad_norm": 0.30722442269325256, "learning_rate": 8.261081970640549e-05, "loss": 0.1999, "step": 5495 }, { "epoch": 1.1123254401942926, "grad_norm": 0.2630216181278229, "learning_rate": 8.257949802152282e-05, "loss": 0.2014, "step": 5496 }, { "epoch": 1.112527828374823, "grad_norm": 0.3075388967990875, "learning_rate": 8.254817809906377e-05, "loss": 0.214, "step": 5497 }, { "epoch": 1.1127302165553532, 
"grad_norm": 0.2784331440925598, "learning_rate": 8.251685994219693e-05, "loss": 0.2134, "step": 5498 }, { "epoch": 1.1129326047358834, "grad_norm": 0.28776970505714417, "learning_rate": 8.248554355409076e-05, "loss": 0.1863, "step": 5499 }, { "epoch": 1.1131349929164136, "grad_norm": 0.3288789689540863, "learning_rate": 8.24542289379135e-05, "loss": 0.2347, "step": 5500 }, { "epoch": 1.1131349929164136, "eval_loss": 0.266431599855423, "eval_runtime": 0.7362, "eval_samples_per_second": 6.792, "eval_steps_per_second": 1.358, "step": 5500 }, { "epoch": 1.113337381096944, "grad_norm": 0.2643654942512512, "learning_rate": 8.242291609683326e-05, "loss": 0.1786, "step": 5501 }, { "epoch": 1.1135397692774742, "grad_norm": 0.30738750100135803, "learning_rate": 8.239160503401794e-05, "loss": 0.1953, "step": 5502 }, { "epoch": 1.1137421574580044, "grad_norm": 0.2640698552131653, "learning_rate": 8.236029575263525e-05, "loss": 0.1898, "step": 5503 }, { "epoch": 1.1139445456385346, "grad_norm": 0.27601543068885803, "learning_rate": 8.232898825585275e-05, "loss": 0.188, "step": 5504 }, { "epoch": 1.114146933819065, "grad_norm": 0.2772660255432129, "learning_rate": 8.22976825468378e-05, "loss": 0.2217, "step": 5505 }, { "epoch": 1.1143493219995952, "grad_norm": 0.27593713998794556, "learning_rate": 8.226637862875758e-05, "loss": 0.1725, "step": 5506 }, { "epoch": 1.1145517101801254, "grad_norm": 0.2594548463821411, "learning_rate": 8.22350765047791e-05, "loss": 0.1799, "step": 5507 }, { "epoch": 1.1147540983606556, "grad_norm": 0.3393990397453308, "learning_rate": 8.220377617806916e-05, "loss": 0.189, "step": 5508 }, { "epoch": 1.114956486541186, "grad_norm": 0.28749755024909973, "learning_rate": 8.217247765179442e-05, "loss": 0.1873, "step": 5509 }, { "epoch": 1.1151588747217163, "grad_norm": 0.27113446593284607, "learning_rate": 8.214118092912133e-05, "loss": 0.2105, "step": 5510 }, { "epoch": 1.1153612629022465, "grad_norm": 0.2811520993709564, "learning_rate": 
8.210988601321616e-05, "loss": 0.2098, "step": 5511 }, { "epoch": 1.1155636510827767, "grad_norm": 0.29087939858436584, "learning_rate": 8.207859290724501e-05, "loss": 0.1824, "step": 5512 }, { "epoch": 1.115766039263307, "grad_norm": 0.2858453392982483, "learning_rate": 8.204730161437383e-05, "loss": 0.212, "step": 5513 }, { "epoch": 1.1159684274438373, "grad_norm": 0.31338831782341003, "learning_rate": 8.201601213776824e-05, "loss": 0.2128, "step": 5514 }, { "epoch": 1.1161708156243675, "grad_norm": 0.32885226607322693, "learning_rate": 8.198472448059385e-05, "loss": 0.1949, "step": 5515 }, { "epoch": 1.1163732038048977, "grad_norm": 0.27789217233657837, "learning_rate": 8.1953438646016e-05, "loss": 0.1884, "step": 5516 }, { "epoch": 1.116575591985428, "grad_norm": 0.29066577553749084, "learning_rate": 8.192215463719986e-05, "loss": 0.1751, "step": 5517 }, { "epoch": 1.1167779801659583, "grad_norm": 0.2519930899143219, "learning_rate": 8.189087245731045e-05, "loss": 0.1754, "step": 5518 }, { "epoch": 1.1169803683464885, "grad_norm": 0.26689547300338745, "learning_rate": 8.185959210951252e-05, "loss": 0.1657, "step": 5519 }, { "epoch": 1.117182756527019, "grad_norm": 0.2732199728488922, "learning_rate": 8.182831359697071e-05, "loss": 0.1926, "step": 5520 }, { "epoch": 1.1173851447075491, "grad_norm": 0.2849332392215729, "learning_rate": 8.179703692284948e-05, "loss": 0.1994, "step": 5521 }, { "epoch": 1.1175875328880793, "grad_norm": 0.30812686681747437, "learning_rate": 8.176576209031304e-05, "loss": 0.2105, "step": 5522 }, { "epoch": 1.1177899210686095, "grad_norm": 0.301147997379303, "learning_rate": 8.173448910252548e-05, "loss": 0.1888, "step": 5523 }, { "epoch": 1.11799230924914, "grad_norm": 0.2884552478790283, "learning_rate": 8.170321796265064e-05, "loss": 0.1824, "step": 5524 }, { "epoch": 1.1181946974296701, "grad_norm": 0.28129085898399353, "learning_rate": 8.16719486738522e-05, "loss": 0.2102, "step": 5525 }, { "epoch": 1.1183970856102003, 
"grad_norm": 0.2771812081336975, "learning_rate": 8.16406812392937e-05, "loss": 0.1954, "step": 5526 }, { "epoch": 1.1185994737907305, "grad_norm": 0.4494684934616089, "learning_rate": 8.160941566213843e-05, "loss": 0.2242, "step": 5527 }, { "epoch": 1.118801861971261, "grad_norm": 0.29533445835113525, "learning_rate": 8.157815194554952e-05, "loss": 0.2021, "step": 5528 }, { "epoch": 1.1190042501517912, "grad_norm": 0.34723520278930664, "learning_rate": 8.154689009268988e-05, "loss": 0.2154, "step": 5529 }, { "epoch": 1.1192066383323214, "grad_norm": 0.3085881173610687, "learning_rate": 8.15156301067223e-05, "loss": 0.2047, "step": 5530 }, { "epoch": 1.1194090265128516, "grad_norm": 0.2965511977672577, "learning_rate": 8.14843719908093e-05, "loss": 0.1804, "step": 5531 }, { "epoch": 1.119611414693382, "grad_norm": 0.2811489403247833, "learning_rate": 8.145311574811325e-05, "loss": 0.2255, "step": 5532 }, { "epoch": 1.1198138028739122, "grad_norm": 0.2637896239757538, "learning_rate": 8.142186138179635e-05, "loss": 0.1785, "step": 5533 }, { "epoch": 1.1200161910544424, "grad_norm": 0.28036361932754517, "learning_rate": 8.139060889502056e-05, "loss": 0.1853, "step": 5534 }, { "epoch": 1.1202185792349726, "grad_norm": 0.2461838275194168, "learning_rate": 8.135935829094772e-05, "loss": 0.1899, "step": 5535 }, { "epoch": 1.120420967415503, "grad_norm": 0.296796977519989, "learning_rate": 8.132810957273944e-05, "loss": 0.1901, "step": 5536 }, { "epoch": 1.1206233555960332, "grad_norm": 0.25972190499305725, "learning_rate": 8.129686274355709e-05, "loss": 0.168, "step": 5537 }, { "epoch": 1.1208257437765634, "grad_norm": 0.26876628398895264, "learning_rate": 8.12656178065619e-05, "loss": 0.224, "step": 5538 }, { "epoch": 1.1210281319570936, "grad_norm": 0.35002103447914124, "learning_rate": 8.123437476491492e-05, "loss": 0.1862, "step": 5539 }, { "epoch": 1.121230520137624, "grad_norm": 0.26477545499801636, "learning_rate": 8.1203133621777e-05, "loss": 0.1967, "step": 5540 
}, { "epoch": 1.1214329083181542, "grad_norm": 0.303501158952713, "learning_rate": 8.117189438030879e-05, "loss": 0.2196, "step": 5541 }, { "epoch": 1.1216352964986844, "grad_norm": 0.2881081998348236, "learning_rate": 8.114065704367074e-05, "loss": 0.1848, "step": 5542 }, { "epoch": 1.1218376846792146, "grad_norm": 0.33362969756126404, "learning_rate": 8.110942161502313e-05, "loss": 0.228, "step": 5543 }, { "epoch": 1.122040072859745, "grad_norm": 0.28015798330307007, "learning_rate": 8.107818809752602e-05, "loss": 0.2227, "step": 5544 }, { "epoch": 1.1222424610402753, "grad_norm": 0.29228246212005615, "learning_rate": 8.104695649433928e-05, "loss": 0.2313, "step": 5545 }, { "epoch": 1.1224448492208055, "grad_norm": 0.2991606295108795, "learning_rate": 8.101572680862264e-05, "loss": 0.2007, "step": 5546 }, { "epoch": 1.1226472374013357, "grad_norm": 0.31790077686309814, "learning_rate": 8.098449904353554e-05, "loss": 0.2082, "step": 5547 }, { "epoch": 1.122849625581866, "grad_norm": 0.2826329171657562, "learning_rate": 8.09532732022373e-05, "loss": 0.1866, "step": 5548 }, { "epoch": 1.1230520137623963, "grad_norm": 0.26081180572509766, "learning_rate": 8.092204928788703e-05, "loss": 0.203, "step": 5549 }, { "epoch": 1.1232544019429265, "grad_norm": 0.2624530792236328, "learning_rate": 8.089082730364363e-05, "loss": 0.2001, "step": 5550 }, { "epoch": 1.1232544019429265, "eval_loss": 0.2674531638622284, "eval_runtime": 0.7399, "eval_samples_per_second": 6.758, "eval_steps_per_second": 1.352, "step": 5550 }, { "epoch": 1.123456790123457, "grad_norm": 0.26963090896606445, "learning_rate": 8.085960725266581e-05, "loss": 0.1897, "step": 5551 }, { "epoch": 1.123659178303987, "grad_norm": 0.26589468121528625, "learning_rate": 8.08283891381121e-05, "loss": 0.1715, "step": 5552 }, { "epoch": 1.1238615664845173, "grad_norm": 0.24697373807430267, "learning_rate": 8.079717296314079e-05, "loss": 0.1569, "step": 5553 }, { "epoch": 1.1240639546650475, "grad_norm": 
0.26703882217407227, "learning_rate": 8.076595873091001e-05, "loss": 0.1884, "step": 5554 }, { "epoch": 1.124266342845578, "grad_norm": 0.30867865681648254, "learning_rate": 8.073474644457774e-05, "loss": 0.2118, "step": 5555 }, { "epoch": 1.1244687310261081, "grad_norm": 0.2832573354244232, "learning_rate": 8.070353610730168e-05, "loss": 0.212, "step": 5556 }, { "epoch": 1.1246711192066383, "grad_norm": 0.31668218970298767, "learning_rate": 8.067232772223934e-05, "loss": 0.1914, "step": 5557 }, { "epoch": 1.1248735073871685, "grad_norm": 0.3052870035171509, "learning_rate": 8.064112129254814e-05, "loss": 0.2067, "step": 5558 }, { "epoch": 1.125075895567699, "grad_norm": 0.25671231746673584, "learning_rate": 8.06099168213851e-05, "loss": 0.1761, "step": 5559 }, { "epoch": 1.1252782837482291, "grad_norm": 0.37939679622650146, "learning_rate": 8.057871431190723e-05, "loss": 0.2041, "step": 5560 }, { "epoch": 1.1254806719287593, "grad_norm": 0.276947021484375, "learning_rate": 8.054751376727125e-05, "loss": 0.176, "step": 5561 }, { "epoch": 1.1256830601092895, "grad_norm": 0.26697877049446106, "learning_rate": 8.051631519063372e-05, "loss": 0.1857, "step": 5562 }, { "epoch": 1.12588544828982, "grad_norm": 0.33952033519744873, "learning_rate": 8.048511858515099e-05, "loss": 0.1875, "step": 5563 }, { "epoch": 1.1260878364703502, "grad_norm": 0.35365763306617737, "learning_rate": 8.045392395397919e-05, "loss": 0.1963, "step": 5564 }, { "epoch": 1.1262902246508804, "grad_norm": 0.30936864018440247, "learning_rate": 8.042273130027425e-05, "loss": 0.2162, "step": 5565 }, { "epoch": 1.1264926128314106, "grad_norm": 0.23229964077472687, "learning_rate": 8.039154062719195e-05, "loss": 0.1731, "step": 5566 }, { "epoch": 1.126695001011941, "grad_norm": 0.2607414424419403, "learning_rate": 8.036035193788782e-05, "loss": 0.1897, "step": 5567 }, { "epoch": 1.1268973891924712, "grad_norm": 0.29121124744415283, "learning_rate": 8.03291652355172e-05, "loss": 0.2099, "step": 5568 }, { 
"epoch": 1.1270997773730014, "grad_norm": 0.2816689610481262, "learning_rate": 8.029798052323524e-05, "loss": 0.1858, "step": 5569 }, { "epoch": 1.1273021655535316, "grad_norm": 0.29074355959892273, "learning_rate": 8.026679780419689e-05, "loss": 0.1957, "step": 5570 }, { "epoch": 1.127504553734062, "grad_norm": 0.2680456340312958, "learning_rate": 8.023561708155687e-05, "loss": 0.1999, "step": 5571 }, { "epoch": 1.1277069419145922, "grad_norm": 0.371389776468277, "learning_rate": 8.020443835846973e-05, "loss": 0.2162, "step": 5572 }, { "epoch": 1.1279093300951224, "grad_norm": 0.27187761664390564, "learning_rate": 8.017326163808981e-05, "loss": 0.1823, "step": 5573 }, { "epoch": 1.1281117182756528, "grad_norm": 0.2521708011627197, "learning_rate": 8.014208692357121e-05, "loss": 0.1898, "step": 5574 }, { "epoch": 1.128314106456183, "grad_norm": 0.26570048928260803, "learning_rate": 8.011091421806792e-05, "loss": 0.1737, "step": 5575 }, { "epoch": 1.1285164946367132, "grad_norm": 0.29539164900779724, "learning_rate": 8.007974352473362e-05, "loss": 0.2042, "step": 5576 }, { "epoch": 1.1287188828172434, "grad_norm": 0.3029521107673645, "learning_rate": 8.004857484672186e-05, "loss": 0.2217, "step": 5577 }, { "epoch": 1.1289212709977736, "grad_norm": 0.22743040323257446, "learning_rate": 8.001740818718595e-05, "loss": 0.1799, "step": 5578 }, { "epoch": 1.129123659178304, "grad_norm": 0.2918127179145813, "learning_rate": 7.9986243549279e-05, "loss": 0.2112, "step": 5579 }, { "epoch": 1.1293260473588342, "grad_norm": 0.2503848671913147, "learning_rate": 7.995508093615394e-05, "loss": 0.1691, "step": 5580 }, { "epoch": 1.1295284355393644, "grad_norm": 0.3075704276561737, "learning_rate": 7.992392035096345e-05, "loss": 0.1898, "step": 5581 }, { "epoch": 1.1297308237198949, "grad_norm": 0.22540538012981415, "learning_rate": 7.989276179686002e-05, "loss": 0.1709, "step": 5582 }, { "epoch": 1.129933211900425, "grad_norm": 0.3081169128417969, "learning_rate": 
7.9861605276996e-05, "loss": 0.1968, "step": 5583 }, { "epoch": 1.1301356000809553, "grad_norm": 0.2457832396030426, "learning_rate": 7.983045079452344e-05, "loss": 0.1791, "step": 5584 }, { "epoch": 1.1303379882614855, "grad_norm": 0.3851803243160248, "learning_rate": 7.979929835259422e-05, "loss": 0.1885, "step": 5585 }, { "epoch": 1.1305403764420159, "grad_norm": 0.29105520248413086, "learning_rate": 7.976814795436004e-05, "loss": 0.1779, "step": 5586 }, { "epoch": 1.130742764622546, "grad_norm": 0.9472611546516418, "learning_rate": 7.973699960297236e-05, "loss": 0.1643, "step": 5587 }, { "epoch": 1.1309451528030763, "grad_norm": 0.2521185576915741, "learning_rate": 7.970585330158244e-05, "loss": 0.1771, "step": 5588 }, { "epoch": 1.1311475409836065, "grad_norm": 0.2751530706882477, "learning_rate": 7.967470905334133e-05, "loss": 0.1961, "step": 5589 }, { "epoch": 1.131349929164137, "grad_norm": 0.29079005122184753, "learning_rate": 7.96435668613999e-05, "loss": 0.1781, "step": 5590 }, { "epoch": 1.131552317344667, "grad_norm": 0.27955174446105957, "learning_rate": 7.961242672890877e-05, "loss": 0.1799, "step": 5591 }, { "epoch": 1.1317547055251973, "grad_norm": 0.3343000113964081, "learning_rate": 7.958128865901838e-05, "loss": 0.1942, "step": 5592 }, { "epoch": 1.1319570937057275, "grad_norm": 0.2868947982788086, "learning_rate": 7.955015265487895e-05, "loss": 0.2287, "step": 5593 }, { "epoch": 1.132159481886258, "grad_norm": 0.33234843611717224, "learning_rate": 7.95190187196405e-05, "loss": 0.2211, "step": 5594 }, { "epoch": 1.1323618700667881, "grad_norm": 0.3007453382015228, "learning_rate": 7.948788685645284e-05, "loss": 0.1778, "step": 5595 }, { "epoch": 1.1325642582473183, "grad_norm": 0.30757924914360046, "learning_rate": 7.945675706846555e-05, "loss": 0.2153, "step": 5596 }, { "epoch": 1.1327666464278485, "grad_norm": 0.2718496024608612, "learning_rate": 7.942562935882803e-05, "loss": 0.1933, "step": 5597 }, { "epoch": 1.132969034608379, "grad_norm": 
0.2909453511238098, "learning_rate": 7.939450373068942e-05, "loss": 0.1942, "step": 5598 }, { "epoch": 1.1331714227889091, "grad_norm": 0.27906984090805054, "learning_rate": 7.936338018719873e-05, "loss": 0.1797, "step": 5599 }, { "epoch": 1.1333738109694393, "grad_norm": 0.29126763343811035, "learning_rate": 7.93322587315047e-05, "loss": 0.1891, "step": 5600 }, { "epoch": 1.1333738109694393, "eval_loss": 0.27354076504707336, "eval_runtime": 0.7387, "eval_samples_per_second": 6.768, "eval_steps_per_second": 1.354, "step": 5600 }, { "epoch": 1.1335761991499695, "grad_norm": 0.3037993311882019, "learning_rate": 7.930113936675587e-05, "loss": 0.1996, "step": 5601 }, { "epoch": 1.1337785873305, "grad_norm": 0.2610785663127899, "learning_rate": 7.927002209610058e-05, "loss": 0.1559, "step": 5602 }, { "epoch": 1.1339809755110302, "grad_norm": 0.31370899081230164, "learning_rate": 7.923890692268692e-05, "loss": 0.2117, "step": 5603 }, { "epoch": 1.1341833636915604, "grad_norm": 0.3044915497303009, "learning_rate": 7.92077938496628e-05, "loss": 0.1878, "step": 5604 }, { "epoch": 1.1343857518720908, "grad_norm": 0.26320895552635193, "learning_rate": 7.917668288017595e-05, "loss": 0.1882, "step": 5605 }, { "epoch": 1.134588140052621, "grad_norm": 0.2812601923942566, "learning_rate": 7.914557401737381e-05, "loss": 0.1816, "step": 5606 }, { "epoch": 1.1347905282331512, "grad_norm": 0.36465582251548767, "learning_rate": 7.911446726440369e-05, "loss": 0.234, "step": 5607 }, { "epoch": 1.1349929164136814, "grad_norm": 0.39917242527008057, "learning_rate": 7.908336262441261e-05, "loss": 0.2317, "step": 5608 }, { "epoch": 1.1351953045942116, "grad_norm": 0.2993834316730499, "learning_rate": 7.905226010054741e-05, "loss": 0.2286, "step": 5609 }, { "epoch": 1.135397692774742, "grad_norm": 0.24275238811969757, "learning_rate": 7.902115969595474e-05, "loss": 0.1344, "step": 5610 }, { "epoch": 1.1356000809552722, "grad_norm": 0.2810470461845398, "learning_rate": 7.899006141378102e-05, 
"loss": 0.186, "step": 5611 }, { "epoch": 1.1358024691358024, "grad_norm": 0.24735689163208008, "learning_rate": 7.895896525717241e-05, "loss": 0.198, "step": 5612 }, { "epoch": 1.1360048573163328, "grad_norm": 0.2500055134296417, "learning_rate": 7.892787122927493e-05, "loss": 0.1961, "step": 5613 }, { "epoch": 1.136207245496863, "grad_norm": 0.2348107546567917, "learning_rate": 7.889677933323431e-05, "loss": 0.175, "step": 5614 }, { "epoch": 1.1364096336773932, "grad_norm": 0.27956265211105347, "learning_rate": 7.886568957219615e-05, "loss": 0.1484, "step": 5615 }, { "epoch": 1.1366120218579234, "grad_norm": 0.3258465528488159, "learning_rate": 7.883460194930575e-05, "loss": 0.2197, "step": 5616 }, { "epoch": 1.1368144100384538, "grad_norm": 0.3220537602901459, "learning_rate": 7.880351646770824e-05, "loss": 0.2107, "step": 5617 }, { "epoch": 1.137016798218984, "grad_norm": 0.30312204360961914, "learning_rate": 7.877243313054851e-05, "loss": 0.1828, "step": 5618 }, { "epoch": 1.1372191863995142, "grad_norm": 0.3163522183895111, "learning_rate": 7.874135194097128e-05, "loss": 0.1994, "step": 5619 }, { "epoch": 1.1374215745800444, "grad_norm": 0.2842673063278198, "learning_rate": 7.871027290212097e-05, "loss": 0.2129, "step": 5620 }, { "epoch": 1.1376239627605749, "grad_norm": 0.3099961578845978, "learning_rate": 7.867919601714186e-05, "loss": 0.2129, "step": 5621 }, { "epoch": 1.137826350941105, "grad_norm": 0.26963162422180176, "learning_rate": 7.8648121289178e-05, "loss": 0.1578, "step": 5622 }, { "epoch": 1.1380287391216353, "grad_norm": 0.27318212389945984, "learning_rate": 7.861704872137318e-05, "loss": 0.184, "step": 5623 }, { "epoch": 1.1382311273021655, "grad_norm": 0.26080846786499023, "learning_rate": 7.858597831687102e-05, "loss": 0.168, "step": 5624 }, { "epoch": 1.138433515482696, "grad_norm": 0.3094097077846527, "learning_rate": 7.855491007881485e-05, "loss": 0.2037, "step": 5625 }, { "epoch": 1.138635903663226, "grad_norm": 0.33787310123443604, 
"learning_rate": 7.852384401034785e-05, "loss": 0.2268, "step": 5626 }, { "epoch": 1.1388382918437563, "grad_norm": 0.2730053663253784, "learning_rate": 7.849278011461298e-05, "loss": 0.1785, "step": 5627 }, { "epoch": 1.1390406800242865, "grad_norm": 0.27871865034103394, "learning_rate": 7.846171839475295e-05, "loss": 0.2044, "step": 5628 }, { "epoch": 1.139243068204817, "grad_norm": 0.2883583605289459, "learning_rate": 7.843065885391025e-05, "loss": 0.2119, "step": 5629 }, { "epoch": 1.139445456385347, "grad_norm": 0.2682492733001709, "learning_rate": 7.839960149522715e-05, "loss": 0.1624, "step": 5630 }, { "epoch": 1.1396478445658773, "grad_norm": 0.27851811051368713, "learning_rate": 7.836854632184575e-05, "loss": 0.2059, "step": 5631 }, { "epoch": 1.1398502327464075, "grad_norm": 0.3172750771045685, "learning_rate": 7.833749333690783e-05, "loss": 0.1817, "step": 5632 }, { "epoch": 1.140052620926938, "grad_norm": 0.3163038492202759, "learning_rate": 7.830644254355504e-05, "loss": 0.1772, "step": 5633 }, { "epoch": 1.1402550091074681, "grad_norm": 0.3040738105773926, "learning_rate": 7.827539394492878e-05, "loss": 0.2129, "step": 5634 }, { "epoch": 1.1404573972879983, "grad_norm": 0.29727524518966675, "learning_rate": 7.824434754417018e-05, "loss": 0.1933, "step": 5635 }, { "epoch": 1.1406597854685288, "grad_norm": 0.2724418044090271, "learning_rate": 7.821330334442023e-05, "loss": 0.1936, "step": 5636 }, { "epoch": 1.140862173649059, "grad_norm": 0.30874359607696533, "learning_rate": 7.818226134881965e-05, "loss": 0.1836, "step": 5637 }, { "epoch": 1.1410645618295892, "grad_norm": 0.2828230559825897, "learning_rate": 7.815122156050893e-05, "loss": 0.1892, "step": 5638 }, { "epoch": 1.1412669500101194, "grad_norm": 0.24108606576919556, "learning_rate": 7.812018398262834e-05, "loss": 0.179, "step": 5639 }, { "epoch": 1.1414693381906496, "grad_norm": 0.30316025018692017, "learning_rate": 7.808914861831797e-05, "loss": 0.1994, "step": 5640 }, { "epoch": 
1.14167172637118, "grad_norm": 0.2653295695781708, "learning_rate": 7.805811547071763e-05, "loss": 0.1821, "step": 5641 }, { "epoch": 1.1418741145517102, "grad_norm": 0.2590709328651428, "learning_rate": 7.802708454296694e-05, "loss": 0.1874, "step": 5642 }, { "epoch": 1.1420765027322404, "grad_norm": 0.2897018790245056, "learning_rate": 7.799605583820527e-05, "loss": 0.1935, "step": 5643 }, { "epoch": 1.1422788909127708, "grad_norm": 0.2743369936943054, "learning_rate": 7.796502935957178e-05, "loss": 0.1983, "step": 5644 }, { "epoch": 1.142481279093301, "grad_norm": 0.3204341530799866, "learning_rate": 7.793400511020541e-05, "loss": 0.2016, "step": 5645 }, { "epoch": 1.1426836672738312, "grad_norm": 0.2664974629878998, "learning_rate": 7.790298309324489e-05, "loss": 0.1804, "step": 5646 }, { "epoch": 1.1428860554543614, "grad_norm": 0.3154468536376953, "learning_rate": 7.787196331182869e-05, "loss": 0.2191, "step": 5647 }, { "epoch": 1.1430884436348918, "grad_norm": 0.2492353469133377, "learning_rate": 7.784094576909503e-05, "loss": 0.1782, "step": 5648 }, { "epoch": 1.143290831815422, "grad_norm": 0.25519439578056335, "learning_rate": 7.780993046818194e-05, "loss": 0.1779, "step": 5649 }, { "epoch": 1.1434932199959522, "grad_norm": 0.28019189834594727, "learning_rate": 7.777891741222727e-05, "loss": 0.1686, "step": 5650 }, { "epoch": 1.1434932199959522, "eval_loss": 0.26935452222824097, "eval_runtime": 0.7379, "eval_samples_per_second": 6.776, "eval_steps_per_second": 1.355, "step": 5650 }, { "epoch": 1.1436956081764824, "grad_norm": 0.303603857755661, "learning_rate": 7.774790660436858e-05, "loss": 0.1917, "step": 5651 }, { "epoch": 1.1438979963570128, "grad_norm": 0.27201592922210693, "learning_rate": 7.77168980477432e-05, "loss": 0.1909, "step": 5652 }, { "epoch": 1.144100384537543, "grad_norm": 0.24054737389087677, "learning_rate": 7.768589174548826e-05, "loss": 0.1763, "step": 5653 }, { "epoch": 1.1443027727180732, "grad_norm": 0.3936968743801117, 
"learning_rate": 7.765488770074066e-05, "loss": 0.1579, "step": 5654 }, { "epoch": 1.1445051608986034, "grad_norm": 0.3285905420780182, "learning_rate": 7.762388591663705e-05, "loss": 0.2147, "step": 5655 }, { "epoch": 1.1447075490791339, "grad_norm": 0.2840346693992615, "learning_rate": 7.759288639631388e-05, "loss": 0.2141, "step": 5656 }, { "epoch": 1.144909937259664, "grad_norm": 0.3871752619743347, "learning_rate": 7.756188914290736e-05, "loss": 0.2267, "step": 5657 }, { "epoch": 1.1451123254401943, "grad_norm": 0.2643132507801056, "learning_rate": 7.753089415955343e-05, "loss": 0.2084, "step": 5658 }, { "epoch": 1.1453147136207245, "grad_norm": 0.22197052836418152, "learning_rate": 7.749990144938788e-05, "loss": 0.181, "step": 5659 }, { "epoch": 1.1455171018012549, "grad_norm": 0.36441561579704285, "learning_rate": 7.74689110155462e-05, "loss": 0.2048, "step": 5660 }, { "epoch": 1.145719489981785, "grad_norm": 0.2794022560119629, "learning_rate": 7.743792286116372e-05, "loss": 0.1842, "step": 5661 }, { "epoch": 1.1459218781623153, "grad_norm": 0.2635161876678467, "learning_rate": 7.740693698937542e-05, "loss": 0.1906, "step": 5662 }, { "epoch": 1.1461242663428455, "grad_norm": 0.30082035064697266, "learning_rate": 7.73759534033162e-05, "loss": 0.2073, "step": 5663 }, { "epoch": 1.146326654523376, "grad_norm": 0.3124663829803467, "learning_rate": 7.73449721061206e-05, "loss": 0.2077, "step": 5664 }, { "epoch": 1.146529042703906, "grad_norm": 0.26671409606933594, "learning_rate": 7.731399310092303e-05, "loss": 0.2005, "step": 5665 }, { "epoch": 1.1467314308844363, "grad_norm": 0.27739301323890686, "learning_rate": 7.728301639085758e-05, "loss": 0.1975, "step": 5666 }, { "epoch": 1.1469338190649667, "grad_norm": 0.3559306859970093, "learning_rate": 7.725204197905818e-05, "loss": 0.1811, "step": 5667 }, { "epoch": 1.147136207245497, "grad_norm": 0.27497610449790955, "learning_rate": 7.722106986865846e-05, "loss": 0.1797, "step": 5668 }, { "epoch": 
1.1473385954260271, "grad_norm": 0.2708543837070465, "learning_rate": 7.719010006279193e-05, "loss": 0.2243, "step": 5669 }, { "epoch": 1.1475409836065573, "grad_norm": 0.28557464480400085, "learning_rate": 7.715913256459168e-05, "loss": 0.1541, "step": 5670 }, { "epoch": 1.1477433717870875, "grad_norm": 0.24130353331565857, "learning_rate": 7.712816737719075e-05, "loss": 0.1835, "step": 5671 }, { "epoch": 1.147945759967618, "grad_norm": 0.32948946952819824, "learning_rate": 7.709720450372184e-05, "loss": 0.2035, "step": 5672 }, { "epoch": 1.1481481481481481, "grad_norm": 0.2876203954219818, "learning_rate": 7.706624394731746e-05, "loss": 0.1851, "step": 5673 }, { "epoch": 1.1483505363286783, "grad_norm": 0.281044065952301, "learning_rate": 7.703528571110988e-05, "loss": 0.1947, "step": 5674 }, { "epoch": 1.1485529245092088, "grad_norm": 0.3992454707622528, "learning_rate": 7.700432979823113e-05, "loss": 0.2331, "step": 5675 }, { "epoch": 1.148755312689739, "grad_norm": 0.2621214687824249, "learning_rate": 7.6973376211813e-05, "loss": 0.179, "step": 5676 }, { "epoch": 1.1489577008702692, "grad_norm": 0.26856037974357605, "learning_rate": 7.694242495498705e-05, "loss": 0.1815, "step": 5677 }, { "epoch": 1.1491600890507994, "grad_norm": 0.28116244077682495, "learning_rate": 7.691147603088456e-05, "loss": 0.1928, "step": 5678 }, { "epoch": 1.1493624772313298, "grad_norm": 0.2723270654678345, "learning_rate": 7.68805294426367e-05, "loss": 0.1825, "step": 5679 }, { "epoch": 1.14956486541186, "grad_norm": 0.29364144802093506, "learning_rate": 7.684958519337429e-05, "loss": 0.1983, "step": 5680 }, { "epoch": 1.1497672535923902, "grad_norm": 0.3394298851490021, "learning_rate": 7.681864328622792e-05, "loss": 0.1917, "step": 5681 }, { "epoch": 1.1499696417729204, "grad_norm": 0.2952880263328552, "learning_rate": 7.678770372432799e-05, "loss": 0.1874, "step": 5682 }, { "epoch": 1.1501720299534508, "grad_norm": 0.2824990153312683, "learning_rate": 7.675676651080464e-05, 
"loss": 0.177, "step": 5683 }, { "epoch": 1.150374418133981, "grad_norm": 0.29086270928382874, "learning_rate": 7.672583164878775e-05, "loss": 0.1867, "step": 5684 }, { "epoch": 1.1505768063145112, "grad_norm": 0.27432793378829956, "learning_rate": 7.669489914140701e-05, "loss": 0.1921, "step": 5685 }, { "epoch": 1.1507791944950414, "grad_norm": 0.32622668147087097, "learning_rate": 7.666396899179183e-05, "loss": 0.2174, "step": 5686 }, { "epoch": 1.1509815826755718, "grad_norm": 0.3424051105976105, "learning_rate": 7.663304120307141e-05, "loss": 0.2197, "step": 5687 }, { "epoch": 1.151183970856102, "grad_norm": 0.3257910907268524, "learning_rate": 7.660211577837469e-05, "loss": 0.2088, "step": 5688 }, { "epoch": 1.1513863590366322, "grad_norm": 0.27886244654655457, "learning_rate": 7.657119272083039e-05, "loss": 0.1725, "step": 5689 }, { "epoch": 1.1515887472171624, "grad_norm": 0.3835498094558716, "learning_rate": 7.654027203356699e-05, "loss": 0.2182, "step": 5690 }, { "epoch": 1.1517911353976928, "grad_norm": 0.3601949214935303, "learning_rate": 7.650935371971272e-05, "loss": 0.1524, "step": 5691 }, { "epoch": 1.151993523578223, "grad_norm": 0.3818325400352478, "learning_rate": 7.647843778239554e-05, "loss": 0.2032, "step": 5692 }, { "epoch": 1.1521959117587532, "grad_norm": 0.24166607856750488, "learning_rate": 7.64475242247432e-05, "loss": 0.1826, "step": 5693 }, { "epoch": 1.1523982999392834, "grad_norm": 0.2476678043603897, "learning_rate": 7.641661304988322e-05, "loss": 0.1766, "step": 5694 }, { "epoch": 1.1526006881198139, "grad_norm": 0.282958447933197, "learning_rate": 7.638570426094288e-05, "loss": 0.2119, "step": 5695 }, { "epoch": 1.152803076300344, "grad_norm": 0.251973420381546, "learning_rate": 7.63547978610492e-05, "loss": 0.1762, "step": 5696 }, { "epoch": 1.1530054644808743, "grad_norm": 0.32014891505241394, "learning_rate": 7.632389385332898e-05, "loss": 0.1977, "step": 5697 }, { "epoch": 1.1532078526614047, "grad_norm": 0.2665617763996124, 
"learning_rate": 7.629299224090873e-05, "loss": 0.1813, "step": 5698 }, { "epoch": 1.1534102408419349, "grad_norm": 0.31495678424835205, "learning_rate": 7.626209302691478e-05, "loss": 0.2024, "step": 5699 }, { "epoch": 1.153612629022465, "grad_norm": 0.26400405168533325, "learning_rate": 7.623119621447317e-05, "loss": 0.1825, "step": 5700 }, { "epoch": 1.153612629022465, "eval_loss": 0.27036550641059875, "eval_runtime": 0.7387, "eval_samples_per_second": 6.769, "eval_steps_per_second": 1.354, "step": 5700 }, { "epoch": 1.1538150172029953, "grad_norm": 0.3164843022823334, "learning_rate": 7.620030180670975e-05, "loss": 0.1982, "step": 5701 }, { "epoch": 1.1540174053835255, "grad_norm": 0.25917619466781616, "learning_rate": 7.616940980675004e-05, "loss": 0.1793, "step": 5702 }, { "epoch": 1.154219793564056, "grad_norm": 0.2807494103908539, "learning_rate": 7.613852021771939e-05, "loss": 0.1985, "step": 5703 }, { "epoch": 1.154422181744586, "grad_norm": 0.32781749963760376, "learning_rate": 7.610763304274291e-05, "loss": 0.2268, "step": 5704 }, { "epoch": 1.1546245699251163, "grad_norm": 0.2813950479030609, "learning_rate": 7.60767482849454e-05, "loss": 0.1821, "step": 5705 }, { "epoch": 1.1548269581056467, "grad_norm": 0.34443584084510803, "learning_rate": 7.604586594745149e-05, "loss": 0.2046, "step": 5706 }, { "epoch": 1.155029346286177, "grad_norm": 0.3148466646671295, "learning_rate": 7.601498603338548e-05, "loss": 0.1884, "step": 5707 }, { "epoch": 1.1552317344667071, "grad_norm": 0.29084309935569763, "learning_rate": 7.598410854587155e-05, "loss": 0.1952, "step": 5708 }, { "epoch": 1.1554341226472373, "grad_norm": 0.2680000364780426, "learning_rate": 7.595323348803351e-05, "loss": 0.1603, "step": 5709 }, { "epoch": 1.1556365108277677, "grad_norm": 0.32312873005867004, "learning_rate": 7.592236086299499e-05, "loss": 0.2157, "step": 5710 }, { "epoch": 1.155838899008298, "grad_norm": 0.43009570240974426, "learning_rate": 7.589149067387934e-05, "loss": 0.2189, 
"step": 5711 }, { "epoch": 1.1560412871888281, "grad_norm": 0.22385583817958832, "learning_rate": 7.586062292380971e-05, "loss": 0.1582, "step": 5712 }, { "epoch": 1.1562436753693583, "grad_norm": 0.2577807605266571, "learning_rate": 7.582975761590901e-05, "loss": 0.1635, "step": 5713 }, { "epoch": 1.1564460635498888, "grad_norm": 0.27913615107536316, "learning_rate": 7.579889475329975e-05, "loss": 0.1903, "step": 5714 }, { "epoch": 1.156648451730419, "grad_norm": 0.34618043899536133, "learning_rate": 7.576803433910439e-05, "loss": 0.1793, "step": 5715 }, { "epoch": 1.1568508399109492, "grad_norm": 0.2728598713874817, "learning_rate": 7.573717637644508e-05, "loss": 0.2004, "step": 5716 }, { "epoch": 1.1570532280914794, "grad_norm": 0.2509997487068176, "learning_rate": 7.570632086844364e-05, "loss": 0.1848, "step": 5717 }, { "epoch": 1.1572556162720098, "grad_norm": 0.2825949788093567, "learning_rate": 7.567546781822177e-05, "loss": 0.1845, "step": 5718 }, { "epoch": 1.15745800445254, "grad_norm": 0.3017740547657013, "learning_rate": 7.564461722890081e-05, "loss": 0.1882, "step": 5719 }, { "epoch": 1.1576603926330702, "grad_norm": 0.25365105271339417, "learning_rate": 7.561376910360194e-05, "loss": 0.174, "step": 5720 }, { "epoch": 1.1578627808136004, "grad_norm": 0.6366367936134338, "learning_rate": 7.558292344544603e-05, "loss": 0.1798, "step": 5721 }, { "epoch": 1.1580651689941308, "grad_norm": 0.28383052349090576, "learning_rate": 7.555208025755372e-05, "loss": 0.2105, "step": 5722 }, { "epoch": 1.158267557174661, "grad_norm": 0.30744868516921997, "learning_rate": 7.552123954304539e-05, "loss": 0.1909, "step": 5723 }, { "epoch": 1.1584699453551912, "grad_norm": 0.35757702589035034, "learning_rate": 7.54904013050412e-05, "loss": 0.2017, "step": 5724 }, { "epoch": 1.1586723335357214, "grad_norm": 0.28952452540397644, "learning_rate": 7.5459565546661e-05, "loss": 0.177, "step": 5725 }, { "epoch": 1.1588747217162518, "grad_norm": 0.3115183115005493, "learning_rate": 
7.54287322710245e-05, "loss": 0.2081, "step": 5726 }, { "epoch": 1.159077109896782, "grad_norm": 0.23546357452869415, "learning_rate": 7.539790148125103e-05, "loss": 0.1674, "step": 5727 }, { "epoch": 1.1592794980773122, "grad_norm": 0.2618536353111267, "learning_rate": 7.536707318045972e-05, "loss": 0.1827, "step": 5728 }, { "epoch": 1.1594818862578427, "grad_norm": 0.31117162108421326, "learning_rate": 7.53362473717695e-05, "loss": 0.1912, "step": 5729 }, { "epoch": 1.1596842744383729, "grad_norm": 0.28290385007858276, "learning_rate": 7.530542405829897e-05, "loss": 0.1975, "step": 5730 }, { "epoch": 1.159886662618903, "grad_norm": 0.27966201305389404, "learning_rate": 7.527460324316652e-05, "loss": 0.1771, "step": 5731 }, { "epoch": 1.1600890507994333, "grad_norm": 0.30912506580352783, "learning_rate": 7.524378492949027e-05, "loss": 0.2024, "step": 5732 }, { "epoch": 1.1602914389799635, "grad_norm": 0.2371935397386551, "learning_rate": 7.52129691203881e-05, "loss": 0.1673, "step": 5733 }, { "epoch": 1.1604938271604939, "grad_norm": 0.2745453715324402, "learning_rate": 7.518215581897763e-05, "loss": 0.1787, "step": 5734 }, { "epoch": 1.160696215341024, "grad_norm": 0.3203498125076294, "learning_rate": 7.51513450283762e-05, "loss": 0.1605, "step": 5735 }, { "epoch": 1.1608986035215543, "grad_norm": 0.313037633895874, "learning_rate": 7.5120536751701e-05, "loss": 0.2123, "step": 5736 }, { "epoch": 1.1611009917020847, "grad_norm": 0.308227002620697, "learning_rate": 7.50897309920688e-05, "loss": 0.1832, "step": 5737 }, { "epoch": 1.161303379882615, "grad_norm": 0.34059271216392517, "learning_rate": 7.505892775259624e-05, "loss": 0.2515, "step": 5738 }, { "epoch": 1.161505768063145, "grad_norm": 0.3082710802555084, "learning_rate": 7.502812703639966e-05, "loss": 0.2191, "step": 5739 }, { "epoch": 1.1617081562436753, "grad_norm": 0.26456886529922485, "learning_rate": 7.499732884659516e-05, "loss": 0.1952, "step": 5740 }, { "epoch": 1.1619105444242057, "grad_norm": 
0.2999734878540039, "learning_rate": 7.496653318629857e-05, "loss": 0.2187, "step": 5741 }, { "epoch": 1.162112932604736, "grad_norm": 0.248836487531662, "learning_rate": 7.493574005862549e-05, "loss": 0.1693, "step": 5742 }, { "epoch": 1.1623153207852661, "grad_norm": 0.2821309268474579, "learning_rate": 7.490494946669124e-05, "loss": 0.1903, "step": 5743 }, { "epoch": 1.1625177089657963, "grad_norm": 0.3294956684112549, "learning_rate": 7.487416141361087e-05, "loss": 0.1664, "step": 5744 }, { "epoch": 1.1627200971463267, "grad_norm": 0.2846418619155884, "learning_rate": 7.48433759024992e-05, "loss": 0.1768, "step": 5745 }, { "epoch": 1.162922485326857, "grad_norm": 0.29694685339927673, "learning_rate": 7.481259293647081e-05, "loss": 0.2119, "step": 5746 }, { "epoch": 1.1631248735073871, "grad_norm": 0.31481388211250305, "learning_rate": 7.478181251863998e-05, "loss": 0.184, "step": 5747 }, { "epoch": 1.1633272616879173, "grad_norm": 0.29571497440338135, "learning_rate": 7.475103465212073e-05, "loss": 0.1918, "step": 5748 }, { "epoch": 1.1635296498684478, "grad_norm": 0.30913057923316956, "learning_rate": 7.472025934002686e-05, "loss": 0.2121, "step": 5749 }, { "epoch": 1.163732038048978, "grad_norm": 0.40576112270355225, "learning_rate": 7.468948658547191e-05, "loss": 0.1983, "step": 5750 }, { "epoch": 1.163732038048978, "eval_loss": 0.26926201581954956, "eval_runtime": 0.7385, "eval_samples_per_second": 6.77, "eval_steps_per_second": 1.354, "step": 5750 }, { "epoch": 1.1639344262295082, "grad_norm": 0.2734135091304779, "learning_rate": 7.465871639156912e-05, "loss": 0.1961, "step": 5751 }, { "epoch": 1.1641368144100386, "grad_norm": 0.2998042404651642, "learning_rate": 7.462794876143151e-05, "loss": 0.2001, "step": 5752 }, { "epoch": 1.1643392025905688, "grad_norm": 0.2595381736755371, "learning_rate": 7.45971836981718e-05, "loss": 0.1578, "step": 5753 }, { "epoch": 1.164541590771099, "grad_norm": 0.2931159436702728, "learning_rate": 7.45664212049025e-05, 
"loss": 0.1983, "step": 5754 }, { "epoch": 1.1647439789516292, "grad_norm": 0.26160353422164917, "learning_rate": 7.453566128473584e-05, "loss": 0.1849, "step": 5755 }, { "epoch": 1.1649463671321594, "grad_norm": 0.35950490832328796, "learning_rate": 7.450490394078377e-05, "loss": 0.1843, "step": 5756 }, { "epoch": 1.1651487553126898, "grad_norm": 0.26078858971595764, "learning_rate": 7.4474149176158e-05, "loss": 0.1849, "step": 5757 }, { "epoch": 1.16535114349322, "grad_norm": 0.29962724447250366, "learning_rate": 7.444339699397001e-05, "loss": 0.2012, "step": 5758 }, { "epoch": 1.1655535316737502, "grad_norm": 0.2402295470237732, "learning_rate": 7.441264739733091e-05, "loss": 0.1849, "step": 5759 }, { "epoch": 1.1657559198542806, "grad_norm": 0.33564725518226624, "learning_rate": 7.438190038935168e-05, "loss": 0.2195, "step": 5760 }, { "epoch": 1.1659583080348108, "grad_norm": 0.25888490676879883, "learning_rate": 7.435115597314295e-05, "loss": 0.1859, "step": 5761 }, { "epoch": 1.166160696215341, "grad_norm": 0.3285493552684784, "learning_rate": 7.432041415181513e-05, "loss": 0.2098, "step": 5762 }, { "epoch": 1.1663630843958712, "grad_norm": 0.3084118664264679, "learning_rate": 7.428967492847836e-05, "loss": 0.1942, "step": 5763 }, { "epoch": 1.1665654725764014, "grad_norm": 0.2995312511920929, "learning_rate": 7.425893830624248e-05, "loss": 0.1952, "step": 5764 }, { "epoch": 1.1667678607569318, "grad_norm": 0.2527577877044678, "learning_rate": 7.422820428821716e-05, "loss": 0.2032, "step": 5765 }, { "epoch": 1.166970248937462, "grad_norm": 0.3025054931640625, "learning_rate": 7.419747287751169e-05, "loss": 0.2194, "step": 5766 }, { "epoch": 1.1671726371179922, "grad_norm": 0.311558336019516, "learning_rate": 7.416674407723518e-05, "loss": 0.2092, "step": 5767 }, { "epoch": 1.1673750252985227, "grad_norm": 0.2646612226963043, "learning_rate": 7.413601789049644e-05, "loss": 0.2167, "step": 5768 }, { "epoch": 1.1675774134790529, "grad_norm": 0.284598708152771, 
"learning_rate": 7.410529432040401e-05, "loss": 0.1923, "step": 5769 }, { "epoch": 1.167779801659583, "grad_norm": 0.2764895260334015, "learning_rate": 7.40745733700662e-05, "loss": 0.1709, "step": 5770 }, { "epoch": 1.1679821898401133, "grad_norm": 0.271132230758667, "learning_rate": 7.404385504259102e-05, "loss": 0.1925, "step": 5771 }, { "epoch": 1.1681845780206437, "grad_norm": 0.2631952464580536, "learning_rate": 7.401313934108622e-05, "loss": 0.1893, "step": 5772 }, { "epoch": 1.1683869662011739, "grad_norm": 0.2543254494667053, "learning_rate": 7.398242626865932e-05, "loss": 0.15, "step": 5773 }, { "epoch": 1.168589354381704, "grad_norm": 0.26711201667785645, "learning_rate": 7.395171582841753e-05, "loss": 0.1725, "step": 5774 }, { "epoch": 1.1687917425622343, "grad_norm": 0.29442912340164185, "learning_rate": 7.392100802346782e-05, "loss": 0.2194, "step": 5775 }, { "epoch": 1.1689941307427647, "grad_norm": 0.2915812134742737, "learning_rate": 7.389030285691684e-05, "loss": 0.2029, "step": 5776 }, { "epoch": 1.169196518923295, "grad_norm": 0.3449213206768036, "learning_rate": 7.385960033187109e-05, "loss": 0.2279, "step": 5777 }, { "epoch": 1.169398907103825, "grad_norm": 0.312273234128952, "learning_rate": 7.382890045143667e-05, "loss": 0.2489, "step": 5778 }, { "epoch": 1.1696012952843553, "grad_norm": 0.28574004769325256, "learning_rate": 7.379820321871951e-05, "loss": 0.1972, "step": 5779 }, { "epoch": 1.1698036834648857, "grad_norm": 0.2767406404018402, "learning_rate": 7.376750863682522e-05, "loss": 0.1987, "step": 5780 }, { "epoch": 1.170006071645416, "grad_norm": 0.26038801670074463, "learning_rate": 7.373681670885912e-05, "loss": 0.1837, "step": 5781 }, { "epoch": 1.1702084598259461, "grad_norm": 0.2696447968482971, "learning_rate": 7.370612743792636e-05, "loss": 0.1758, "step": 5782 }, { "epoch": 1.1704108480064765, "grad_norm": 0.2397984117269516, "learning_rate": 7.36754408271317e-05, "loss": 0.1637, "step": 5783 }, { "epoch": 1.1706132361870067, 
"grad_norm": 0.25606390833854675, "learning_rate": 7.364475687957972e-05, "loss": 0.1731, "step": 5784 }, { "epoch": 1.170815624367537, "grad_norm": 0.267202764749527, "learning_rate": 7.361407559837472e-05, "loss": 0.1717, "step": 5785 }, { "epoch": 1.1710180125480671, "grad_norm": 0.2734069228172302, "learning_rate": 7.358339698662066e-05, "loss": 0.2113, "step": 5786 }, { "epoch": 1.1712204007285973, "grad_norm": 0.25181517004966736, "learning_rate": 7.355272104742132e-05, "loss": 0.1635, "step": 5787 }, { "epoch": 1.1714227889091278, "grad_norm": 0.2656586170196533, "learning_rate": 7.352204778388016e-05, "loss": 0.1758, "step": 5788 }, { "epoch": 1.171625177089658, "grad_norm": 0.2795346677303314, "learning_rate": 7.349137719910037e-05, "loss": 0.2048, "step": 5789 }, { "epoch": 1.1718275652701882, "grad_norm": 0.3402021527290344, "learning_rate": 7.346070929618487e-05, "loss": 0.1918, "step": 5790 }, { "epoch": 1.1720299534507186, "grad_norm": 0.3135707378387451, "learning_rate": 7.343004407823635e-05, "loss": 0.1981, "step": 5791 }, { "epoch": 1.1722323416312488, "grad_norm": 0.3004648685455322, "learning_rate": 7.339938154835717e-05, "loss": 0.2041, "step": 5792 }, { "epoch": 1.172434729811779, "grad_norm": 0.2892182171344757, "learning_rate": 7.336872170964943e-05, "loss": 0.1955, "step": 5793 }, { "epoch": 1.1726371179923092, "grad_norm": 0.3182966411113739, "learning_rate": 7.333806456521501e-05, "loss": 0.2039, "step": 5794 }, { "epoch": 1.1728395061728394, "grad_norm": 0.2533356547355652, "learning_rate": 7.330741011815545e-05, "loss": 0.1735, "step": 5795 }, { "epoch": 1.1730418943533698, "grad_norm": 0.26306235790252686, "learning_rate": 7.327675837157206e-05, "loss": 0.1828, "step": 5796 }, { "epoch": 1.1732442825339, "grad_norm": 0.31353071331977844, "learning_rate": 7.324610932856584e-05, "loss": 0.2047, "step": 5797 }, { "epoch": 1.1734466707144302, "grad_norm": 0.2928347587585449, "learning_rate": 7.321546299223756e-05, "loss": 0.1931, "step": 
5798 }, { "epoch": 1.1736490588949606, "grad_norm": 0.2831988036632538, "learning_rate": 7.318481936568768e-05, "loss": 0.1872, "step": 5799 }, { "epoch": 1.1738514470754908, "grad_norm": 0.2879961133003235, "learning_rate": 7.315417845201641e-05, "loss": 0.2032, "step": 5800 }, { "epoch": 1.1738514470754908, "eval_loss": 0.2707487642765045, "eval_runtime": 0.7405, "eval_samples_per_second": 6.752, "eval_steps_per_second": 1.35, "step": 5800 }, { "epoch": 1.174053835256021, "grad_norm": 0.27901849150657654, "learning_rate": 7.312354025432368e-05, "loss": 0.1909, "step": 5801 }, { "epoch": 1.1742562234365512, "grad_norm": 0.2636677026748657, "learning_rate": 7.309290477570916e-05, "loss": 0.1768, "step": 5802 }, { "epoch": 1.1744586116170816, "grad_norm": 0.2820374071598053, "learning_rate": 7.306227201927218e-05, "loss": 0.2055, "step": 5803 }, { "epoch": 1.1746609997976118, "grad_norm": 0.32560697197914124, "learning_rate": 7.303164198811185e-05, "loss": 0.1841, "step": 5804 }, { "epoch": 1.174863387978142, "grad_norm": 0.293254554271698, "learning_rate": 7.300101468532703e-05, "loss": 0.2125, "step": 5805 }, { "epoch": 1.1750657761586722, "grad_norm": 0.24610738456249237, "learning_rate": 7.297039011401623e-05, "loss": 0.2007, "step": 5806 }, { "epoch": 1.1752681643392027, "grad_norm": 0.2838497757911682, "learning_rate": 7.293976827727775e-05, "loss": 0.1804, "step": 5807 }, { "epoch": 1.1754705525197329, "grad_norm": 0.24666501581668854, "learning_rate": 7.290914917820957e-05, "loss": 0.1206, "step": 5808 }, { "epoch": 1.175672940700263, "grad_norm": 0.26965487003326416, "learning_rate": 7.287853281990941e-05, "loss": 0.218, "step": 5809 }, { "epoch": 1.1758753288807933, "grad_norm": 0.2837803065776825, "learning_rate": 7.284791920547472e-05, "loss": 0.1739, "step": 5810 }, { "epoch": 1.1760777170613237, "grad_norm": 0.3394570052623749, "learning_rate": 7.281730833800266e-05, "loss": 0.2183, "step": 5811 }, { "epoch": 1.176280105241854, "grad_norm": 
0.28940433263778687, "learning_rate": 7.278670022059012e-05, "loss": 0.1727, "step": 5812 }, { "epoch": 1.176482493422384, "grad_norm": 0.3111019432544708, "learning_rate": 7.27560948563337e-05, "loss": 0.1958, "step": 5813 }, { "epoch": 1.1766848816029145, "grad_norm": 0.26451563835144043, "learning_rate": 7.272549224832974e-05, "loss": 0.1971, "step": 5814 }, { "epoch": 1.1768872697834447, "grad_norm": 0.29414859414100647, "learning_rate": 7.269489239967429e-05, "loss": 0.1824, "step": 5815 }, { "epoch": 1.177089657963975, "grad_norm": 0.2569786608219147, "learning_rate": 7.266429531346313e-05, "loss": 0.1561, "step": 5816 }, { "epoch": 1.177292046144505, "grad_norm": 0.3070293962955475, "learning_rate": 7.263370099279172e-05, "loss": 0.2182, "step": 5817 }, { "epoch": 1.1774944343250353, "grad_norm": 0.3086085617542267, "learning_rate": 7.26031094407553e-05, "loss": 0.2163, "step": 5818 }, { "epoch": 1.1776968225055657, "grad_norm": 0.9129588603973389, "learning_rate": 7.257252066044878e-05, "loss": 0.1985, "step": 5819 }, { "epoch": 1.177899210686096, "grad_norm": 0.34575098752975464, "learning_rate": 7.254193465496683e-05, "loss": 0.1907, "step": 5820 }, { "epoch": 1.1781015988666261, "grad_norm": 0.3244081437587738, "learning_rate": 7.251135142740384e-05, "loss": 0.1941, "step": 5821 }, { "epoch": 1.1783039870471566, "grad_norm": 0.2724374532699585, "learning_rate": 7.248077098085386e-05, "loss": 0.174, "step": 5822 }, { "epoch": 1.1785063752276868, "grad_norm": 0.3696213662624359, "learning_rate": 7.245019331841072e-05, "loss": 0.1876, "step": 5823 }, { "epoch": 1.178708763408217, "grad_norm": 0.26547983288764954, "learning_rate": 7.241961844316793e-05, "loss": 0.1784, "step": 5824 }, { "epoch": 1.1789111515887472, "grad_norm": 0.32802996039390564, "learning_rate": 7.238904635821882e-05, "loss": 0.2381, "step": 5825 }, { "epoch": 1.1791135397692774, "grad_norm": 0.29037603735923767, "learning_rate": 7.235847706665621e-05, "loss": 0.2041, "step": 5826 }, { 
"epoch": 1.1793159279498078, "grad_norm": 0.2806302607059479, "learning_rate": 7.232791057157287e-05, "loss": 0.1926, "step": 5827 }, { "epoch": 1.179518316130338, "grad_norm": 0.3043530583381653, "learning_rate": 7.229734687606118e-05, "loss": 0.2091, "step": 5828 }, { "epoch": 1.1797207043108682, "grad_norm": 0.30080434679985046, "learning_rate": 7.226678598321324e-05, "loss": 0.1938, "step": 5829 }, { "epoch": 1.1799230924913986, "grad_norm": 0.3069448471069336, "learning_rate": 7.223622789612088e-05, "loss": 0.1975, "step": 5830 }, { "epoch": 1.1801254806719288, "grad_norm": 0.27904027700424194, "learning_rate": 7.220567261787567e-05, "loss": 0.1966, "step": 5831 }, { "epoch": 1.180327868852459, "grad_norm": 0.33528608083724976, "learning_rate": 7.217512015156886e-05, "loss": 0.1983, "step": 5832 }, { "epoch": 1.1805302570329892, "grad_norm": 0.3564962148666382, "learning_rate": 7.214457050029144e-05, "loss": 0.1881, "step": 5833 }, { "epoch": 1.1807326452135196, "grad_norm": 0.2963548004627228, "learning_rate": 7.211402366713408e-05, "loss": 0.2, "step": 5834 }, { "epoch": 1.1809350333940498, "grad_norm": 0.26449206471443176, "learning_rate": 7.208347965518723e-05, "loss": 0.1658, "step": 5835 }, { "epoch": 1.18113742157458, "grad_norm": 0.2890670597553253, "learning_rate": 7.205293846754095e-05, "loss": 0.1917, "step": 5836 }, { "epoch": 1.1813398097551102, "grad_norm": 0.2905254662036896, "learning_rate": 7.202240010728514e-05, "loss": 0.2009, "step": 5837 }, { "epoch": 1.1815421979356406, "grad_norm": 0.2885470986366272, "learning_rate": 7.19918645775093e-05, "loss": 0.1886, "step": 5838 }, { "epoch": 1.1817445861161708, "grad_norm": 0.2656796872615814, "learning_rate": 7.196133188130272e-05, "loss": 0.1747, "step": 5839 }, { "epoch": 1.181946974296701, "grad_norm": 0.28297480940818787, "learning_rate": 7.19308020217544e-05, "loss": 0.184, "step": 5840 }, { "epoch": 1.1821493624772312, "grad_norm": 0.2764962911605835, "learning_rate": 7.190027500195297e-05, 
"loss": 0.179, "step": 5841 }, { "epoch": 1.1823517506577617, "grad_norm": 0.28837940096855164, "learning_rate": 7.186975082498689e-05, "loss": 0.2043, "step": 5842 }, { "epoch": 1.1825541388382919, "grad_norm": 0.41320478916168213, "learning_rate": 7.183922949394424e-05, "loss": 0.2055, "step": 5843 }, { "epoch": 1.182756527018822, "grad_norm": 0.32843437790870667, "learning_rate": 7.180871101191287e-05, "loss": 0.2397, "step": 5844 }, { "epoch": 1.1829589151993525, "grad_norm": 0.2618347704410553, "learning_rate": 7.17781953819803e-05, "loss": 0.1995, "step": 5845 }, { "epoch": 1.1831613033798827, "grad_norm": 0.30906206369400024, "learning_rate": 7.174768260723382e-05, "loss": 0.1996, "step": 5846 }, { "epoch": 1.1833636915604129, "grad_norm": 0.30363285541534424, "learning_rate": 7.171717269076036e-05, "loss": 0.2062, "step": 5847 }, { "epoch": 1.183566079740943, "grad_norm": 0.3194428086280823, "learning_rate": 7.168666563564661e-05, "loss": 0.1685, "step": 5848 }, { "epoch": 1.1837684679214733, "grad_norm": 0.2629626393318176, "learning_rate": 7.16561614449789e-05, "loss": 0.193, "step": 5849 }, { "epoch": 1.1839708561020037, "grad_norm": 0.34626123309135437, "learning_rate": 7.16256601218434e-05, "loss": 0.2522, "step": 5850 }, { "epoch": 1.1839708561020037, "eval_loss": 0.2733302414417267, "eval_runtime": 0.7396, "eval_samples_per_second": 6.761, "eval_steps_per_second": 1.352, "step": 5850 }, { "epoch": 1.184173244282534, "grad_norm": 0.3396178185939789, "learning_rate": 7.159516166932587e-05, "loss": 0.2486, "step": 5851 }, { "epoch": 1.184375632463064, "grad_norm": 0.2949715256690979, "learning_rate": 7.156466609051182e-05, "loss": 0.1871, "step": 5852 }, { "epoch": 1.1845780206435945, "grad_norm": 0.3098287284374237, "learning_rate": 7.153417338848651e-05, "loss": 0.1727, "step": 5853 }, { "epoch": 1.1847804088241247, "grad_norm": 0.2525773048400879, "learning_rate": 7.150368356633484e-05, "loss": 0.1943, "step": 5854 }, { "epoch": 1.184982797004655, 
"grad_norm": 0.31998711824417114, "learning_rate": 7.147319662714145e-05, "loss": 0.2131, "step": 5855 }, { "epoch": 1.1851851851851851, "grad_norm": 0.36343008279800415, "learning_rate": 7.144271257399068e-05, "loss": 0.1942, "step": 5856 }, { "epoch": 1.1853875733657153, "grad_norm": 0.32088908553123474, "learning_rate": 7.141223140996663e-05, "loss": 0.1923, "step": 5857 }, { "epoch": 1.1855899615462457, "grad_norm": 0.27131387591362, "learning_rate": 7.138175313815302e-05, "loss": 0.176, "step": 5858 }, { "epoch": 1.185792349726776, "grad_norm": 0.24319404363632202, "learning_rate": 7.135127776163336e-05, "loss": 0.1786, "step": 5859 }, { "epoch": 1.1859947379073061, "grad_norm": 0.26026079058647156, "learning_rate": 7.132080528349078e-05, "loss": 0.199, "step": 5860 }, { "epoch": 1.1861971260878366, "grad_norm": 0.2787209153175354, "learning_rate": 7.129033570680821e-05, "loss": 0.2177, "step": 5861 }, { "epoch": 1.1863995142683668, "grad_norm": 0.28949227929115295, "learning_rate": 7.125986903466823e-05, "loss": 0.2146, "step": 5862 }, { "epoch": 1.186601902448897, "grad_norm": 0.2977891266345978, "learning_rate": 7.122940527015314e-05, "loss": 0.198, "step": 5863 }, { "epoch": 1.1868042906294272, "grad_norm": 0.33024102449417114, "learning_rate": 7.119894441634494e-05, "loss": 0.198, "step": 5864 }, { "epoch": 1.1870066788099576, "grad_norm": 0.29550671577453613, "learning_rate": 7.116848647632532e-05, "loss": 0.2051, "step": 5865 }, { "epoch": 1.1872090669904878, "grad_norm": 0.2810512185096741, "learning_rate": 7.113803145317573e-05, "loss": 0.1662, "step": 5866 }, { "epoch": 1.187411455171018, "grad_norm": 0.2707330882549286, "learning_rate": 7.110757934997726e-05, "loss": 0.1954, "step": 5867 }, { "epoch": 1.1876138433515482, "grad_norm": 0.2579241394996643, "learning_rate": 7.107713016981075e-05, "loss": 0.1824, "step": 5868 }, { "epoch": 1.1878162315320786, "grad_norm": 0.23849055171012878, "learning_rate": 7.104668391575677e-05, "loss": 0.2018, 
"step": 5869 }, { "epoch": 1.1880186197126088, "grad_norm": 0.23894186317920685, "learning_rate": 7.101624059089547e-05, "loss": 0.15, "step": 5870 }, { "epoch": 1.188221007893139, "grad_norm": 0.29739347100257874, "learning_rate": 7.098580019830681e-05, "loss": 0.2244, "step": 5871 }, { "epoch": 1.1884233960736692, "grad_norm": 0.32754260301589966, "learning_rate": 7.095536274107046e-05, "loss": 0.1911, "step": 5872 }, { "epoch": 1.1886257842541996, "grad_norm": 0.3321670889854431, "learning_rate": 7.092492822226573e-05, "loss": 0.1978, "step": 5873 }, { "epoch": 1.1888281724347298, "grad_norm": 0.2944977283477783, "learning_rate": 7.089449664497169e-05, "loss": 0.1975, "step": 5874 }, { "epoch": 1.18903056061526, "grad_norm": 0.30659884214401245, "learning_rate": 7.086406801226709e-05, "loss": 0.1958, "step": 5875 }, { "epoch": 1.1892329487957904, "grad_norm": 0.2836208939552307, "learning_rate": 7.083364232723035e-05, "loss": 0.1985, "step": 5876 }, { "epoch": 1.1894353369763206, "grad_norm": 0.3166012763977051, "learning_rate": 7.080321959293964e-05, "loss": 0.2182, "step": 5877 }, { "epoch": 1.1896377251568508, "grad_norm": 0.28273382782936096, "learning_rate": 7.077279981247282e-05, "loss": 0.1937, "step": 5878 }, { "epoch": 1.189840113337381, "grad_norm": 0.3097810447216034, "learning_rate": 7.074238298890744e-05, "loss": 0.2273, "step": 5879 }, { "epoch": 1.1900425015179112, "grad_norm": 0.3035329580307007, "learning_rate": 7.071196912532075e-05, "loss": 0.2155, "step": 5880 }, { "epoch": 1.1902448896984417, "grad_norm": 0.2822204828262329, "learning_rate": 7.06815582247897e-05, "loss": 0.1871, "step": 5881 }, { "epoch": 1.1904472778789719, "grad_norm": 0.3719690442085266, "learning_rate": 7.065115029039097e-05, "loss": 0.2051, "step": 5882 }, { "epoch": 1.190649666059502, "grad_norm": 0.28150033950805664, "learning_rate": 7.062074532520089e-05, "loss": 0.217, "step": 5883 }, { "epoch": 1.1908520542400325, "grad_norm": 0.2478693574666977, "learning_rate": 
7.059034333229552e-05, "loss": 0.1606, "step": 5884 }, { "epoch": 1.1910544424205627, "grad_norm": 0.35104045271873474, "learning_rate": 7.055994431475064e-05, "loss": 0.1931, "step": 5885 }, { "epoch": 1.1912568306010929, "grad_norm": 0.31746405363082886, "learning_rate": 7.052954827564167e-05, "loss": 0.2154, "step": 5886 }, { "epoch": 1.191459218781623, "grad_norm": 0.298566997051239, "learning_rate": 7.049915521804377e-05, "loss": 0.2133, "step": 5887 }, { "epoch": 1.1916616069621533, "grad_norm": 0.2977025508880615, "learning_rate": 7.04687651450318e-05, "loss": 0.1913, "step": 5888 }, { "epoch": 1.1918639951426837, "grad_norm": 0.2290477752685547, "learning_rate": 7.043837805968027e-05, "loss": 0.1734, "step": 5889 }, { "epoch": 1.192066383323214, "grad_norm": 0.2559657394886017, "learning_rate": 7.040799396506351e-05, "loss": 0.1614, "step": 5890 }, { "epoch": 1.192268771503744, "grad_norm": 0.28855013847351074, "learning_rate": 7.037761286425543e-05, "loss": 0.2001, "step": 5891 }, { "epoch": 1.1924711596842745, "grad_norm": 0.28611060976982117, "learning_rate": 7.034723476032965e-05, "loss": 0.2088, "step": 5892 }, { "epoch": 1.1926735478648047, "grad_norm": 0.2739086151123047, "learning_rate": 7.031685965635948e-05, "loss": 0.1936, "step": 5893 }, { "epoch": 1.192875936045335, "grad_norm": 0.36974024772644043, "learning_rate": 7.0286487555418e-05, "loss": 0.2255, "step": 5894 }, { "epoch": 1.1930783242258651, "grad_norm": 0.33781471848487854, "learning_rate": 7.025611846057794e-05, "loss": 0.2057, "step": 5895 }, { "epoch": 1.1932807124063955, "grad_norm": 0.23322023451328278, "learning_rate": 7.022575237491171e-05, "loss": 0.1592, "step": 5896 }, { "epoch": 1.1934831005869257, "grad_norm": 0.33305227756500244, "learning_rate": 7.019538930149144e-05, "loss": 0.2014, "step": 5897 }, { "epoch": 1.193685488767456, "grad_norm": 0.3045668303966522, "learning_rate": 7.016502924338892e-05, "loss": 0.1735, "step": 5898 }, { "epoch": 1.1938878769479861, 
"grad_norm": 0.26250389218330383, "learning_rate": 7.013467220367571e-05, "loss": 0.1937, "step": 5899 }, { "epoch": 1.1940902651285166, "grad_norm": 0.2677670419216156, "learning_rate": 7.010431818542297e-05, "loss": 0.1967, "step": 5900 }, { "epoch": 1.1940902651285166, "eval_loss": 0.27170634269714355, "eval_runtime": 0.7392, "eval_samples_per_second": 6.764, "eval_steps_per_second": 1.353, "step": 5900 }, { "epoch": 1.1942926533090468, "grad_norm": 0.32133355736732483, "learning_rate": 7.007396719170165e-05, "loss": 0.1986, "step": 5901 }, { "epoch": 1.194495041489577, "grad_norm": 0.2636141777038574, "learning_rate": 7.004361922558232e-05, "loss": 0.1639, "step": 5902 }, { "epoch": 1.1946974296701072, "grad_norm": 0.27497488260269165, "learning_rate": 7.001327429013525e-05, "loss": 0.2042, "step": 5903 }, { "epoch": 1.1948998178506376, "grad_norm": 0.29585936665534973, "learning_rate": 6.998293238843044e-05, "loss": 0.1888, "step": 5904 }, { "epoch": 1.1951022060311678, "grad_norm": 0.2776990830898285, "learning_rate": 6.995259352353758e-05, "loss": 0.2039, "step": 5905 }, { "epoch": 1.195304594211698, "grad_norm": 0.2712746262550354, "learning_rate": 6.992225769852601e-05, "loss": 0.17, "step": 5906 }, { "epoch": 1.1955069823922284, "grad_norm": 0.3972209095954895, "learning_rate": 6.989192491646481e-05, "loss": 0.2051, "step": 5907 }, { "epoch": 1.1957093705727586, "grad_norm": 0.2815692126750946, "learning_rate": 6.986159518042273e-05, "loss": 0.1901, "step": 5908 }, { "epoch": 1.1959117587532888, "grad_norm": 0.32967308163642883, "learning_rate": 6.983126849346821e-05, "loss": 0.2278, "step": 5909 }, { "epoch": 1.196114146933819, "grad_norm": 0.2739197313785553, "learning_rate": 6.980094485866938e-05, "loss": 0.1795, "step": 5910 }, { "epoch": 1.1963165351143492, "grad_norm": 0.2594100832939148, "learning_rate": 6.977062427909408e-05, "loss": 0.1745, "step": 5911 }, { "epoch": 1.1965189232948796, "grad_norm": 0.27367496490478516, "learning_rate": 
6.974030675780982e-05, "loss": 0.1788, "step": 5912 }, { "epoch": 1.1967213114754098, "grad_norm": 0.35172849893569946, "learning_rate": 6.970999229788381e-05, "loss": 0.2273, "step": 5913 }, { "epoch": 1.19692369965594, "grad_norm": 0.3972116708755493, "learning_rate": 6.967968090238297e-05, "loss": 0.1738, "step": 5914 }, { "epoch": 1.1971260878364705, "grad_norm": 0.24987083673477173, "learning_rate": 6.964937257437386e-05, "loss": 0.1734, "step": 5915 }, { "epoch": 1.1973284760170007, "grad_norm": 0.27676859498023987, "learning_rate": 6.961906731692276e-05, "loss": 0.1973, "step": 5916 }, { "epoch": 1.1975308641975309, "grad_norm": 0.2857378423213959, "learning_rate": 6.958876513309565e-05, "loss": 0.1837, "step": 5917 }, { "epoch": 1.197733252378061, "grad_norm": 0.36634954810142517, "learning_rate": 6.955846602595817e-05, "loss": 0.2126, "step": 5918 }, { "epoch": 1.1979356405585915, "grad_norm": 0.2530911862850189, "learning_rate": 6.952816999857567e-05, "loss": 0.1965, "step": 5919 }, { "epoch": 1.1981380287391217, "grad_norm": 0.27030566334724426, "learning_rate": 6.949787705401321e-05, "loss": 0.1827, "step": 5920 }, { "epoch": 1.1983404169196519, "grad_norm": 0.2770173251628876, "learning_rate": 6.946758719533549e-05, "loss": 0.1862, "step": 5921 }, { "epoch": 1.198542805100182, "grad_norm": 0.2795921564102173, "learning_rate": 6.943730042560692e-05, "loss": 0.2097, "step": 5922 }, { "epoch": 1.1987451932807125, "grad_norm": 0.3575659990310669, "learning_rate": 6.940701674789162e-05, "loss": 0.2033, "step": 5923 }, { "epoch": 1.1989475814612427, "grad_norm": 0.30576780438423157, "learning_rate": 6.937673616525334e-05, "loss": 0.1951, "step": 5924 }, { "epoch": 1.199149969641773, "grad_norm": 0.3140013515949249, "learning_rate": 6.934645868075558e-05, "loss": 0.1944, "step": 5925 }, { "epoch": 1.199352357822303, "grad_norm": 0.3479853570461273, "learning_rate": 6.931618429746147e-05, "loss": 0.1831, "step": 5926 }, { "epoch": 1.1995547460028335, 
"grad_norm": 0.26819106936454773, "learning_rate": 6.928591301843389e-05, "loss": 0.1886, "step": 5927 }, { "epoch": 1.1997571341833637, "grad_norm": 0.2444477528333664, "learning_rate": 6.925564484673534e-05, "loss": 0.1746, "step": 5928 }, { "epoch": 1.199959522363894, "grad_norm": 0.2941974103450775, "learning_rate": 6.922537978542804e-05, "loss": 0.1963, "step": 5929 }, { "epoch": 1.2001619105444241, "grad_norm": 0.28198763728141785, "learning_rate": 6.91951178375739e-05, "loss": 0.192, "step": 5930 }, { "epoch": 1.2003642987249545, "grad_norm": 0.3067132830619812, "learning_rate": 6.916485900623453e-05, "loss": 0.2116, "step": 5931 }, { "epoch": 1.2005666869054847, "grad_norm": 0.2618827521800995, "learning_rate": 6.913460329447116e-05, "loss": 0.1794, "step": 5932 }, { "epoch": 1.200769075086015, "grad_norm": 0.2466832846403122, "learning_rate": 6.910435070534475e-05, "loss": 0.1855, "step": 5933 }, { "epoch": 1.2009714632665451, "grad_norm": 0.28418102860450745, "learning_rate": 6.907410124191598e-05, "loss": 0.1869, "step": 5934 }, { "epoch": 1.2011738514470756, "grad_norm": 0.26387548446655273, "learning_rate": 6.904385490724512e-05, "loss": 0.1716, "step": 5935 }, { "epoch": 1.2013762396276058, "grad_norm": 0.3512365221977234, "learning_rate": 6.901361170439223e-05, "loss": 0.2098, "step": 5936 }, { "epoch": 1.201578627808136, "grad_norm": 0.28574854135513306, "learning_rate": 6.898337163641695e-05, "loss": 0.1929, "step": 5937 }, { "epoch": 1.2017810159886664, "grad_norm": 0.28817689418792725, "learning_rate": 6.895313470637868e-05, "loss": 0.2125, "step": 5938 }, { "epoch": 1.2019834041691966, "grad_norm": 0.2598557770252228, "learning_rate": 6.892290091733646e-05, "loss": 0.1669, "step": 5939 }, { "epoch": 1.2021857923497268, "grad_norm": 0.29402410984039307, "learning_rate": 6.889267027234905e-05, "loss": 0.2113, "step": 5940 }, { "epoch": 1.202388180530257, "grad_norm": 0.283035546541214, "learning_rate": 6.886244277447485e-05, "loss": 0.1857, 
"step": 5941 }, { "epoch": 1.2025905687107872, "grad_norm": 0.2820853888988495, "learning_rate": 6.883221842677196e-05, "loss": 0.1733, "step": 5942 }, { "epoch": 1.2027929568913176, "grad_norm": 0.3180203139781952, "learning_rate": 6.880199723229817e-05, "loss": 0.1972, "step": 5943 }, { "epoch": 1.2029953450718478, "grad_norm": 0.31403082609176636, "learning_rate": 6.877177919411095e-05, "loss": 0.2062, "step": 5944 }, { "epoch": 1.203197733252378, "grad_norm": 0.2898111045360565, "learning_rate": 6.874156431526743e-05, "loss": 0.189, "step": 5945 }, { "epoch": 1.2034001214329084, "grad_norm": 0.3071668744087219, "learning_rate": 6.871135259882445e-05, "loss": 0.1931, "step": 5946 }, { "epoch": 1.2036025096134386, "grad_norm": 0.33244362473487854, "learning_rate": 6.868114404783849e-05, "loss": 0.2055, "step": 5947 }, { "epoch": 1.2038048977939688, "grad_norm": 0.3338499963283539, "learning_rate": 6.865093866536576e-05, "loss": 0.208, "step": 5948 }, { "epoch": 1.204007285974499, "grad_norm": 0.27435195446014404, "learning_rate": 6.862073645446211e-05, "loss": 0.2154, "step": 5949 }, { "epoch": 1.2042096741550294, "grad_norm": 0.3393666744232178, "learning_rate": 6.859053741818311e-05, "loss": 0.2009, "step": 5950 }, { "epoch": 1.2042096741550294, "eval_loss": 0.2669513523578644, "eval_runtime": 0.7388, "eval_samples_per_second": 6.768, "eval_steps_per_second": 1.354, "step": 5950 }, { "epoch": 1.2044120623355596, "grad_norm": 0.31124457716941833, "learning_rate": 6.856034155958394e-05, "loss": 0.1991, "step": 5951 }, { "epoch": 1.2046144505160898, "grad_norm": 0.2741898000240326, "learning_rate": 6.853014888171952e-05, "loss": 0.193, "step": 5952 }, { "epoch": 1.20481683869662, "grad_norm": 0.2568211257457733, "learning_rate": 6.849995938764442e-05, "loss": 0.1955, "step": 5953 }, { "epoch": 1.2050192268771505, "grad_norm": 0.3031553626060486, "learning_rate": 6.846977308041292e-05, "loss": 0.1921, "step": 5954 }, { "epoch": 1.2052216150576807, "grad_norm": 
0.48870640993118286, "learning_rate": 6.843958996307892e-05, "loss": 0.2202, "step": 5955 }, { "epoch": 1.2054240032382109, "grad_norm": 0.2706666588783264, "learning_rate": 6.840941003869606e-05, "loss": 0.1782, "step": 5956 }, { "epoch": 1.205626391418741, "grad_norm": 0.2780493199825287, "learning_rate": 6.83792333103176e-05, "loss": 0.2277, "step": 5957 }, { "epoch": 1.2058287795992715, "grad_norm": 0.3681378960609436, "learning_rate": 6.834905978099655e-05, "loss": 0.1998, "step": 5958 }, { "epoch": 1.2060311677798017, "grad_norm": 0.3750686049461365, "learning_rate": 6.83188894537855e-05, "loss": 0.2168, "step": 5959 }, { "epoch": 1.2062335559603319, "grad_norm": 0.31940484046936035, "learning_rate": 6.82887223317368e-05, "loss": 0.1864, "step": 5960 }, { "epoch": 1.206435944140862, "grad_norm": 0.24824610352516174, "learning_rate": 6.825855841790242e-05, "loss": 0.1506, "step": 5961 }, { "epoch": 1.2066383323213925, "grad_norm": 0.2760399580001831, "learning_rate": 6.822839771533405e-05, "loss": 0.1842, "step": 5962 }, { "epoch": 1.2068407205019227, "grad_norm": 0.2778237462043762, "learning_rate": 6.8198240227083e-05, "loss": 0.2034, "step": 5963 }, { "epoch": 1.207043108682453, "grad_norm": 0.27293694019317627, "learning_rate": 6.816808595620034e-05, "loss": 0.2083, "step": 5964 }, { "epoch": 1.207245496862983, "grad_norm": 0.25446030497550964, "learning_rate": 6.813793490573672e-05, "loss": 0.1804, "step": 5965 }, { "epoch": 1.2074478850435135, "grad_norm": 0.25617870688438416, "learning_rate": 6.810778707874253e-05, "loss": 0.1838, "step": 5966 }, { "epoch": 1.2076502732240437, "grad_norm": 0.330093652009964, "learning_rate": 6.80776424782678e-05, "loss": 0.2248, "step": 5967 }, { "epoch": 1.207852661404574, "grad_norm": 0.3035315275192261, "learning_rate": 6.804750110736224e-05, "loss": 0.2039, "step": 5968 }, { "epoch": 1.2080550495851043, "grad_norm": 0.2813340425491333, "learning_rate": 6.801736296907524e-05, "loss": 0.2072, "step": 5969 }, { 
"epoch": 1.2082574377656345, "grad_norm": 0.30979204177856445, "learning_rate": 6.798722806645588e-05, "loss": 0.2057, "step": 5970 }, { "epoch": 1.2084598259461647, "grad_norm": 0.273605078458786, "learning_rate": 6.795709640255286e-05, "loss": 0.1723, "step": 5971 }, { "epoch": 1.208662214126695, "grad_norm": 0.2786939740180969, "learning_rate": 6.792696798041463e-05, "loss": 0.1982, "step": 5972 }, { "epoch": 1.2088646023072251, "grad_norm": 0.2765547037124634, "learning_rate": 6.789684280308922e-05, "loss": 0.1726, "step": 5973 }, { "epoch": 1.2090669904877556, "grad_norm": 0.2661789357662201, "learning_rate": 6.78667208736244e-05, "loss": 0.1907, "step": 5974 }, { "epoch": 1.2092693786682858, "grad_norm": 0.276862770318985, "learning_rate": 6.78366021950676e-05, "loss": 0.1961, "step": 5975 }, { "epoch": 1.209471766848816, "grad_norm": 0.36788874864578247, "learning_rate": 6.78064867704659e-05, "loss": 0.2106, "step": 5976 }, { "epoch": 1.2096741550293464, "grad_norm": 0.3060303032398224, "learning_rate": 6.777637460286607e-05, "loss": 0.1976, "step": 5977 }, { "epoch": 1.2098765432098766, "grad_norm": 0.4363291263580322, "learning_rate": 6.774626569531453e-05, "loss": 0.1675, "step": 5978 }, { "epoch": 1.2100789313904068, "grad_norm": 0.26958876848220825, "learning_rate": 6.771616005085739e-05, "loss": 0.1882, "step": 5979 }, { "epoch": 1.210281319570937, "grad_norm": 0.34765639901161194, "learning_rate": 6.768605767254048e-05, "loss": 0.2025, "step": 5980 }, { "epoch": 1.2104837077514674, "grad_norm": 0.2938627302646637, "learning_rate": 6.765595856340914e-05, "loss": 0.172, "step": 5981 }, { "epoch": 1.2106860959319976, "grad_norm": 0.3988211452960968, "learning_rate": 6.762586272650854e-05, "loss": 0.2233, "step": 5982 }, { "epoch": 1.2108884841125278, "grad_norm": 0.30394846200942993, "learning_rate": 6.759577016488343e-05, "loss": 0.2043, "step": 5983 }, { "epoch": 1.211090872293058, "grad_norm": 0.26461684703826904, "learning_rate": 
6.756568088157829e-05, "loss": 0.2121, "step": 5984 }, { "epoch": 1.2112932604735884, "grad_norm": 0.3332306146621704, "learning_rate": 6.753559487963723e-05, "loss": 0.1988, "step": 5985 }, { "epoch": 1.2114956486541186, "grad_norm": 0.2860182523727417, "learning_rate": 6.750551216210404e-05, "loss": 0.2115, "step": 5986 }, { "epoch": 1.2116980368346488, "grad_norm": 0.30926713347435, "learning_rate": 6.747543273202216e-05, "loss": 0.1919, "step": 5987 }, { "epoch": 1.211900425015179, "grad_norm": 0.30893874168395996, "learning_rate": 6.744535659243473e-05, "loss": 0.1935, "step": 5988 }, { "epoch": 1.2121028131957094, "grad_norm": 0.2799331843852997, "learning_rate": 6.741528374638453e-05, "loss": 0.1952, "step": 5989 }, { "epoch": 1.2123052013762396, "grad_norm": 0.2534390687942505, "learning_rate": 6.7385214196914e-05, "loss": 0.1648, "step": 5990 }, { "epoch": 1.2125075895567698, "grad_norm": 0.2808387279510498, "learning_rate": 6.735514794706528e-05, "loss": 0.1927, "step": 5991 }, { "epoch": 1.2127099777373, "grad_norm": 0.29062768816947937, "learning_rate": 6.732508499988015e-05, "loss": 0.2046, "step": 5992 }, { "epoch": 1.2129123659178305, "grad_norm": 0.2915301024913788, "learning_rate": 6.729502535840007e-05, "loss": 0.1989, "step": 5993 }, { "epoch": 1.2131147540983607, "grad_norm": 0.27135908603668213, "learning_rate": 6.726496902566616e-05, "loss": 0.1998, "step": 5994 }, { "epoch": 1.2133171422788909, "grad_norm": 0.28507882356643677, "learning_rate": 6.723491600471919e-05, "loss": 0.1819, "step": 5995 }, { "epoch": 1.213519530459421, "grad_norm": 0.2728084623813629, "learning_rate": 6.720486629859963e-05, "loss": 0.2034, "step": 5996 }, { "epoch": 1.2137219186399515, "grad_norm": 0.2556982934474945, "learning_rate": 6.717481991034757e-05, "loss": 0.196, "step": 5997 }, { "epoch": 1.2139243068204817, "grad_norm": 0.2836724519729614, "learning_rate": 6.71447768430028e-05, "loss": 0.1905, "step": 5998 }, { "epoch": 1.214126695001012, "grad_norm": 
0.3173321783542633, "learning_rate": 6.711473709960478e-05, "loss": 0.2118, "step": 5999 }, { "epoch": 1.2143290831815423, "grad_norm": 0.28240132331848145, "learning_rate": 6.708470068319258e-05, "loss": 0.1941, "step": 6000 }, { "epoch": 1.2143290831815423, "eval_loss": 0.2702457010746002, "eval_runtime": 0.7379, "eval_samples_per_second": 6.776, "eval_steps_per_second": 1.355, "step": 6000 }, { "epoch": 1.2145314713620725, "grad_norm": 0.25992351770401, "learning_rate": 6.7054667596805e-05, "loss": 0.1922, "step": 6001 }, { "epoch": 1.2147338595426027, "grad_norm": 0.2810695469379425, "learning_rate": 6.702463784348045e-05, "loss": 0.1942, "step": 6002 }, { "epoch": 1.214936247723133, "grad_norm": 0.3523179292678833, "learning_rate": 6.69946114262571e-05, "loss": 0.2278, "step": 6003 }, { "epoch": 1.2151386359036631, "grad_norm": 0.2615192234516144, "learning_rate": 6.696458834817258e-05, "loss": 0.1566, "step": 6004 }, { "epoch": 1.2153410240841935, "grad_norm": 0.27877670526504517, "learning_rate": 6.693456861226438e-05, "loss": 0.2057, "step": 6005 }, { "epoch": 1.2155434122647237, "grad_norm": 0.3264453113079071, "learning_rate": 6.690455222156959e-05, "loss": 0.1961, "step": 6006 }, { "epoch": 1.215745800445254, "grad_norm": 0.27684032917022705, "learning_rate": 6.687453917912492e-05, "loss": 0.1956, "step": 6007 }, { "epoch": 1.2159481886257844, "grad_norm": 0.27163904905319214, "learning_rate": 6.68445294879668e-05, "loss": 0.1715, "step": 6008 }, { "epoch": 1.2161505768063146, "grad_norm": 0.2859479784965515, "learning_rate": 6.68145231511313e-05, "loss": 0.1967, "step": 6009 }, { "epoch": 1.2163529649868448, "grad_norm": 0.3354541063308716, "learning_rate": 6.678452017165413e-05, "loss": 0.2147, "step": 6010 }, { "epoch": 1.216555353167375, "grad_norm": 0.3036941587924957, "learning_rate": 6.675452055257067e-05, "loss": 0.2056, "step": 6011 }, { "epoch": 1.2167577413479054, "grad_norm": 0.2504706084728241, "learning_rate": 6.6724524296916e-05, "loss": 
0.1774, "step": 6012 }, { "epoch": 1.2169601295284356, "grad_norm": 0.2577364444732666, "learning_rate": 6.669453140772477e-05, "loss": 0.1485, "step": 6013 }, { "epoch": 1.2171625177089658, "grad_norm": 0.28054022789001465, "learning_rate": 6.666454188803142e-05, "loss": 0.1875, "step": 6014 }, { "epoch": 1.217364905889496, "grad_norm": 0.28038644790649414, "learning_rate": 6.663455574086992e-05, "loss": 0.2117, "step": 6015 }, { "epoch": 1.2175672940700264, "grad_norm": 0.28910353779792786, "learning_rate": 6.660457296927398e-05, "loss": 0.1783, "step": 6016 }, { "epoch": 1.2177696822505566, "grad_norm": 0.2723628580570221, "learning_rate": 6.657459357627693e-05, "loss": 0.2085, "step": 6017 }, { "epoch": 1.2179720704310868, "grad_norm": 0.2672482430934906, "learning_rate": 6.654461756491177e-05, "loss": 0.171, "step": 6018 }, { "epoch": 1.218174458611617, "grad_norm": 0.25246506929397583, "learning_rate": 6.651464493821116e-05, "loss": 0.1909, "step": 6019 }, { "epoch": 1.2183768467921474, "grad_norm": 0.23635558784008026, "learning_rate": 6.648467569920742e-05, "loss": 0.1799, "step": 6020 }, { "epoch": 1.2185792349726776, "grad_norm": 0.2640747129917145, "learning_rate": 6.645470985093253e-05, "loss": 0.1684, "step": 6021 }, { "epoch": 1.2187816231532078, "grad_norm": 0.26772499084472656, "learning_rate": 6.642474739641811e-05, "loss": 0.1854, "step": 6022 }, { "epoch": 1.218984011333738, "grad_norm": 0.2757259011268616, "learning_rate": 6.639478833869543e-05, "loss": 0.2053, "step": 6023 }, { "epoch": 1.2191863995142684, "grad_norm": 0.25957462191581726, "learning_rate": 6.636483268079545e-05, "loss": 0.1799, "step": 6024 }, { "epoch": 1.2193887876947986, "grad_norm": 0.34773412346839905, "learning_rate": 6.633488042574882e-05, "loss": 0.2232, "step": 6025 }, { "epoch": 1.2195911758753288, "grad_norm": 0.24219773709774017, "learning_rate": 6.630493157658571e-05, "loss": 0.1838, "step": 6026 }, { "epoch": 1.219793564055859, "grad_norm": 0.34960097074508667, 
"learning_rate": 6.627498613633606e-05, "loss": 0.214, "step": 6027 }, { "epoch": 1.2199959522363895, "grad_norm": 0.3201826214790344, "learning_rate": 6.624504410802944e-05, "loss": 0.2323, "step": 6028 }, { "epoch": 1.2201983404169197, "grad_norm": 0.28623464703559875, "learning_rate": 6.621510549469507e-05, "loss": 0.1956, "step": 6029 }, { "epoch": 1.2204007285974499, "grad_norm": 0.25482234358787537, "learning_rate": 6.618517029936182e-05, "loss": 0.1587, "step": 6030 }, { "epoch": 1.2206031167779803, "grad_norm": 0.29422393441200256, "learning_rate": 6.615523852505825e-05, "loss": 0.1794, "step": 6031 }, { "epoch": 1.2208055049585105, "grad_norm": 0.27444425225257874, "learning_rate": 6.612531017481248e-05, "loss": 0.2016, "step": 6032 }, { "epoch": 1.2210078931390407, "grad_norm": 0.3157899081707001, "learning_rate": 6.60953852516524e-05, "loss": 0.2259, "step": 6033 }, { "epoch": 1.2212102813195709, "grad_norm": 0.3079804480075836, "learning_rate": 6.606546375860548e-05, "loss": 0.2183, "step": 6034 }, { "epoch": 1.221412669500101, "grad_norm": 0.3177958130836487, "learning_rate": 6.603554569869888e-05, "loss": 0.1983, "step": 6035 }, { "epoch": 1.2216150576806315, "grad_norm": 0.28156325221061707, "learning_rate": 6.600563107495937e-05, "loss": 0.205, "step": 6036 }, { "epoch": 1.2218174458611617, "grad_norm": 0.3072798252105713, "learning_rate": 6.59757198904134e-05, "loss": 0.1835, "step": 6037 }, { "epoch": 1.222019834041692, "grad_norm": 0.30911877751350403, "learning_rate": 6.594581214808708e-05, "loss": 0.1635, "step": 6038 }, { "epoch": 1.2222222222222223, "grad_norm": 0.2736305296421051, "learning_rate": 6.591590785100617e-05, "loss": 0.1832, "step": 6039 }, { "epoch": 1.2224246104027525, "grad_norm": 0.27624863386154175, "learning_rate": 6.588600700219608e-05, "loss": 0.2023, "step": 6040 }, { "epoch": 1.2226269985832827, "grad_norm": 0.2884620130062103, "learning_rate": 6.585610960468182e-05, "loss": 0.1468, "step": 6041 }, { "epoch": 
1.222829386763813, "grad_norm": 0.2596905529499054, "learning_rate": 6.58262156614881e-05, "loss": 0.1543, "step": 6042 }, { "epoch": 1.2230317749443433, "grad_norm": 0.26053115725517273, "learning_rate": 6.579632517563934e-05, "loss": 0.1909, "step": 6043 }, { "epoch": 1.2232341631248735, "grad_norm": 0.2973617911338806, "learning_rate": 6.576643815015949e-05, "loss": 0.1814, "step": 6044 }, { "epoch": 1.2234365513054037, "grad_norm": 0.2856094539165497, "learning_rate": 6.573655458807222e-05, "loss": 0.2318, "step": 6045 }, { "epoch": 1.223638939485934, "grad_norm": 0.29081985354423523, "learning_rate": 6.570667449240083e-05, "loss": 0.2022, "step": 6046 }, { "epoch": 1.2238413276664644, "grad_norm": 0.36599013209342957, "learning_rate": 6.567679786616834e-05, "loss": 0.2136, "step": 6047 }, { "epoch": 1.2240437158469946, "grad_norm": 0.36280548572540283, "learning_rate": 6.564692471239723e-05, "loss": 0.1944, "step": 6048 }, { "epoch": 1.2242461040275248, "grad_norm": 0.30856937170028687, "learning_rate": 6.561705503410982e-05, "loss": 0.2411, "step": 6049 }, { "epoch": 1.224448492208055, "grad_norm": 0.29295429587364197, "learning_rate": 6.558718883432802e-05, "loss": 0.218, "step": 6050 }, { "epoch": 1.224448492208055, "eval_loss": 0.26702526211738586, "eval_runtime": 0.7381, "eval_samples_per_second": 6.774, "eval_steps_per_second": 1.355, "step": 6050 }, { "epoch": 1.2246508803885854, "grad_norm": 0.2865926921367645, "learning_rate": 6.555732611607335e-05, "loss": 0.1894, "step": 6051 }, { "epoch": 1.2248532685691156, "grad_norm": 0.2995022237300873, "learning_rate": 6.552746688236702e-05, "loss": 0.1862, "step": 6052 }, { "epoch": 1.2250556567496458, "grad_norm": 0.27584776282310486, "learning_rate": 6.549761113622988e-05, "loss": 0.1876, "step": 6053 }, { "epoch": 1.225258044930176, "grad_norm": 0.35745367407798767, "learning_rate": 6.54677588806824e-05, "loss": 0.2175, "step": 6054 }, { "epoch": 1.2254604331107064, "grad_norm": 0.24347814917564392, 
"learning_rate": 6.543791011874476e-05, "loss": 0.1802, "step": 6055 }, { "epoch": 1.2256628212912366, "grad_norm": 0.35814881324768066, "learning_rate": 6.54080648534367e-05, "loss": 0.1824, "step": 6056 }, { "epoch": 1.2258652094717668, "grad_norm": 0.23016677796840668, "learning_rate": 6.537822308777769e-05, "loss": 0.1863, "step": 6057 }, { "epoch": 1.226067597652297, "grad_norm": 0.3142828047275543, "learning_rate": 6.534838482478675e-05, "loss": 0.2092, "step": 6058 }, { "epoch": 1.2262699858328274, "grad_norm": 0.26234740018844604, "learning_rate": 6.531855006748267e-05, "loss": 0.179, "step": 6059 }, { "epoch": 1.2264723740133576, "grad_norm": 0.31807130575180054, "learning_rate": 6.528871881888376e-05, "loss": 0.223, "step": 6060 }, { "epoch": 1.2266747621938878, "grad_norm": 0.3054802417755127, "learning_rate": 6.525889108200808e-05, "loss": 0.2306, "step": 6061 }, { "epoch": 1.2268771503744182, "grad_norm": 0.2600635886192322, "learning_rate": 6.522906685987326e-05, "loss": 0.1949, "step": 6062 }, { "epoch": 1.2270795385549484, "grad_norm": 0.27494141459465027, "learning_rate": 6.51992461554966e-05, "loss": 0.1904, "step": 6063 }, { "epoch": 1.2272819267354786, "grad_norm": 0.2199246734380722, "learning_rate": 6.516942897189506e-05, "loss": 0.1731, "step": 6064 }, { "epoch": 1.2274843149160088, "grad_norm": 0.27572178840637207, "learning_rate": 6.513961531208523e-05, "loss": 0.1338, "step": 6065 }, { "epoch": 1.227686703096539, "grad_norm": 0.2560636103153229, "learning_rate": 6.510980517908334e-05, "loss": 0.1797, "step": 6066 }, { "epoch": 1.2278890912770695, "grad_norm": 0.27987346053123474, "learning_rate": 6.507999857590525e-05, "loss": 0.1868, "step": 6067 }, { "epoch": 1.2280914794575997, "grad_norm": 0.2953600585460663, "learning_rate": 6.50501955055665e-05, "loss": 0.2177, "step": 6068 }, { "epoch": 1.2282938676381299, "grad_norm": 0.29752638936042786, "learning_rate": 6.502039597108226e-05, "loss": 0.2291, "step": 6069 }, { "epoch": 
1.2284962558186603, "grad_norm": 0.3324909806251526, "learning_rate": 6.49905999754673e-05, "loss": 0.205, "step": 6070 }, { "epoch": 1.2286986439991905, "grad_norm": 0.2881925404071808, "learning_rate": 6.496080752173607e-05, "loss": 0.203, "step": 6071 }, { "epoch": 1.2289010321797207, "grad_norm": 0.44888433814048767, "learning_rate": 6.49310186129027e-05, "loss": 0.1765, "step": 6072 }, { "epoch": 1.2291034203602509, "grad_norm": 0.283622682094574, "learning_rate": 6.490123325198089e-05, "loss": 0.2107, "step": 6073 }, { "epoch": 1.2293058085407813, "grad_norm": 0.25115031003952026, "learning_rate": 6.4871451441984e-05, "loss": 0.1669, "step": 6074 }, { "epoch": 1.2295081967213115, "grad_norm": 0.29004356265068054, "learning_rate": 6.484167318592505e-05, "loss": 0.176, "step": 6075 }, { "epoch": 1.2297105849018417, "grad_norm": 0.2971007227897644, "learning_rate": 6.48118984868167e-05, "loss": 0.2017, "step": 6076 }, { "epoch": 1.229912973082372, "grad_norm": 0.30012375116348267, "learning_rate": 6.478212734767124e-05, "loss": 0.1948, "step": 6077 }, { "epoch": 1.2301153612629023, "grad_norm": 0.2942335307598114, "learning_rate": 6.47523597715006e-05, "loss": 0.2029, "step": 6078 }, { "epoch": 1.2303177494434325, "grad_norm": 0.2728146016597748, "learning_rate": 6.472259576131635e-05, "loss": 0.1736, "step": 6079 }, { "epoch": 1.2305201376239627, "grad_norm": 0.29079195857048035, "learning_rate": 6.469283532012969e-05, "loss": 0.1831, "step": 6080 }, { "epoch": 1.230722525804493, "grad_norm": 0.25617703795433044, "learning_rate": 6.466307845095148e-05, "loss": 0.1744, "step": 6081 }, { "epoch": 1.2309249139850234, "grad_norm": 0.26472562551498413, "learning_rate": 6.463332515679221e-05, "loss": 0.1943, "step": 6082 }, { "epoch": 1.2311273021655535, "grad_norm": 0.2645573616027832, "learning_rate": 6.4603575440662e-05, "loss": 0.1982, "step": 6083 }, { "epoch": 1.2313296903460837, "grad_norm": 0.3020835220813751, "learning_rate": 6.457382930557062e-05, "loss": 
0.181, "step": 6084 }, { "epoch": 1.231532078526614, "grad_norm": 0.261520653963089, "learning_rate": 6.454408675452747e-05, "loss": 0.1818, "step": 6085 }, { "epoch": 1.2317344667071444, "grad_norm": 0.2937348186969757, "learning_rate": 6.451434779054158e-05, "loss": 0.2025, "step": 6086 }, { "epoch": 1.2319368548876746, "grad_norm": 0.2838839292526245, "learning_rate": 6.448461241662163e-05, "loss": 0.2023, "step": 6087 }, { "epoch": 1.2321392430682048, "grad_norm": 0.3361426293849945, "learning_rate": 6.445488063577595e-05, "loss": 0.2433, "step": 6088 }, { "epoch": 1.232341631248735, "grad_norm": 0.2763873338699341, "learning_rate": 6.442515245101247e-05, "loss": 0.1605, "step": 6089 }, { "epoch": 1.2325440194292654, "grad_norm": 0.2895985543727875, "learning_rate": 6.439542786533879e-05, "loss": 0.1764, "step": 6090 }, { "epoch": 1.2327464076097956, "grad_norm": 0.3270489573478699, "learning_rate": 6.436570688176211e-05, "loss": 0.2119, "step": 6091 }, { "epoch": 1.2329487957903258, "grad_norm": 0.2826992869377136, "learning_rate": 6.433598950328934e-05, "loss": 0.2041, "step": 6092 }, { "epoch": 1.2331511839708562, "grad_norm": 0.24582330882549286, "learning_rate": 6.430627573292689e-05, "loss": 0.1679, "step": 6093 }, { "epoch": 1.2333535721513864, "grad_norm": 0.2923988401889801, "learning_rate": 6.427656557368095e-05, "loss": 0.1922, "step": 6094 }, { "epoch": 1.2335559603319166, "grad_norm": 0.3192385733127594, "learning_rate": 6.424685902855725e-05, "loss": 0.1708, "step": 6095 }, { "epoch": 1.2337583485124468, "grad_norm": 0.27581390738487244, "learning_rate": 6.421715610056121e-05, "loss": 0.175, "step": 6096 }, { "epoch": 1.233960736692977, "grad_norm": 0.3236446976661682, "learning_rate": 6.418745679269785e-05, "loss": 0.1985, "step": 6097 }, { "epoch": 1.2341631248735074, "grad_norm": 0.29190582036972046, "learning_rate": 6.415776110797184e-05, "loss": 0.1825, "step": 6098 }, { "epoch": 1.2343655130540376, "grad_norm": 0.31546276807785034, 
"learning_rate": 6.412806904938746e-05, "loss": 0.1647, "step": 6099 }, { "epoch": 1.2345679012345678, "grad_norm": 0.24743930995464325, "learning_rate": 6.409838061994867e-05, "loss": 0.1971, "step": 6100 }, { "epoch": 1.2345679012345678, "eval_loss": 0.26864010095596313, "eval_runtime": 0.7356, "eval_samples_per_second": 6.797, "eval_steps_per_second": 1.359, "step": 6100 }, { "epoch": 1.2347702894150983, "grad_norm": 0.2839614450931549, "learning_rate": 6.4068695822659e-05, "loss": 0.1969, "step": 6101 }, { "epoch": 1.2349726775956285, "grad_norm": 0.26908546686172485, "learning_rate": 6.403901466052167e-05, "loss": 0.1676, "step": 6102 }, { "epoch": 1.2351750657761587, "grad_norm": 0.32045114040374756, "learning_rate": 6.400933713653949e-05, "loss": 0.2132, "step": 6103 }, { "epoch": 1.2353774539566889, "grad_norm": 0.31629490852355957, "learning_rate": 6.397966325371496e-05, "loss": 0.1936, "step": 6104 }, { "epoch": 1.2355798421372193, "grad_norm": 0.2909006178379059, "learning_rate": 6.394999301505013e-05, "loss": 0.2212, "step": 6105 }, { "epoch": 1.2357822303177495, "grad_norm": 0.29362747073173523, "learning_rate": 6.392032642354673e-05, "loss": 0.1745, "step": 6106 }, { "epoch": 1.2359846184982797, "grad_norm": 0.24971556663513184, "learning_rate": 6.389066348220613e-05, "loss": 0.1852, "step": 6107 }, { "epoch": 1.2361870066788099, "grad_norm": 0.26508629322052, "learning_rate": 6.386100419402931e-05, "loss": 0.1669, "step": 6108 }, { "epoch": 1.2363893948593403, "grad_norm": 0.3305191695690155, "learning_rate": 6.383134856201689e-05, "loss": 0.1902, "step": 6109 }, { "epoch": 1.2365917830398705, "grad_norm": 0.27122199535369873, "learning_rate": 6.38016965891691e-05, "loss": 0.1783, "step": 6110 }, { "epoch": 1.2367941712204007, "grad_norm": 0.271418958902359, "learning_rate": 6.377204827848584e-05, "loss": 0.1913, "step": 6111 }, { "epoch": 1.236996559400931, "grad_norm": 0.3329184651374817, "learning_rate": 6.374240363296657e-05, "loss": 0.2314, 
"step": 6112 }, { "epoch": 1.2371989475814613, "grad_norm": 0.270542174577713, "learning_rate": 6.371276265561047e-05, "loss": 0.1918, "step": 6113 }, { "epoch": 1.2374013357619915, "grad_norm": 0.24374881386756897, "learning_rate": 6.368312534941632e-05, "loss": 0.1699, "step": 6114 }, { "epoch": 1.2376037239425217, "grad_norm": 0.24941346049308777, "learning_rate": 6.365349171738244e-05, "loss": 0.156, "step": 6115 }, { "epoch": 1.2378061121230521, "grad_norm": 0.2745361924171448, "learning_rate": 6.362386176250689e-05, "loss": 0.2191, "step": 6116 }, { "epoch": 1.2380085003035823, "grad_norm": 0.25560203194618225, "learning_rate": 6.359423548778733e-05, "loss": 0.1756, "step": 6117 }, { "epoch": 1.2382108884841125, "grad_norm": 0.3089434802532196, "learning_rate": 6.356461289622102e-05, "loss": 0.1757, "step": 6118 }, { "epoch": 1.2384132766646427, "grad_norm": 0.30926546454429626, "learning_rate": 6.353499399080485e-05, "loss": 0.2051, "step": 6119 }, { "epoch": 1.238615664845173, "grad_norm": 0.26371267437934875, "learning_rate": 6.350537877453537e-05, "loss": 0.1658, "step": 6120 }, { "epoch": 1.2388180530257034, "grad_norm": 0.30429723858833313, "learning_rate": 6.347576725040874e-05, "loss": 0.192, "step": 6121 }, { "epoch": 1.2390204412062336, "grad_norm": 0.2675933837890625, "learning_rate": 6.344615942142071e-05, "loss": 0.1864, "step": 6122 }, { "epoch": 1.2392228293867638, "grad_norm": 0.275258868932724, "learning_rate": 6.341655529056675e-05, "loss": 0.1937, "step": 6123 }, { "epoch": 1.2394252175672942, "grad_norm": 0.26812008023262024, "learning_rate": 6.338695486084184e-05, "loss": 0.1943, "step": 6124 }, { "epoch": 1.2396276057478244, "grad_norm": 0.29009515047073364, "learning_rate": 6.335735813524066e-05, "loss": 0.1872, "step": 6125 }, { "epoch": 1.2398299939283546, "grad_norm": 0.2780061364173889, "learning_rate": 6.33277651167575e-05, "loss": 0.1845, "step": 6126 }, { "epoch": 1.2400323821088848, "grad_norm": 0.25830531120300293, 
"learning_rate": 6.329817580838628e-05, "loss": 0.1902, "step": 6127 }, { "epoch": 1.240234770289415, "grad_norm": 0.3139815926551819, "learning_rate": 6.32685902131205e-05, "loss": 0.2267, "step": 6128 }, { "epoch": 1.2404371584699454, "grad_norm": 0.2877956032752991, "learning_rate": 6.323900833395338e-05, "loss": 0.2039, "step": 6129 }, { "epoch": 1.2406395466504756, "grad_norm": 0.2266833633184433, "learning_rate": 6.320943017387764e-05, "loss": 0.1606, "step": 6130 }, { "epoch": 1.2408419348310058, "grad_norm": 0.24923524260520935, "learning_rate": 6.317985573588572e-05, "loss": 0.1749, "step": 6131 }, { "epoch": 1.2410443230115362, "grad_norm": 0.26781517267227173, "learning_rate": 6.315028502296965e-05, "loss": 0.2144, "step": 6132 }, { "epoch": 1.2412467111920664, "grad_norm": 0.4893890619277954, "learning_rate": 6.312071803812107e-05, "loss": 0.1791, "step": 6133 }, { "epoch": 1.2414490993725966, "grad_norm": 0.28728941082954407, "learning_rate": 6.309115478433129e-05, "loss": 0.1983, "step": 6134 }, { "epoch": 1.2416514875531268, "grad_norm": 0.24946346879005432, "learning_rate": 6.306159526459118e-05, "loss": 0.1551, "step": 6135 }, { "epoch": 1.2418538757336572, "grad_norm": 0.2808363437652588, "learning_rate": 6.303203948189131e-05, "loss": 0.156, "step": 6136 }, { "epoch": 1.2420562639141874, "grad_norm": 0.251274973154068, "learning_rate": 6.300248743922172e-05, "loss": 0.189, "step": 6137 }, { "epoch": 1.2422586520947176, "grad_norm": 0.26637983322143555, "learning_rate": 6.297293913957227e-05, "loss": 0.2127, "step": 6138 }, { "epoch": 1.2424610402752478, "grad_norm": 0.2673642635345459, "learning_rate": 6.29433945859323e-05, "loss": 0.1819, "step": 6139 }, { "epoch": 1.2426634284557783, "grad_norm": 0.33593377470970154, "learning_rate": 6.291385378129085e-05, "loss": 0.1829, "step": 6140 }, { "epoch": 1.2428658166363085, "grad_norm": 0.23823416233062744, "learning_rate": 6.288431672863654e-05, "loss": 0.1773, "step": 6141 }, { "epoch": 
1.2430682048168387, "grad_norm": 0.4076298177242279, "learning_rate": 6.28547834309576e-05, "loss": 0.2245, "step": 6142 }, { "epoch": 1.2432705929973689, "grad_norm": 0.28459057211875916, "learning_rate": 6.282525389124192e-05, "loss": 0.1853, "step": 6143 }, { "epoch": 1.2434729811778993, "grad_norm": 0.27502378821372986, "learning_rate": 6.279572811247698e-05, "loss": 0.1984, "step": 6144 }, { "epoch": 1.2436753693584295, "grad_norm": 0.2636633515357971, "learning_rate": 6.276620609764988e-05, "loss": 0.1823, "step": 6145 }, { "epoch": 1.2438777575389597, "grad_norm": 0.28661084175109863, "learning_rate": 6.273668784974737e-05, "loss": 0.2045, "step": 6146 }, { "epoch": 1.24408014571949, "grad_norm": 0.3029300570487976, "learning_rate": 6.270717337175578e-05, "loss": 0.1976, "step": 6147 }, { "epoch": 1.2442825339000203, "grad_norm": 0.3371375501155853, "learning_rate": 6.267766266666107e-05, "loss": 0.19, "step": 6148 }, { "epoch": 1.2444849220805505, "grad_norm": 0.2589677572250366, "learning_rate": 6.264815573744884e-05, "loss": 0.1714, "step": 6149 }, { "epoch": 1.2446873102610807, "grad_norm": 0.36157044768333435, "learning_rate": 6.261865258710428e-05, "loss": 0.2119, "step": 6150 }, { "epoch": 1.2446873102610807, "eval_loss": 0.2650595009326935, "eval_runtime": 0.737, "eval_samples_per_second": 6.784, "eval_steps_per_second": 1.357, "step": 6150 }, { "epoch": 1.244889698441611, "grad_norm": 0.31364962458610535, "learning_rate": 6.258915321861223e-05, "loss": 0.1856, "step": 6151 }, { "epoch": 1.2450920866221413, "grad_norm": 0.2639737129211426, "learning_rate": 6.255965763495709e-05, "loss": 0.1939, "step": 6152 }, { "epoch": 1.2452944748026715, "grad_norm": 0.2909432053565979, "learning_rate": 6.253016583912295e-05, "loss": 0.1955, "step": 6153 }, { "epoch": 1.2454968629832017, "grad_norm": 0.2598069906234741, "learning_rate": 6.250067783409345e-05, "loss": 0.2062, "step": 6154 }, { "epoch": 1.2456992511637321, "grad_norm": 0.35393086075782776, 
"learning_rate": 6.24711936228519e-05, "loss": 0.194, "step": 6155 }, { "epoch": 1.2459016393442623, "grad_norm": 0.27285313606262207, "learning_rate": 6.244171320838118e-05, "loss": 0.1832, "step": 6156 }, { "epoch": 1.2461040275247925, "grad_norm": 0.30949246883392334, "learning_rate": 6.241223659366383e-05, "loss": 0.1989, "step": 6157 }, { "epoch": 1.2463064157053227, "grad_norm": 0.32277756929397583, "learning_rate": 6.238276378168202e-05, "loss": 0.2191, "step": 6158 }, { "epoch": 1.246508803885853, "grad_norm": 0.2878342568874359, "learning_rate": 6.235329477541743e-05, "loss": 0.1834, "step": 6159 }, { "epoch": 1.2467111920663834, "grad_norm": 0.2992894947528839, "learning_rate": 6.232382957785143e-05, "loss": 0.193, "step": 6160 }, { "epoch": 1.2469135802469136, "grad_norm": 0.26729676127433777, "learning_rate": 6.229436819196503e-05, "loss": 0.1742, "step": 6161 }, { "epoch": 1.2471159684274438, "grad_norm": 0.2526571452617645, "learning_rate": 6.226491062073882e-05, "loss": 0.1429, "step": 6162 }, { "epoch": 1.2473183566079742, "grad_norm": 0.2803010642528534, "learning_rate": 6.2235456867153e-05, "loss": 0.2015, "step": 6163 }, { "epoch": 1.2475207447885044, "grad_norm": 0.307064950466156, "learning_rate": 6.220600693418739e-05, "loss": 0.1896, "step": 6164 }, { "epoch": 1.2477231329690346, "grad_norm": 0.28746598958969116, "learning_rate": 6.217656082482143e-05, "loss": 0.1931, "step": 6165 }, { "epoch": 1.2479255211495648, "grad_norm": 0.31549063324928284, "learning_rate": 6.214711854203417e-05, "loss": 0.2117, "step": 6166 }, { "epoch": 1.2481279093300952, "grad_norm": 0.31217721104621887, "learning_rate": 6.211768008880427e-05, "loss": 0.2061, "step": 6167 }, { "epoch": 1.2483302975106254, "grad_norm": 0.287009060382843, "learning_rate": 6.208824546811001e-05, "loss": 0.2023, "step": 6168 }, { "epoch": 1.2485326856911556, "grad_norm": 0.32352015376091003, "learning_rate": 6.205881468292927e-05, "loss": 0.2042, "step": 6169 }, { "epoch": 
1.2487350738716858, "grad_norm": 0.3012515604496002, "learning_rate": 6.202938773623954e-05, "loss": 0.1969, "step": 6170 }, { "epoch": 1.2489374620522162, "grad_norm": 0.26024118065834045, "learning_rate": 6.199996463101795e-05, "loss": 0.1842, "step": 6171 }, { "epoch": 1.2491398502327464, "grad_norm": 0.2766067087650299, "learning_rate": 6.19705453702412e-05, "loss": 0.1998, "step": 6172 }, { "epoch": 1.2493422384132766, "grad_norm": 0.2818344235420227, "learning_rate": 6.194112995688563e-05, "loss": 0.1755, "step": 6173 }, { "epoch": 1.2495446265938068, "grad_norm": 0.2871859073638916, "learning_rate": 6.19117183939272e-05, "loss": 0.1693, "step": 6174 }, { "epoch": 1.2497470147743373, "grad_norm": 0.2709919214248657, "learning_rate": 6.188231068434143e-05, "loss": 0.198, "step": 6175 }, { "epoch": 1.2499494029548675, "grad_norm": 0.2757877707481384, "learning_rate": 6.18529068311035e-05, "loss": 0.2063, "step": 6176 }, { "epoch": 1.2501517911353976, "grad_norm": 0.33978694677352905, "learning_rate": 6.18235068371882e-05, "loss": 0.1982, "step": 6177 }, { "epoch": 1.250354179315928, "grad_norm": 0.25776079297065735, "learning_rate": 6.179411070556989e-05, "loss": 0.1908, "step": 6178 }, { "epoch": 1.2505565674964583, "grad_norm": 0.2702656388282776, "learning_rate": 6.176471843922256e-05, "loss": 0.1746, "step": 6179 }, { "epoch": 1.2507589556769885, "grad_norm": 0.3064190745353699, "learning_rate": 6.173533004111982e-05, "loss": 0.199, "step": 6180 }, { "epoch": 1.2509613438575187, "grad_norm": 0.2589879631996155, "learning_rate": 6.170594551423493e-05, "loss": 0.1843, "step": 6181 }, { "epoch": 1.2511637320380489, "grad_norm": 0.2778613269329071, "learning_rate": 6.167656486154061e-05, "loss": 0.1809, "step": 6182 }, { "epoch": 1.2513661202185793, "grad_norm": 0.27029654383659363, "learning_rate": 6.164718808600933e-05, "loss": 0.1613, "step": 6183 }, { "epoch": 1.2515685083991095, "grad_norm": 0.2589007616043091, "learning_rate": 6.161781519061314e-05, 
"loss": 0.1769, "step": 6184 }, { "epoch": 1.2517708965796397, "grad_norm": 0.3096887767314911, "learning_rate": 6.158844617832367e-05, "loss": 0.2047, "step": 6185 }, { "epoch": 1.2519732847601701, "grad_norm": 0.35860517621040344, "learning_rate": 6.155908105211216e-05, "loss": 0.1758, "step": 6186 }, { "epoch": 1.2521756729407003, "grad_norm": 0.3288900554180145, "learning_rate": 6.152971981494948e-05, "loss": 0.2195, "step": 6187 }, { "epoch": 1.2523780611212305, "grad_norm": 0.2885974049568176, "learning_rate": 6.150036246980609e-05, "loss": 0.1957, "step": 6188 }, { "epoch": 1.2525804493017607, "grad_norm": 0.27418920397758484, "learning_rate": 6.147100901965203e-05, "loss": 0.1887, "step": 6189 }, { "epoch": 1.252782837482291, "grad_norm": 0.2852184772491455, "learning_rate": 6.144165946745701e-05, "loss": 0.1942, "step": 6190 }, { "epoch": 1.2529852256628213, "grad_norm": 0.2928641140460968, "learning_rate": 6.14123138161903e-05, "loss": 0.1887, "step": 6191 }, { "epoch": 1.2531876138433515, "grad_norm": 0.3721943497657776, "learning_rate": 6.138297206882077e-05, "loss": 0.214, "step": 6192 }, { "epoch": 1.2533900020238817, "grad_norm": 0.32005420327186584, "learning_rate": 6.135363422831695e-05, "loss": 0.186, "step": 6193 }, { "epoch": 1.2535923902044122, "grad_norm": 0.34797215461730957, "learning_rate": 6.132430029764688e-05, "loss": 0.2279, "step": 6194 }, { "epoch": 1.2537947783849424, "grad_norm": 0.2843010425567627, "learning_rate": 6.129497027977829e-05, "loss": 0.1889, "step": 6195 }, { "epoch": 1.2539971665654726, "grad_norm": 0.30852001905441284, "learning_rate": 6.126564417767849e-05, "loss": 0.2335, "step": 6196 }, { "epoch": 1.2541995547460028, "grad_norm": 0.2834334075450897, "learning_rate": 6.123632199431436e-05, "loss": 0.1958, "step": 6197 }, { "epoch": 1.254401942926533, "grad_norm": 0.2763972580432892, "learning_rate": 6.120700373265245e-05, "loss": 0.1835, "step": 6198 }, { "epoch": 1.2546043311070634, "grad_norm": 
0.27830395102500916, "learning_rate": 6.117768939565883e-05, "loss": 0.1904, "step": 6199 }, { "epoch": 1.2548067192875936, "grad_norm": 0.2894650101661682, "learning_rate": 6.114837898629926e-05, "loss": 0.207, "step": 6200 }, { "epoch": 1.2548067192875936, "eval_loss": 0.2662354111671448, "eval_runtime": 0.737, "eval_samples_per_second": 6.784, "eval_steps_per_second": 1.357, "step": 6200 }, { "epoch": 1.255009107468124, "grad_norm": 0.29689645767211914, "learning_rate": 6.111907250753903e-05, "loss": 0.2057, "step": 6201 }, { "epoch": 1.2552114956486542, "grad_norm": 0.31450599431991577, "learning_rate": 6.108976996234307e-05, "loss": 0.2192, "step": 6202 }, { "epoch": 1.2554138838291844, "grad_norm": 0.293605238199234, "learning_rate": 6.106047135367594e-05, "loss": 0.1811, "step": 6203 }, { "epoch": 1.2556162720097146, "grad_norm": 0.28915974497795105, "learning_rate": 6.103117668450171e-05, "loss": 0.1885, "step": 6204 }, { "epoch": 1.2558186601902448, "grad_norm": 0.337788462638855, "learning_rate": 6.100188595778411e-05, "loss": 0.1903, "step": 6205 }, { "epoch": 1.2560210483707752, "grad_norm": 0.316983699798584, "learning_rate": 6.097259917648649e-05, "loss": 0.2047, "step": 6206 }, { "epoch": 1.2562234365513054, "grad_norm": 0.28668999671936035, "learning_rate": 6.0943316343571776e-05, "loss": 0.1867, "step": 6207 }, { "epoch": 1.2564258247318356, "grad_norm": 0.2853834927082062, "learning_rate": 6.091403746200251e-05, "loss": 0.1839, "step": 6208 }, { "epoch": 1.256628212912366, "grad_norm": 0.29834699630737305, "learning_rate": 6.088476253474078e-05, "loss": 0.196, "step": 6209 }, { "epoch": 1.2568306010928962, "grad_norm": 0.23479576408863068, "learning_rate": 6.085549156474837e-05, "loss": 0.1569, "step": 6210 }, { "epoch": 1.2570329892734264, "grad_norm": 0.25744327902793884, "learning_rate": 6.0826224554986574e-05, "loss": 0.1651, "step": 6211 }, { "epoch": 1.2572353774539566, "grad_norm": 0.2955494821071625, "learning_rate": 6.079696150841634e-05, 
"loss": 0.1895, "step": 6212 }, { "epoch": 1.2574377656344868, "grad_norm": 0.2956841289997101, "learning_rate": 6.076770242799818e-05, "loss": 0.1915, "step": 6213 }, { "epoch": 1.2576401538150173, "grad_norm": 0.2864188849925995, "learning_rate": 6.0738447316692225e-05, "loss": 0.188, "step": 6214 }, { "epoch": 1.2578425419955475, "grad_norm": 0.3533359169960022, "learning_rate": 6.0709196177458214e-05, "loss": 0.2335, "step": 6215 }, { "epoch": 1.2580449301760777, "grad_norm": 0.3068154752254486, "learning_rate": 6.067994901325546e-05, "loss": 0.1835, "step": 6216 }, { "epoch": 1.258247318356608, "grad_norm": 0.28471609950065613, "learning_rate": 6.0650705827042874e-05, "loss": 0.1961, "step": 6217 }, { "epoch": 1.2584497065371383, "grad_norm": 0.2817171514034271, "learning_rate": 6.062146662177899e-05, "loss": 0.2149, "step": 6218 }, { "epoch": 1.2586520947176685, "grad_norm": 0.2808842957019806, "learning_rate": 6.0592231400421914e-05, "loss": 0.1907, "step": 6219 }, { "epoch": 1.2588544828981987, "grad_norm": 0.29851841926574707, "learning_rate": 6.056300016592937e-05, "loss": 0.199, "step": 6220 }, { "epoch": 1.2590568710787289, "grad_norm": 0.3412727415561676, "learning_rate": 6.053377292125867e-05, "loss": 0.2199, "step": 6221 }, { "epoch": 1.2592592592592593, "grad_norm": 0.27769625186920166, "learning_rate": 6.0504549669366706e-05, "loss": 0.1949, "step": 6222 }, { "epoch": 1.2594616474397895, "grad_norm": 0.26295939087867737, "learning_rate": 6.047533041320998e-05, "loss": 0.1899, "step": 6223 }, { "epoch": 1.2596640356203197, "grad_norm": 0.30268317461013794, "learning_rate": 6.0446115155744576e-05, "loss": 0.1826, "step": 6224 }, { "epoch": 1.2598664238008501, "grad_norm": 0.34082984924316406, "learning_rate": 6.041690389992627e-05, "loss": 0.2085, "step": 6225 }, { "epoch": 1.2600688119813803, "grad_norm": 0.2618637979030609, "learning_rate": 6.0387696648710246e-05, "loss": 0.1653, "step": 6226 }, { "epoch": 1.2602712001619105, "grad_norm": 
0.27268511056900024, "learning_rate": 6.035849340505142e-05, "loss": 0.1629, "step": 6227 }, { "epoch": 1.2604735883424407, "grad_norm": 0.29634785652160645, "learning_rate": 6.0329294171904295e-05, "loss": 0.189, "step": 6228 }, { "epoch": 1.260675976522971, "grad_norm": 0.27574586868286133, "learning_rate": 6.03000989522229e-05, "loss": 0.1727, "step": 6229 }, { "epoch": 1.2608783647035013, "grad_norm": 0.3131870627403259, "learning_rate": 6.027090774896095e-05, "loss": 0.1839, "step": 6230 }, { "epoch": 1.2610807528840315, "grad_norm": 0.3620019853115082, "learning_rate": 6.024172056507167e-05, "loss": 0.1879, "step": 6231 }, { "epoch": 1.261283141064562, "grad_norm": 0.2766468822956085, "learning_rate": 6.021253740350793e-05, "loss": 0.1889, "step": 6232 }, { "epoch": 1.2614855292450922, "grad_norm": 0.2821180820465088, "learning_rate": 6.0183358267222167e-05, "loss": 0.1848, "step": 6233 }, { "epoch": 1.2616879174256224, "grad_norm": 0.4128684997558594, "learning_rate": 6.015418315916642e-05, "loss": 0.1691, "step": 6234 }, { "epoch": 1.2618903056061526, "grad_norm": 0.4056648015975952, "learning_rate": 6.012501208229233e-05, "loss": 0.2035, "step": 6235 }, { "epoch": 1.2620926937866828, "grad_norm": 0.24870654940605164, "learning_rate": 6.009584503955111e-05, "loss": 0.1644, "step": 6236 }, { "epoch": 1.2622950819672132, "grad_norm": 0.3007480800151825, "learning_rate": 6.0066682033893586e-05, "loss": 0.2186, "step": 6237 }, { "epoch": 1.2624974701477434, "grad_norm": 0.302317351102829, "learning_rate": 6.003752306827015e-05, "loss": 0.2275, "step": 6238 }, { "epoch": 1.2626998583282736, "grad_norm": 0.3214733302593231, "learning_rate": 6.0008368145630814e-05, "loss": 0.1849, "step": 6239 }, { "epoch": 1.262902246508804, "grad_norm": 0.30345696210861206, "learning_rate": 5.997921726892516e-05, "loss": 0.1862, "step": 6240 }, { "epoch": 1.2631046346893342, "grad_norm": 0.31955331563949585, "learning_rate": 5.995007044110237e-05, "loss": 0.238, "step": 6241 }, 
{ "epoch": 1.2633070228698644, "grad_norm": 0.3335302472114563, "learning_rate": 5.992092766511121e-05, "loss": 0.1993, "step": 6242 }, { "epoch": 1.2635094110503946, "grad_norm": 0.27500903606414795, "learning_rate": 5.989178894390004e-05, "loss": 0.1757, "step": 6243 }, { "epoch": 1.2637117992309248, "grad_norm": 0.2906457781791687, "learning_rate": 5.9862654280416816e-05, "loss": 0.2086, "step": 6244 }, { "epoch": 1.2639141874114552, "grad_norm": 0.298608660697937, "learning_rate": 5.9833523677609084e-05, "loss": 0.2096, "step": 6245 }, { "epoch": 1.2641165755919854, "grad_norm": 0.30972978472709656, "learning_rate": 5.9804397138423965e-05, "loss": 0.201, "step": 6246 }, { "epoch": 1.2643189637725156, "grad_norm": 0.27621757984161377, "learning_rate": 5.977527466580819e-05, "loss": 0.1929, "step": 6247 }, { "epoch": 1.264521351953046, "grad_norm": 0.37198740243911743, "learning_rate": 5.974615626270803e-05, "loss": 0.2281, "step": 6248 }, { "epoch": 1.2647237401335762, "grad_norm": 0.3002682328224182, "learning_rate": 5.9717041932069393e-05, "loss": 0.1646, "step": 6249 }, { "epoch": 1.2649261283141064, "grad_norm": 0.26520946621894836, "learning_rate": 5.96879316768378e-05, "loss": 0.1953, "step": 6250 }, { "epoch": 1.2649261283141064, "eval_loss": 0.2648610472679138, "eval_runtime": 0.7416, "eval_samples_per_second": 6.742, "eval_steps_per_second": 1.348, "step": 6250 }, { "epoch": 1.2651285164946366, "grad_norm": 0.3371294140815735, "learning_rate": 5.965882549995825e-05, "loss": 0.2109, "step": 6251 }, { "epoch": 1.2653309046751668, "grad_norm": 0.2856069803237915, "learning_rate": 5.962972340437547e-05, "loss": 0.2024, "step": 6252 }, { "epoch": 1.2655332928556973, "grad_norm": 0.3305569589138031, "learning_rate": 5.960062539303366e-05, "loss": 0.2167, "step": 6253 }, { "epoch": 1.2657356810362275, "grad_norm": 0.27187907695770264, "learning_rate": 5.957153146887666e-05, "loss": 0.1833, "step": 6254 }, { "epoch": 1.2659380692167577, "grad_norm": 
0.29358017444610596, "learning_rate": 5.954244163484792e-05, "loss": 0.1918, "step": 6255 }, { "epoch": 1.266140457397288, "grad_norm": 0.28766417503356934, "learning_rate": 5.95133558938904e-05, "loss": 0.1937, "step": 6256 }, { "epoch": 1.2663428455778183, "grad_norm": 0.2951597571372986, "learning_rate": 5.9484274248946715e-05, "loss": 0.1819, "step": 6257 }, { "epoch": 1.2665452337583485, "grad_norm": 0.29034483432769775, "learning_rate": 5.9455196702959035e-05, "loss": 0.2177, "step": 6258 }, { "epoch": 1.2667476219388787, "grad_norm": 0.2750239074230194, "learning_rate": 5.942612325886912e-05, "loss": 0.1965, "step": 6259 }, { "epoch": 1.266950010119409, "grad_norm": 0.27870023250579834, "learning_rate": 5.9397053919618317e-05, "loss": 0.1724, "step": 6260 }, { "epoch": 1.2671523982999393, "grad_norm": 0.27458661794662476, "learning_rate": 5.9367988688147556e-05, "loss": 0.1918, "step": 6261 }, { "epoch": 1.2673547864804695, "grad_norm": 0.27289730310440063, "learning_rate": 5.933892756739736e-05, "loss": 0.2136, "step": 6262 }, { "epoch": 1.267557174661, "grad_norm": 0.3496069610118866, "learning_rate": 5.930987056030781e-05, "loss": 0.202, "step": 6263 }, { "epoch": 1.2677595628415301, "grad_norm": 0.29557445645332336, "learning_rate": 5.9280817669818615e-05, "loss": 0.2219, "step": 6264 }, { "epoch": 1.2679619510220603, "grad_norm": 0.24535717070102692, "learning_rate": 5.925176889886901e-05, "loss": 0.1737, "step": 6265 }, { "epoch": 1.2681643392025905, "grad_norm": 0.2717592716217041, "learning_rate": 5.922272425039786e-05, "loss": 0.1855, "step": 6266 }, { "epoch": 1.2683667273831207, "grad_norm": 0.2716783285140991, "learning_rate": 5.919368372734361e-05, "loss": 0.1665, "step": 6267 }, { "epoch": 1.2685691155636512, "grad_norm": 0.2683229446411133, "learning_rate": 5.9164647332644266e-05, "loss": 0.1919, "step": 6268 }, { "epoch": 1.2687715037441814, "grad_norm": 0.2556900084018707, "learning_rate": 5.913561506923741e-05, "loss": 0.1909, "step": 6269 
}, { "epoch": 1.2689738919247116, "grad_norm": 0.30283623933792114, "learning_rate": 5.9106586940060275e-05, "loss": 0.2169, "step": 6270 }, { "epoch": 1.269176280105242, "grad_norm": 0.2764374911785126, "learning_rate": 5.907756294804955e-05, "loss": 0.1926, "step": 6271 }, { "epoch": 1.2693786682857722, "grad_norm": 0.29008007049560547, "learning_rate": 5.904854309614162e-05, "loss": 0.2117, "step": 6272 }, { "epoch": 1.2695810564663024, "grad_norm": 0.27948057651519775, "learning_rate": 5.901952738727239e-05, "loss": 0.1912, "step": 6273 }, { "epoch": 1.2697834446468326, "grad_norm": 0.2950563430786133, "learning_rate": 5.899051582437738e-05, "loss": 0.1747, "step": 6274 }, { "epoch": 1.2699858328273628, "grad_norm": 0.3165188729763031, "learning_rate": 5.8961508410391674e-05, "loss": 0.2518, "step": 6275 }, { "epoch": 1.2701882210078932, "grad_norm": 0.24708828330039978, "learning_rate": 5.893250514824994e-05, "loss": 0.186, "step": 6276 }, { "epoch": 1.2703906091884234, "grad_norm": 0.32647305727005005, "learning_rate": 5.8903506040886415e-05, "loss": 0.2121, "step": 6277 }, { "epoch": 1.2705929973689536, "grad_norm": 0.3253696858882904, "learning_rate": 5.887451109123492e-05, "loss": 0.2455, "step": 6278 }, { "epoch": 1.270795385549484, "grad_norm": 0.3270389437675476, "learning_rate": 5.8845520302228876e-05, "loss": 0.229, "step": 6279 }, { "epoch": 1.2709977737300142, "grad_norm": 0.2574213743209839, "learning_rate": 5.8816533676801265e-05, "loss": 0.2018, "step": 6280 }, { "epoch": 1.2712001619105444, "grad_norm": 0.27948006987571716, "learning_rate": 5.878755121788464e-05, "loss": 0.2016, "step": 6281 }, { "epoch": 1.2714025500910746, "grad_norm": 0.2876220941543579, "learning_rate": 5.8758572928411136e-05, "loss": 0.1904, "step": 6282 }, { "epoch": 1.2716049382716048, "grad_norm": 0.26333025097846985, "learning_rate": 5.872959881131248e-05, "loss": 0.19, "step": 6283 }, { "epoch": 1.2718073264521352, "grad_norm": 0.31372183561325073, "learning_rate": 
5.870062886951999e-05, "loss": 0.2082, "step": 6284 }, { "epoch": 1.2720097146326654, "grad_norm": 0.26160863041877747, "learning_rate": 5.86716631059645e-05, "loss": 0.1935, "step": 6285 }, { "epoch": 1.2722121028131956, "grad_norm": 0.29009556770324707, "learning_rate": 5.864270152357649e-05, "loss": 0.2208, "step": 6286 }, { "epoch": 1.272414490993726, "grad_norm": 0.29613327980041504, "learning_rate": 5.8613744125285996e-05, "loss": 0.179, "step": 6287 }, { "epoch": 1.2726168791742563, "grad_norm": 0.29994410276412964, "learning_rate": 5.85847909140226e-05, "loss": 0.2183, "step": 6288 }, { "epoch": 1.2728192673547865, "grad_norm": 0.28694406151771545, "learning_rate": 5.855584189271549e-05, "loss": 0.2246, "step": 6289 }, { "epoch": 1.2730216555353167, "grad_norm": 0.26828375458717346, "learning_rate": 5.852689706429344e-05, "loss": 0.2023, "step": 6290 }, { "epoch": 1.273224043715847, "grad_norm": 0.2522238492965698, "learning_rate": 5.8497956431684766e-05, "loss": 0.1794, "step": 6291 }, { "epoch": 1.2734264318963773, "grad_norm": 0.306525856256485, "learning_rate": 5.84690199978174e-05, "loss": 0.2097, "step": 6292 }, { "epoch": 1.2736288200769075, "grad_norm": 0.29265809059143066, "learning_rate": 5.84400877656188e-05, "loss": 0.206, "step": 6293 }, { "epoch": 1.273831208257438, "grad_norm": 0.298153817653656, "learning_rate": 5.841115973801603e-05, "loss": 0.2021, "step": 6294 }, { "epoch": 1.274033596437968, "grad_norm": 0.30544334650039673, "learning_rate": 5.8382235917935745e-05, "loss": 0.1921, "step": 6295 }, { "epoch": 1.2742359846184983, "grad_norm": 0.2744888365268707, "learning_rate": 5.835331630830414e-05, "loss": 0.1753, "step": 6296 }, { "epoch": 1.2744383727990285, "grad_norm": 0.2987912595272064, "learning_rate": 5.832440091204698e-05, "loss": 0.1775, "step": 6297 }, { "epoch": 1.2746407609795587, "grad_norm": 0.31008341908454895, "learning_rate": 5.829548973208965e-05, "loss": 0.171, "step": 6298 }, { "epoch": 1.2748431491600891, 
"grad_norm": 0.2730657458305359, "learning_rate": 5.826658277135706e-05, "loss": 0.21, "step": 6299 }, { "epoch": 1.2750455373406193, "grad_norm": 0.308075875043869, "learning_rate": 5.823768003277372e-05, "loss": 0.2053, "step": 6300 }, { "epoch": 1.2750455373406193, "eval_loss": 0.26606449484825134, "eval_runtime": 0.7383, "eval_samples_per_second": 6.772, "eval_steps_per_second": 1.354, "step": 6300 }, { "epoch": 1.2752479255211495, "grad_norm": 0.30432766675949097, "learning_rate": 5.820878151926371e-05, "loss": 0.229, "step": 6301 }, { "epoch": 1.27545031370168, "grad_norm": 0.3013366162776947, "learning_rate": 5.8179887233750674e-05, "loss": 0.2356, "step": 6302 }, { "epoch": 1.2756527018822101, "grad_norm": 0.3118283748626709, "learning_rate": 5.815099717915784e-05, "loss": 0.1861, "step": 6303 }, { "epoch": 1.2758550900627403, "grad_norm": 0.29927361011505127, "learning_rate": 5.812211135840799e-05, "loss": 0.1997, "step": 6304 }, { "epoch": 1.2760574782432705, "grad_norm": 0.289941668510437, "learning_rate": 5.809322977442349e-05, "loss": 0.2175, "step": 6305 }, { "epoch": 1.2762598664238007, "grad_norm": 0.2943812608718872, "learning_rate": 5.806435243012629e-05, "loss": 0.2035, "step": 6306 }, { "epoch": 1.2764622546043312, "grad_norm": 0.2723061740398407, "learning_rate": 5.803547932843787e-05, "loss": 0.2055, "step": 6307 }, { "epoch": 1.2766646427848614, "grad_norm": 0.2959640622138977, "learning_rate": 5.8006610472279336e-05, "loss": 0.2127, "step": 6308 }, { "epoch": 1.2768670309653916, "grad_norm": 0.2741999328136444, "learning_rate": 5.797774586457132e-05, "loss": 0.1693, "step": 6309 }, { "epoch": 1.277069419145922, "grad_norm": 0.3455542325973511, "learning_rate": 5.794888550823403e-05, "loss": 0.2402, "step": 6310 }, { "epoch": 1.2772718073264522, "grad_norm": 0.2634921669960022, "learning_rate": 5.7920029406187284e-05, "loss": 0.1774, "step": 6311 }, { "epoch": 1.2774741955069824, "grad_norm": 0.3226475715637207, "learning_rate": 
5.789117756135042e-05, "loss": 0.1888, "step": 6312 }, { "epoch": 1.2776765836875126, "grad_norm": 0.2682722210884094, "learning_rate": 5.786232997664236e-05, "loss": 0.2034, "step": 6313 }, { "epoch": 1.2778789718680428, "grad_norm": 0.26142412424087524, "learning_rate": 5.7833486654981606e-05, "loss": 0.2068, "step": 6314 }, { "epoch": 1.2780813600485732, "grad_norm": 0.2496640533208847, "learning_rate": 5.780464759928623e-05, "loss": 0.1919, "step": 6315 }, { "epoch": 1.2782837482291034, "grad_norm": 0.28327369689941406, "learning_rate": 5.7775812812473864e-05, "loss": 0.1949, "step": 6316 }, { "epoch": 1.2784861364096336, "grad_norm": 0.2788563668727875, "learning_rate": 5.774698229746169e-05, "loss": 0.2042, "step": 6317 }, { "epoch": 1.278688524590164, "grad_norm": 0.2682736814022064, "learning_rate": 5.77181560571665e-05, "loss": 0.19, "step": 6318 }, { "epoch": 1.2788909127706942, "grad_norm": 0.29064783453941345, "learning_rate": 5.7689334094504635e-05, "loss": 0.2215, "step": 6319 }, { "epoch": 1.2790933009512244, "grad_norm": 0.2953173518180847, "learning_rate": 5.766051641239196e-05, "loss": 0.1892, "step": 6320 }, { "epoch": 1.2792956891317546, "grad_norm": 0.2516275942325592, "learning_rate": 5.7631703013743984e-05, "loss": 0.1574, "step": 6321 }, { "epoch": 1.279498077312285, "grad_norm": 0.2933824360370636, "learning_rate": 5.7602893901475744e-05, "loss": 0.2212, "step": 6322 }, { "epoch": 1.2797004654928152, "grad_norm": 0.3145001232624054, "learning_rate": 5.757408907850181e-05, "loss": 0.2046, "step": 6323 }, { "epoch": 1.2799028536733454, "grad_norm": 0.2757995128631592, "learning_rate": 5.754528854773639e-05, "loss": 0.1678, "step": 6324 }, { "epoch": 1.2801052418538759, "grad_norm": 0.3153713047504425, "learning_rate": 5.7516492312093195e-05, "loss": 0.1897, "step": 6325 }, { "epoch": 1.280307630034406, "grad_norm": 0.29023733735084534, "learning_rate": 5.748770037448552e-05, "loss": 0.2178, "step": 6326 }, { "epoch": 1.2805100182149363, 
"grad_norm": 0.24412359297275543, "learning_rate": 5.745891273782626e-05, "loss": 0.1861, "step": 6327 }, { "epoch": 1.2807124063954665, "grad_norm": 0.28133055567741394, "learning_rate": 5.7430129405027835e-05, "loss": 0.2106, "step": 6328 }, { "epoch": 1.2809147945759967, "grad_norm": 0.3459916412830353, "learning_rate": 5.740135037900223e-05, "loss": 0.2096, "step": 6329 }, { "epoch": 1.281117182756527, "grad_norm": 0.29903319478034973, "learning_rate": 5.737257566266101e-05, "loss": 0.2135, "step": 6330 }, { "epoch": 1.2813195709370573, "grad_norm": 0.33177486062049866, "learning_rate": 5.73438052589153e-05, "loss": 0.216, "step": 6331 }, { "epoch": 1.2815219591175875, "grad_norm": 0.3481799066066742, "learning_rate": 5.731503917067578e-05, "loss": 0.2489, "step": 6332 }, { "epoch": 1.281724347298118, "grad_norm": 0.2963894009590149, "learning_rate": 5.728627740085273e-05, "loss": 0.1812, "step": 6333 }, { "epoch": 1.281926735478648, "grad_norm": 0.24134975671768188, "learning_rate": 5.725751995235592e-05, "loss": 0.1348, "step": 6334 }, { "epoch": 1.2821291236591783, "grad_norm": 0.27999383211135864, "learning_rate": 5.722876682809476e-05, "loss": 0.1701, "step": 6335 }, { "epoch": 1.2823315118397085, "grad_norm": 0.2429206371307373, "learning_rate": 5.720001803097821e-05, "loss": 0.1919, "step": 6336 }, { "epoch": 1.2825339000202387, "grad_norm": 0.27424222230911255, "learning_rate": 5.717127356391472e-05, "loss": 0.213, "step": 6337 }, { "epoch": 1.2827362882007691, "grad_norm": 0.2957404553890228, "learning_rate": 5.714253342981235e-05, "loss": 0.1927, "step": 6338 }, { "epoch": 1.2829386763812993, "grad_norm": 0.2850241959095001, "learning_rate": 5.711379763157876e-05, "loss": 0.1778, "step": 6339 }, { "epoch": 1.2831410645618295, "grad_norm": 0.30980175733566284, "learning_rate": 5.708506617212113e-05, "loss": 0.2397, "step": 6340 }, { "epoch": 1.28334345274236, "grad_norm": 0.3064621090888977, "learning_rate": 5.7056339054346194e-05, "loss": 0.1958, 
"step": 6341 }, { "epoch": 1.2835458409228901, "grad_norm": 0.2849607765674591, "learning_rate": 5.702761628116029e-05, "loss": 0.1998, "step": 6342 }, { "epoch": 1.2837482291034203, "grad_norm": 0.25129014253616333, "learning_rate": 5.6998897855469245e-05, "loss": 0.2023, "step": 6343 }, { "epoch": 1.2839506172839505, "grad_norm": 0.27141299843788147, "learning_rate": 5.697018378017851e-05, "loss": 0.2023, "step": 6344 }, { "epoch": 1.2841530054644807, "grad_norm": 0.30553534626960754, "learning_rate": 5.694147405819309e-05, "loss": 0.2055, "step": 6345 }, { "epoch": 1.2843553936450112, "grad_norm": 0.2521169185638428, "learning_rate": 5.6912768692417505e-05, "loss": 0.1964, "step": 6346 }, { "epoch": 1.2845577818255414, "grad_norm": 0.2850729823112488, "learning_rate": 5.688406768575587e-05, "loss": 0.1935, "step": 6347 }, { "epoch": 1.2847601700060716, "grad_norm": 0.27700698375701904, "learning_rate": 5.6855371041111874e-05, "loss": 0.2078, "step": 6348 }, { "epoch": 1.284962558186602, "grad_norm": 0.24651356041431427, "learning_rate": 5.682667876138871e-05, "loss": 0.169, "step": 6349 }, { "epoch": 1.2851649463671322, "grad_norm": 0.325048565864563, "learning_rate": 5.679799084948918e-05, "loss": 0.1984, "step": 6350 }, { "epoch": 1.2851649463671322, "eval_loss": 0.2654205858707428, "eval_runtime": 0.7368, "eval_samples_per_second": 6.786, "eval_steps_per_second": 1.357, "step": 6350 }, { "epoch": 1.2853673345476624, "grad_norm": 0.28931573033332825, "learning_rate": 5.676930730831562e-05, "loss": 0.205, "step": 6351 }, { "epoch": 1.2855697227281926, "grad_norm": 0.37909775972366333, "learning_rate": 5.674062814076994e-05, "loss": 0.2056, "step": 6352 }, { "epoch": 1.285772110908723, "grad_norm": 0.25672927498817444, "learning_rate": 5.671195334975358e-05, "loss": 0.1567, "step": 6353 }, { "epoch": 1.2859744990892532, "grad_norm": 0.25225627422332764, "learning_rate": 5.668328293816756e-05, "loss": 0.158, "step": 6354 }, { "epoch": 1.2861768872697834, 
"grad_norm": 0.35539114475250244, "learning_rate": 5.6654616908912473e-05, "loss": 0.2258, "step": 6355 }, { "epoch": 1.2863792754503138, "grad_norm": 0.2866462767124176, "learning_rate": 5.6625955264888405e-05, "loss": 0.2068, "step": 6356 }, { "epoch": 1.286581663630844, "grad_norm": 0.26378700137138367, "learning_rate": 5.659729800899509e-05, "loss": 0.1868, "step": 6357 }, { "epoch": 1.2867840518113742, "grad_norm": 0.3105945289134979, "learning_rate": 5.656864514413174e-05, "loss": 0.1813, "step": 6358 }, { "epoch": 1.2869864399919044, "grad_norm": 0.24603483080863953, "learning_rate": 5.6539996673197134e-05, "loss": 0.1574, "step": 6359 }, { "epoch": 1.2871888281724346, "grad_norm": 0.28006672859191895, "learning_rate": 5.6511352599089664e-05, "loss": 0.2039, "step": 6360 }, { "epoch": 1.287391216352965, "grad_norm": 0.2715175449848175, "learning_rate": 5.6482712924707203e-05, "loss": 0.2076, "step": 6361 }, { "epoch": 1.2875936045334953, "grad_norm": 0.2849876284599304, "learning_rate": 5.6454077652947236e-05, "loss": 0.2028, "step": 6362 }, { "epoch": 1.2877959927140255, "grad_norm": 0.2738122045993805, "learning_rate": 5.642544678670676e-05, "loss": 0.2017, "step": 6363 }, { "epoch": 1.2879983808945559, "grad_norm": 0.25495240092277527, "learning_rate": 5.639682032888236e-05, "loss": 0.1781, "step": 6364 }, { "epoch": 1.288200769075086, "grad_norm": 0.2999967038631439, "learning_rate": 5.6368198282370164e-05, "loss": 0.2034, "step": 6365 }, { "epoch": 1.2884031572556163, "grad_norm": 0.25779151916503906, "learning_rate": 5.633958065006584e-05, "loss": 0.193, "step": 6366 }, { "epoch": 1.2886055454361465, "grad_norm": 0.30179300904273987, "learning_rate": 5.6310967434864614e-05, "loss": 0.1953, "step": 6367 }, { "epoch": 1.2888079336166767, "grad_norm": 0.2599449157714844, "learning_rate": 5.6282358639661284e-05, "loss": 0.1783, "step": 6368 }, { "epoch": 1.289010321797207, "grad_norm": 0.2871106266975403, "learning_rate": 5.6253754267350176e-05, "loss": 
0.1946, "step": 6369 }, { "epoch": 1.2892127099777373, "grad_norm": 0.2777661383152008, "learning_rate": 5.62251543208252e-05, "loss": 0.1887, "step": 6370 }, { "epoch": 1.2894150981582675, "grad_norm": 0.2921324074268341, "learning_rate": 5.619655880297978e-05, "loss": 0.215, "step": 6371 }, { "epoch": 1.289617486338798, "grad_norm": 0.29286885261535645, "learning_rate": 5.616796771670692e-05, "loss": 0.1865, "step": 6372 }, { "epoch": 1.2898198745193281, "grad_norm": 0.3215929865837097, "learning_rate": 5.613938106489916e-05, "loss": 0.2336, "step": 6373 }, { "epoch": 1.2900222626998583, "grad_norm": 0.25224849581718445, "learning_rate": 5.611079885044859e-05, "loss": 0.1729, "step": 6374 }, { "epoch": 1.2902246508803885, "grad_norm": 0.2967127561569214, "learning_rate": 5.608222107624688e-05, "loss": 0.2173, "step": 6375 }, { "epoch": 1.2904270390609187, "grad_norm": 0.2792723774909973, "learning_rate": 5.60536477451852e-05, "loss": 0.1813, "step": 6376 }, { "epoch": 1.2906294272414491, "grad_norm": 0.3118976950645447, "learning_rate": 5.6025078860154334e-05, "loss": 0.2256, "step": 6377 }, { "epoch": 1.2908318154219793, "grad_norm": 0.3105722665786743, "learning_rate": 5.5996514424044565e-05, "loss": 0.206, "step": 6378 }, { "epoch": 1.2910342036025095, "grad_norm": 0.28627604246139526, "learning_rate": 5.596795443974574e-05, "loss": 0.2053, "step": 6379 }, { "epoch": 1.29123659178304, "grad_norm": 0.32842501997947693, "learning_rate": 5.593939891014726e-05, "loss": 0.2459, "step": 6380 }, { "epoch": 1.2914389799635702, "grad_norm": 0.3171040117740631, "learning_rate": 5.5910847838138114e-05, "loss": 0.1923, "step": 6381 }, { "epoch": 1.2916413681441004, "grad_norm": 0.29570427536964417, "learning_rate": 5.588230122660671e-05, "loss": 0.2182, "step": 6382 }, { "epoch": 1.2918437563246306, "grad_norm": 0.28231585025787354, "learning_rate": 5.585375907844117e-05, "loss": 0.1847, "step": 6383 }, { "epoch": 1.292046144505161, "grad_norm": 0.30900073051452637, 
"learning_rate": 5.582522139652906e-05, "loss": 0.195, "step": 6384 }, { "epoch": 1.2922485326856912, "grad_norm": 0.37727421522140503, "learning_rate": 5.579668818375752e-05, "loss": 0.2336, "step": 6385 }, { "epoch": 1.2924509208662214, "grad_norm": 0.29819825291633606, "learning_rate": 5.5768159443013255e-05, "loss": 0.1505, "step": 6386 }, { "epoch": 1.2926533090467518, "grad_norm": 0.2777939736843109, "learning_rate": 5.57396351771825e-05, "loss": 0.1769, "step": 6387 }, { "epoch": 1.292855697227282, "grad_norm": 0.2909295856952667, "learning_rate": 5.5711115389151036e-05, "loss": 0.1709, "step": 6388 }, { "epoch": 1.2930580854078122, "grad_norm": 0.30930832028388977, "learning_rate": 5.5682600081804193e-05, "loss": 0.2104, "step": 6389 }, { "epoch": 1.2932604735883424, "grad_norm": 0.2792358994483948, "learning_rate": 5.5654089258026866e-05, "loss": 0.2252, "step": 6390 }, { "epoch": 1.2934628617688726, "grad_norm": 0.2880384624004364, "learning_rate": 5.5625582920703464e-05, "loss": 0.1987, "step": 6391 }, { "epoch": 1.293665249949403, "grad_norm": 0.2849469482898712, "learning_rate": 5.559708107271797e-05, "loss": 0.2174, "step": 6392 }, { "epoch": 1.2938676381299332, "grad_norm": 0.28191232681274414, "learning_rate": 5.55685837169539e-05, "loss": 0.1831, "step": 6393 }, { "epoch": 1.2940700263104634, "grad_norm": 0.26724427938461304, "learning_rate": 5.554009085629431e-05, "loss": 0.2101, "step": 6394 }, { "epoch": 1.2942724144909938, "grad_norm": 0.28229594230651855, "learning_rate": 5.551160249362183e-05, "loss": 0.1904, "step": 6395 }, { "epoch": 1.294474802671524, "grad_norm": 0.2788131833076477, "learning_rate": 5.5483118631818586e-05, "loss": 0.1992, "step": 6396 }, { "epoch": 1.2946771908520542, "grad_norm": 0.271913081407547, "learning_rate": 5.545463927376628e-05, "loss": 0.1844, "step": 6397 }, { "epoch": 1.2948795790325844, "grad_norm": 0.2459937036037445, "learning_rate": 5.542616442234618e-05, "loss": 0.1487, "step": 6398 }, { "epoch": 
1.2950819672131146, "grad_norm": 0.2703413665294647, "learning_rate": 5.539769408043904e-05, "loss": 0.1766, "step": 6399 }, { "epoch": 1.295284355393645, "grad_norm": 0.3046536147594452, "learning_rate": 5.536922825092523e-05, "loss": 0.2165, "step": 6400 }, { "epoch": 1.295284355393645, "eval_loss": 0.26894286274909973, "eval_runtime": 0.7379, "eval_samples_per_second": 6.776, "eval_steps_per_second": 1.355, "step": 6400 }, { "epoch": 1.2954867435741753, "grad_norm": 0.2641198933124542, "learning_rate": 5.534076693668457e-05, "loss": 0.1458, "step": 6401 }, { "epoch": 1.2956891317547055, "grad_norm": 0.30567851662635803, "learning_rate": 5.5312310140596535e-05, "loss": 0.203, "step": 6402 }, { "epoch": 1.2958915199352359, "grad_norm": 0.2631327509880066, "learning_rate": 5.528385786554009e-05, "loss": 0.1742, "step": 6403 }, { "epoch": 1.296093908115766, "grad_norm": 0.2758510708808899, "learning_rate": 5.5255410114393656e-05, "loss": 0.1757, "step": 6404 }, { "epoch": 1.2962962962962963, "grad_norm": 0.2766825258731842, "learning_rate": 5.5226966890035325e-05, "loss": 0.1884, "step": 6405 }, { "epoch": 1.2964986844768265, "grad_norm": 0.2983852028846741, "learning_rate": 5.5198528195342704e-05, "loss": 0.192, "step": 6406 }, { "epoch": 1.2967010726573567, "grad_norm": 0.2765571177005768, "learning_rate": 5.517009403319289e-05, "loss": 0.1783, "step": 6407 }, { "epoch": 1.296903460837887, "grad_norm": 0.31526094675064087, "learning_rate": 5.514166440646256e-05, "loss": 0.1871, "step": 6408 }, { "epoch": 1.2971058490184173, "grad_norm": 0.28560030460357666, "learning_rate": 5.5113239318027945e-05, "loss": 0.2182, "step": 6409 }, { "epoch": 1.2973082371989475, "grad_norm": 0.2786945700645447, "learning_rate": 5.5084818770764746e-05, "loss": 0.1985, "step": 6410 }, { "epoch": 1.297510625379478, "grad_norm": 0.3191640377044678, "learning_rate": 5.505640276754832e-05, "loss": 0.2244, "step": 6411 }, { "epoch": 1.2977130135600081, "grad_norm": 0.26280149817466736, 
"learning_rate": 5.502799131125349e-05, "loss": 0.2237, "step": 6412 }, { "epoch": 1.2979154017405383, "grad_norm": 0.2876483201980591, "learning_rate": 5.49995844047546e-05, "loss": 0.2006, "step": 6413 }, { "epoch": 1.2981177899210685, "grad_norm": 0.26349586248397827, "learning_rate": 5.497118205092558e-05, "loss": 0.1689, "step": 6414 }, { "epoch": 1.298320178101599, "grad_norm": 0.30534908175468445, "learning_rate": 5.494278425263988e-05, "loss": 0.1732, "step": 6415 }, { "epoch": 1.2985225662821291, "grad_norm": 0.5254201292991638, "learning_rate": 5.491439101277049e-05, "loss": 0.2025, "step": 6416 }, { "epoch": 1.2987249544626593, "grad_norm": 0.31549495458602905, "learning_rate": 5.4886002334189946e-05, "loss": 0.184, "step": 6417 }, { "epoch": 1.2989273426431898, "grad_norm": 0.33482885360717773, "learning_rate": 5.485761821977029e-05, "loss": 0.2091, "step": 6418 }, { "epoch": 1.29912973082372, "grad_norm": 0.3019998371601105, "learning_rate": 5.482923867238317e-05, "loss": 0.1964, "step": 6419 }, { "epoch": 1.2993321190042502, "grad_norm": 0.2757321000099182, "learning_rate": 5.4800863694899695e-05, "loss": 0.1911, "step": 6420 }, { "epoch": 1.2995345071847804, "grad_norm": 0.299513578414917, "learning_rate": 5.477249329019057e-05, "loss": 0.1983, "step": 6421 }, { "epoch": 1.2997368953653106, "grad_norm": 0.2800210118293762, "learning_rate": 5.474412746112601e-05, "loss": 0.2082, "step": 6422 }, { "epoch": 1.299939283545841, "grad_norm": 0.27341270446777344, "learning_rate": 5.471576621057577e-05, "loss": 0.1975, "step": 6423 }, { "epoch": 1.3001416717263712, "grad_norm": 0.29459577798843384, "learning_rate": 5.468740954140913e-05, "loss": 0.2162, "step": 6424 }, { "epoch": 1.3003440599069014, "grad_norm": 0.31252217292785645, "learning_rate": 5.465905745649498e-05, "loss": 0.2091, "step": 6425 }, { "epoch": 1.3005464480874318, "grad_norm": 0.3006681501865387, "learning_rate": 5.46307099587016e-05, "loss": 0.2129, "step": 6426 }, { "epoch": 
1.300748836267962, "grad_norm": 0.26092275977134705, "learning_rate": 5.460236705089693e-05, "loss": 0.1709, "step": 6427 }, { "epoch": 1.3009512244484922, "grad_norm": 0.2787107229232788, "learning_rate": 5.457402873594841e-05, "loss": 0.2133, "step": 6428 }, { "epoch": 1.3011536126290224, "grad_norm": 0.3300659954547882, "learning_rate": 5.454569501672302e-05, "loss": 0.2338, "step": 6429 }, { "epoch": 1.3013560008095526, "grad_norm": 0.29084473848342896, "learning_rate": 5.4517365896087246e-05, "loss": 0.2119, "step": 6430 }, { "epoch": 1.301558388990083, "grad_norm": 0.3082306385040283, "learning_rate": 5.4489041376907156e-05, "loss": 0.1762, "step": 6431 }, { "epoch": 1.3017607771706132, "grad_norm": 0.2574545443058014, "learning_rate": 5.4460721462048324e-05, "loss": 0.2003, "step": 6432 }, { "epoch": 1.3019631653511434, "grad_norm": 0.31898951530456543, "learning_rate": 5.443240615437586e-05, "loss": 0.2014, "step": 6433 }, { "epoch": 1.3021655535316738, "grad_norm": 0.25916787981987, "learning_rate": 5.44040954567544e-05, "loss": 0.2065, "step": 6434 }, { "epoch": 1.302367941712204, "grad_norm": 0.25367629528045654, "learning_rate": 5.437578937204813e-05, "loss": 0.1668, "step": 6435 }, { "epoch": 1.3025703298927342, "grad_norm": 0.27210503816604614, "learning_rate": 5.4347487903120744e-05, "loss": 0.1903, "step": 6436 }, { "epoch": 1.3027727180732644, "grad_norm": 0.2796219289302826, "learning_rate": 5.4319191052835525e-05, "loss": 0.1919, "step": 6437 }, { "epoch": 1.3029751062537946, "grad_norm": 0.35584887862205505, "learning_rate": 5.429089882405523e-05, "loss": 0.1834, "step": 6438 }, { "epoch": 1.303177494434325, "grad_norm": 0.30212247371673584, "learning_rate": 5.426261121964217e-05, "loss": 0.2138, "step": 6439 }, { "epoch": 1.3033798826148553, "grad_norm": 0.2893647849559784, "learning_rate": 5.423432824245819e-05, "loss": 0.21, "step": 6440 }, { "epoch": 1.3035822707953855, "grad_norm": 0.2603617310523987, "learning_rate": 
5.4206049895364664e-05, "loss": 0.2013, "step": 6441 }, { "epoch": 1.3037846589759159, "grad_norm": 0.25256669521331787, "learning_rate": 5.417777618122249e-05, "loss": 0.1481, "step": 6442 }, { "epoch": 1.303987047156446, "grad_norm": 0.2812972068786621, "learning_rate": 5.414950710289213e-05, "loss": 0.2016, "step": 6443 }, { "epoch": 1.3041894353369763, "grad_norm": 0.2581676244735718, "learning_rate": 5.412124266323353e-05, "loss": 0.1927, "step": 6444 }, { "epoch": 1.3043918235175065, "grad_norm": 0.3383719325065613, "learning_rate": 5.40929828651062e-05, "loss": 0.2128, "step": 6445 }, { "epoch": 1.304594211698037, "grad_norm": 0.26960957050323486, "learning_rate": 5.406472771136917e-05, "loss": 0.1919, "step": 6446 }, { "epoch": 1.304796599878567, "grad_norm": 0.4218273460865021, "learning_rate": 5.403647720488099e-05, "loss": 0.1771, "step": 6447 }, { "epoch": 1.3049989880590973, "grad_norm": 0.33814752101898193, "learning_rate": 5.4008231348499794e-05, "loss": 0.1842, "step": 6448 }, { "epoch": 1.3052013762396277, "grad_norm": 0.27987149357795715, "learning_rate": 5.3979990145083124e-05, "loss": 0.2052, "step": 6449 }, { "epoch": 1.305403764420158, "grad_norm": 0.27401459217071533, "learning_rate": 5.3951753597488176e-05, "loss": 0.1909, "step": 6450 }, { "epoch": 1.305403764420158, "eval_loss": 0.2693765461444855, "eval_runtime": 0.7411, "eval_samples_per_second": 6.747, "eval_steps_per_second": 1.349, "step": 6450 }, { "epoch": 1.3056061526006881, "grad_norm": 0.3010677695274353, "learning_rate": 5.392352170857162e-05, "loss": 0.1577, "step": 6451 }, { "epoch": 1.3058085407812183, "grad_norm": 0.32166436314582825, "learning_rate": 5.389529448118966e-05, "loss": 0.2066, "step": 6452 }, { "epoch": 1.3060109289617485, "grad_norm": 0.283600389957428, "learning_rate": 5.386707191819803e-05, "loss": 0.2034, "step": 6453 }, { "epoch": 1.306213317142279, "grad_norm": 0.2762928605079651, "learning_rate": 5.3838854022452e-05, "loss": 0.1886, "step": 6454 }, { 
"epoch": 1.3064157053228092, "grad_norm": 0.2601023018360138, "learning_rate": 5.381064079680635e-05, "loss": 0.1645, "step": 6455 }, { "epoch": 1.3066180935033394, "grad_norm": 0.3133906424045563, "learning_rate": 5.378243224411541e-05, "loss": 0.2255, "step": 6456 }, { "epoch": 1.3068204816838698, "grad_norm": 0.269016295671463, "learning_rate": 5.375422836723303e-05, "loss": 0.1651, "step": 6457 }, { "epoch": 1.3070228698644, "grad_norm": 0.2578776478767395, "learning_rate": 5.3726029169012556e-05, "loss": 0.1606, "step": 6458 }, { "epoch": 1.3072252580449302, "grad_norm": 0.3729574382305145, "learning_rate": 5.369783465230691e-05, "loss": 0.2231, "step": 6459 }, { "epoch": 1.3074276462254604, "grad_norm": 0.27884671092033386, "learning_rate": 5.366964481996852e-05, "loss": 0.1803, "step": 6460 }, { "epoch": 1.3076300344059906, "grad_norm": 0.3325755000114441, "learning_rate": 5.3641459674849315e-05, "loss": 0.2229, "step": 6461 }, { "epoch": 1.307832422586521, "grad_norm": 0.27439218759536743, "learning_rate": 5.3613279219800794e-05, "loss": 0.1734, "step": 6462 }, { "epoch": 1.3080348107670512, "grad_norm": 0.31271788477897644, "learning_rate": 5.358510345767395e-05, "loss": 0.2196, "step": 6463 }, { "epoch": 1.3082371989475814, "grad_norm": 0.2967276871204376, "learning_rate": 5.3556932391319304e-05, "loss": 0.2027, "step": 6464 }, { "epoch": 1.3084395871281118, "grad_norm": 0.24664251506328583, "learning_rate": 5.3528766023586915e-05, "loss": 0.1916, "step": 6465 }, { "epoch": 1.308641975308642, "grad_norm": 0.269447386264801, "learning_rate": 5.3500604357326376e-05, "loss": 0.1685, "step": 6466 }, { "epoch": 1.3088443634891722, "grad_norm": 0.23026353120803833, "learning_rate": 5.347244739538677e-05, "loss": 0.1641, "step": 6467 }, { "epoch": 1.3090467516697024, "grad_norm": 0.27245429158210754, "learning_rate": 5.3444295140616684e-05, "loss": 0.1865, "step": 6468 }, { "epoch": 1.3092491398502326, "grad_norm": 0.2808758616447449, "learning_rate": 
5.341614759586436e-05, "loss": 0.1984, "step": 6469 }, { "epoch": 1.309451528030763, "grad_norm": 0.2517721652984619, "learning_rate": 5.338800476397746e-05, "loss": 0.1535, "step": 6470 }, { "epoch": 1.3096539162112932, "grad_norm": 0.28509992361068726, "learning_rate": 5.3359866647803104e-05, "loss": 0.1886, "step": 6471 }, { "epoch": 1.3098563043918237, "grad_norm": 0.26791247725486755, "learning_rate": 5.3331733250188054e-05, "loss": 0.1953, "step": 6472 }, { "epoch": 1.3100586925723539, "grad_norm": 0.26313990354537964, "learning_rate": 5.330360457397854e-05, "loss": 0.1874, "step": 6473 }, { "epoch": 1.310261080752884, "grad_norm": 0.36424994468688965, "learning_rate": 5.3275480622020346e-05, "loss": 0.2314, "step": 6474 }, { "epoch": 1.3104634689334143, "grad_norm": 0.30342644453048706, "learning_rate": 5.324736139715875e-05, "loss": 0.2049, "step": 6475 }, { "epoch": 1.3106658571139445, "grad_norm": 0.281800240278244, "learning_rate": 5.321924690223854e-05, "loss": 0.1761, "step": 6476 }, { "epoch": 1.3108682452944749, "grad_norm": 0.2727698087692261, "learning_rate": 5.319113714010409e-05, "loss": 0.1895, "step": 6477 }, { "epoch": 1.311070633475005, "grad_norm": 0.3317430019378662, "learning_rate": 5.31630321135992e-05, "loss": 0.1835, "step": 6478 }, { "epoch": 1.3112730216555353, "grad_norm": 0.2980647385120392, "learning_rate": 5.313493182556728e-05, "loss": 0.2068, "step": 6479 }, { "epoch": 1.3114754098360657, "grad_norm": 0.3750055432319641, "learning_rate": 5.3106836278851205e-05, "loss": 0.1813, "step": 6480 }, { "epoch": 1.311677798016596, "grad_norm": 0.2763236165046692, "learning_rate": 5.307874547629339e-05, "loss": 0.1592, "step": 6481 }, { "epoch": 1.311880186197126, "grad_norm": 0.27184009552001953, "learning_rate": 5.305065942073576e-05, "loss": 0.1825, "step": 6482 }, { "epoch": 1.3120825743776563, "grad_norm": 0.324699342250824, "learning_rate": 5.3022578115019786e-05, "loss": 0.1988, "step": 6483 }, { "epoch": 1.3122849625581865, 
"grad_norm": 0.30634692311286926, "learning_rate": 5.299450156198642e-05, "loss": 0.1663, "step": 6484 }, { "epoch": 1.312487350738717, "grad_norm": 0.290509968996048, "learning_rate": 5.296642976447618e-05, "loss": 0.1816, "step": 6485 }, { "epoch": 1.3126897389192471, "grad_norm": 0.3158789277076721, "learning_rate": 5.293836272532905e-05, "loss": 0.2141, "step": 6486 }, { "epoch": 1.3128921270997773, "grad_norm": 0.29174625873565674, "learning_rate": 5.291030044738456e-05, "loss": 0.187, "step": 6487 }, { "epoch": 1.3130945152803077, "grad_norm": 0.2723025977611542, "learning_rate": 5.2882242933481775e-05, "loss": 0.2015, "step": 6488 }, { "epoch": 1.313296903460838, "grad_norm": 0.5538578033447266, "learning_rate": 5.285419018645925e-05, "loss": 0.2135, "step": 6489 }, { "epoch": 1.3134992916413681, "grad_norm": 0.4139079749584198, "learning_rate": 5.282614220915505e-05, "loss": 0.2084, "step": 6490 }, { "epoch": 1.3137016798218983, "grad_norm": 0.3343629539012909, "learning_rate": 5.279809900440681e-05, "loss": 0.194, "step": 6491 }, { "epoch": 1.3139040680024285, "grad_norm": 0.2938506007194519, "learning_rate": 5.277006057505166e-05, "loss": 0.246, "step": 6492 }, { "epoch": 1.314106456182959, "grad_norm": 0.3289811313152313, "learning_rate": 5.274202692392616e-05, "loss": 0.2207, "step": 6493 }, { "epoch": 1.3143088443634892, "grad_norm": 0.26164838671684265, "learning_rate": 5.271399805386652e-05, "loss": 0.1842, "step": 6494 }, { "epoch": 1.3145112325440194, "grad_norm": 0.29249104857444763, "learning_rate": 5.268597396770838e-05, "loss": 0.1949, "step": 6495 }, { "epoch": 1.3147136207245498, "grad_norm": 0.2775794267654419, "learning_rate": 5.265795466828692e-05, "loss": 0.1739, "step": 6496 }, { "epoch": 1.31491600890508, "grad_norm": 0.382659375667572, "learning_rate": 5.2629940158436874e-05, "loss": 0.2117, "step": 6497 }, { "epoch": 1.3151183970856102, "grad_norm": 0.28004685044288635, "learning_rate": 5.260193044099242e-05, "loss": 0.2075, "step": 
6498 }, { "epoch": 1.3153207852661404, "grad_norm": 0.319001168012619, "learning_rate": 5.25739255187873e-05, "loss": 0.1853, "step": 6499 }, { "epoch": 1.3155231734466706, "grad_norm": 0.2945215404033661, "learning_rate": 5.254592539465477e-05, "loss": 0.1866, "step": 6500 }, { "epoch": 1.3155231734466706, "eval_loss": 0.27097755670547485, "eval_runtime": 0.7381, "eval_samples_per_second": 6.774, "eval_steps_per_second": 1.355, "step": 6500 }, { "epoch": 1.315725561627201, "grad_norm": 0.23622101545333862, "learning_rate": 5.251793007142758e-05, "loss": 0.1413, "step": 6501 }, { "epoch": 1.3159279498077312, "grad_norm": 0.29905086755752563, "learning_rate": 5.248993955193799e-05, "loss": 0.2117, "step": 6502 }, { "epoch": 1.3161303379882616, "grad_norm": 0.2794634699821472, "learning_rate": 5.246195383901782e-05, "loss": 0.1778, "step": 6503 }, { "epoch": 1.3163327261687918, "grad_norm": 0.2916606366634369, "learning_rate": 5.243397293549832e-05, "loss": 0.194, "step": 6504 }, { "epoch": 1.316535114349322, "grad_norm": 0.3154829442501068, "learning_rate": 5.240599684421036e-05, "loss": 0.2022, "step": 6505 }, { "epoch": 1.3167375025298522, "grad_norm": 0.2974871098995209, "learning_rate": 5.2378025567984225e-05, "loss": 0.2238, "step": 6506 }, { "epoch": 1.3169398907103824, "grad_norm": 0.2797040045261383, "learning_rate": 5.2350059109649784e-05, "loss": 0.2043, "step": 6507 }, { "epoch": 1.3171422788909128, "grad_norm": 0.26968511939048767, "learning_rate": 5.232209747203636e-05, "loss": 0.1652, "step": 6508 }, { "epoch": 1.317344667071443, "grad_norm": 0.30984237790107727, "learning_rate": 5.229414065797284e-05, "loss": 0.2272, "step": 6509 }, { "epoch": 1.3175470552519732, "grad_norm": 0.26939573884010315, "learning_rate": 5.226618867028761e-05, "loss": 0.1759, "step": 6510 }, { "epoch": 1.3177494434325037, "grad_norm": 0.3126527667045593, "learning_rate": 5.223824151180854e-05, "loss": 0.2236, "step": 6511 }, { "epoch": 1.3179518316130339, "grad_norm": 
0.28654423356056213, "learning_rate": 5.221029918536302e-05, "loss": 0.1836, "step": 6512 }, { "epoch": 1.318154219793564, "grad_norm": 0.2838856875896454, "learning_rate": 5.218236169377799e-05, "loss": 0.1835, "step": 6513 }, { "epoch": 1.3183566079740943, "grad_norm": 0.28244468569755554, "learning_rate": 5.215442903987986e-05, "loss": 0.2124, "step": 6514 }, { "epoch": 1.3185589961546245, "grad_norm": 0.2554950416088104, "learning_rate": 5.212650122649456e-05, "loss": 0.188, "step": 6515 }, { "epoch": 1.3187613843351549, "grad_norm": 0.2603509724140167, "learning_rate": 5.209857825644753e-05, "loss": 0.197, "step": 6516 }, { "epoch": 1.318963772515685, "grad_norm": 0.24836356937885284, "learning_rate": 5.207066013256374e-05, "loss": 0.1781, "step": 6517 }, { "epoch": 1.3191661606962153, "grad_norm": 0.2643834948539734, "learning_rate": 5.204274685766764e-05, "loss": 0.181, "step": 6518 }, { "epoch": 1.3193685488767457, "grad_norm": 0.2769608199596405, "learning_rate": 5.201483843458319e-05, "loss": 0.1884, "step": 6519 }, { "epoch": 1.319570937057276, "grad_norm": 0.2706235349178314, "learning_rate": 5.198693486613389e-05, "loss": 0.1965, "step": 6520 }, { "epoch": 1.319773325237806, "grad_norm": 0.29300135374069214, "learning_rate": 5.1959036155142724e-05, "loss": 0.1768, "step": 6521 }, { "epoch": 1.3199757134183363, "grad_norm": 0.2877142131328583, "learning_rate": 5.193114230443219e-05, "loss": 0.1943, "step": 6522 }, { "epoch": 1.3201781015988665, "grad_norm": 0.26191914081573486, "learning_rate": 5.1903253316824305e-05, "loss": 0.2015, "step": 6523 }, { "epoch": 1.320380489779397, "grad_norm": 0.2909492254257202, "learning_rate": 5.187536919514058e-05, "loss": 0.1956, "step": 6524 }, { "epoch": 1.3205828779599271, "grad_norm": 0.2586766183376312, "learning_rate": 5.184748994220201e-05, "loss": 0.1553, "step": 6525 }, { "epoch": 1.3207852661404573, "grad_norm": 0.29264846444129944, "learning_rate": 5.181961556082917e-05, "loss": 0.1902, "step": 6526 }, { 
"epoch": 1.3209876543209877, "grad_norm": 0.3760776221752167, "learning_rate": 5.179174605384207e-05, "loss": 0.2315, "step": 6527 }, { "epoch": 1.321190042501518, "grad_norm": 0.2586089074611664, "learning_rate": 5.176388142406026e-05, "loss": 0.1886, "step": 6528 }, { "epoch": 1.3213924306820481, "grad_norm": 0.2983001470565796, "learning_rate": 5.17360216743028e-05, "loss": 0.2337, "step": 6529 }, { "epoch": 1.3215948188625783, "grad_norm": 0.30242887139320374, "learning_rate": 5.1708166807388235e-05, "loss": 0.2252, "step": 6530 }, { "epoch": 1.3217972070431085, "grad_norm": 0.2580106556415558, "learning_rate": 5.168031682613462e-05, "loss": 0.1768, "step": 6531 }, { "epoch": 1.321999595223639, "grad_norm": 0.31099486351013184, "learning_rate": 5.165247173335954e-05, "loss": 0.2352, "step": 6532 }, { "epoch": 1.3222019834041692, "grad_norm": 0.25483667850494385, "learning_rate": 5.162463153188009e-05, "loss": 0.1635, "step": 6533 }, { "epoch": 1.3224043715846996, "grad_norm": 0.27290982007980347, "learning_rate": 5.159679622451279e-05, "loss": 0.1768, "step": 6534 }, { "epoch": 1.3226067597652298, "grad_norm": 0.3144781291484833, "learning_rate": 5.1568965814073775e-05, "loss": 0.213, "step": 6535 }, { "epoch": 1.32280914794576, "grad_norm": 0.314395546913147, "learning_rate": 5.154114030337862e-05, "loss": 0.1714, "step": 6536 }, { "epoch": 1.3230115361262902, "grad_norm": 0.27756208181381226, "learning_rate": 5.1513319695242446e-05, "loss": 0.1874, "step": 6537 }, { "epoch": 1.3232139243068204, "grad_norm": 0.3120867609977722, "learning_rate": 5.14855039924798e-05, "loss": 0.1853, "step": 6538 }, { "epoch": 1.3234163124873508, "grad_norm": 0.28202709555625916, "learning_rate": 5.145769319790479e-05, "loss": 0.2047, "step": 6539 }, { "epoch": 1.323618700667881, "grad_norm": 0.2737361490726471, "learning_rate": 5.1429887314331025e-05, "loss": 0.2033, "step": 6540 }, { "epoch": 1.3238210888484112, "grad_norm": 0.31434905529022217, "learning_rate": 
5.140208634457163e-05, "loss": 0.1968, "step": 6541 }, { "epoch": 1.3240234770289416, "grad_norm": 0.2981642484664917, "learning_rate": 5.137429029143921e-05, "loss": 0.1957, "step": 6542 }, { "epoch": 1.3242258652094718, "grad_norm": 0.2877632677555084, "learning_rate": 5.134649915774588e-05, "loss": 0.1701, "step": 6543 }, { "epoch": 1.324428253390002, "grad_norm": 0.3690420389175415, "learning_rate": 5.131871294630324e-05, "loss": 0.2493, "step": 6544 }, { "epoch": 1.3246306415705322, "grad_norm": 0.2771603763103485, "learning_rate": 5.1290931659922406e-05, "loss": 0.1945, "step": 6545 }, { "epoch": 1.3248330297510624, "grad_norm": 0.31182003021240234, "learning_rate": 5.126315530141402e-05, "loss": 0.1746, "step": 6546 }, { "epoch": 1.3250354179315929, "grad_norm": 0.2257387489080429, "learning_rate": 5.12353838735882e-05, "loss": 0.1489, "step": 6547 }, { "epoch": 1.325237806112123, "grad_norm": 0.3137997090816498, "learning_rate": 5.1207617379254544e-05, "loss": 0.2043, "step": 6548 }, { "epoch": 1.3254401942926533, "grad_norm": 0.29156142473220825, "learning_rate": 5.11798558212222e-05, "loss": 0.1906, "step": 6549 }, { "epoch": 1.3256425824731837, "grad_norm": 0.23661822080612183, "learning_rate": 5.115209920229978e-05, "loss": 0.1642, "step": 6550 }, { "epoch": 1.3256425824731837, "eval_loss": 0.2720639705657959, "eval_runtime": 0.7408, "eval_samples_per_second": 6.749, "eval_steps_per_second": 1.35, "step": 6550 }, { "epoch": 1.3258449706537139, "grad_norm": 0.2968902587890625, "learning_rate": 5.112434752529539e-05, "loss": 0.2282, "step": 6551 }, { "epoch": 1.326047358834244, "grad_norm": 0.26929181814193726, "learning_rate": 5.109660079301668e-05, "loss": 0.1882, "step": 6552 }, { "epoch": 1.3262497470147743, "grad_norm": 0.27666157484054565, "learning_rate": 5.1068859008270765e-05, "loss": 0.1905, "step": 6553 }, { "epoch": 1.3264521351953045, "grad_norm": 0.35504576563835144, "learning_rate": 5.1041122173864275e-05, "loss": 0.2033, "step": 6554 }, { 
"epoch": 1.326654523375835, "grad_norm": 0.3042171895503998, "learning_rate": 5.1013390292603325e-05, "loss": 0.2009, "step": 6555 }, { "epoch": 1.326856911556365, "grad_norm": 0.2512917220592499, "learning_rate": 5.098566336729351e-05, "loss": 0.2048, "step": 6556 }, { "epoch": 1.3270592997368953, "grad_norm": 0.2652257978916168, "learning_rate": 5.0957941400739996e-05, "loss": 0.1826, "step": 6557 }, { "epoch": 1.3272616879174257, "grad_norm": 0.2854307293891907, "learning_rate": 5.0930224395747374e-05, "loss": 0.1861, "step": 6558 }, { "epoch": 1.327464076097956, "grad_norm": 0.32963502407073975, "learning_rate": 5.0902512355119805e-05, "loss": 0.2098, "step": 6559 }, { "epoch": 1.3276664642784861, "grad_norm": 0.2660824656486511, "learning_rate": 5.087480528166082e-05, "loss": 0.1842, "step": 6560 }, { "epoch": 1.3278688524590163, "grad_norm": 0.4016515016555786, "learning_rate": 5.084710317817358e-05, "loss": 0.2121, "step": 6561 }, { "epoch": 1.3280712406395465, "grad_norm": 0.3001742959022522, "learning_rate": 5.081940604746067e-05, "loss": 0.1998, "step": 6562 }, { "epoch": 1.328273628820077, "grad_norm": 0.2764403223991394, "learning_rate": 5.079171389232418e-05, "loss": 0.1657, "step": 6563 }, { "epoch": 1.3284760170006071, "grad_norm": 0.31415703892707825, "learning_rate": 5.0764026715565785e-05, "loss": 0.2377, "step": 6564 }, { "epoch": 1.3286784051811376, "grad_norm": 0.2876707911491394, "learning_rate": 5.073634451998653e-05, "loss": 0.184, "step": 6565 }, { "epoch": 1.3288807933616678, "grad_norm": 0.2535141110420227, "learning_rate": 5.0708667308387025e-05, "loss": 0.1669, "step": 6566 }, { "epoch": 1.329083181542198, "grad_norm": 0.2769143879413605, "learning_rate": 5.0680995083567354e-05, "loss": 0.1816, "step": 6567 }, { "epoch": 1.3292855697227282, "grad_norm": 0.288330614566803, "learning_rate": 5.0653327848327104e-05, "loss": 0.1798, "step": 6568 }, { "epoch": 1.3294879579032584, "grad_norm": 0.2596645653247833, "learning_rate": 
5.062566560546535e-05, "loss": 0.1654, "step": 6569 }, { "epoch": 1.3296903460837888, "grad_norm": 0.3217107951641083, "learning_rate": 5.059800835778066e-05, "loss": 0.1701, "step": 6570 }, { "epoch": 1.329892734264319, "grad_norm": 0.2558998465538025, "learning_rate": 5.0570356108071124e-05, "loss": 0.1725, "step": 6571 }, { "epoch": 1.3300951224448492, "grad_norm": 0.2911076843738556, "learning_rate": 5.0542708859134305e-05, "loss": 0.1956, "step": 6572 }, { "epoch": 1.3302975106253796, "grad_norm": 0.3187974989414215, "learning_rate": 5.051506661376725e-05, "loss": 0.2149, "step": 6573 }, { "epoch": 1.3304998988059098, "grad_norm": 0.3285270035266876, "learning_rate": 5.0487429374766515e-05, "loss": 0.1888, "step": 6574 }, { "epoch": 1.33070228698644, "grad_norm": 0.2761330306529999, "learning_rate": 5.045979714492814e-05, "loss": 0.2034, "step": 6575 }, { "epoch": 1.3309046751669702, "grad_norm": 0.2921268939971924, "learning_rate": 5.043216992704767e-05, "loss": 0.2066, "step": 6576 }, { "epoch": 1.3311070633475004, "grad_norm": 0.3227090537548065, "learning_rate": 5.040454772392015e-05, "loss": 0.2061, "step": 6577 }, { "epoch": 1.3313094515280308, "grad_norm": 0.2876438796520233, "learning_rate": 5.037693053834008e-05, "loss": 0.2014, "step": 6578 }, { "epoch": 1.331511839708561, "grad_norm": 0.3059580624103546, "learning_rate": 5.03493183731015e-05, "loss": 0.2583, "step": 6579 }, { "epoch": 1.3317142278890912, "grad_norm": 0.3144664466381073, "learning_rate": 5.032171123099789e-05, "loss": 0.1967, "step": 6580 }, { "epoch": 1.3319166160696216, "grad_norm": 0.29009315371513367, "learning_rate": 5.029410911482233e-05, "loss": 0.1995, "step": 6581 }, { "epoch": 1.3321190042501518, "grad_norm": 0.3386211395263672, "learning_rate": 5.0266512027367204e-05, "loss": 0.1961, "step": 6582 }, { "epoch": 1.332321392430682, "grad_norm": 0.27624696493148804, "learning_rate": 5.0238919971424536e-05, "loss": 0.1891, "step": 6583 }, { "epoch": 1.3325237806112122, 
"grad_norm": 0.2946856617927551, "learning_rate": 5.0211332949785815e-05, "loss": 0.2155, "step": 6584 }, { "epoch": 1.3327261687917424, "grad_norm": 0.2675507664680481, "learning_rate": 5.018375096524201e-05, "loss": 0.178, "step": 6585 }, { "epoch": 1.3329285569722729, "grad_norm": 0.30136606097221375, "learning_rate": 5.0156174020583546e-05, "loss": 0.1926, "step": 6586 }, { "epoch": 1.333130945152803, "grad_norm": 0.23659691214561462, "learning_rate": 5.01286021186004e-05, "loss": 0.1557, "step": 6587 }, { "epoch": 1.3333333333333333, "grad_norm": 0.2846830189228058, "learning_rate": 5.0101035262082005e-05, "loss": 0.173, "step": 6588 }, { "epoch": 1.3335357215138637, "grad_norm": 0.25915321707725525, "learning_rate": 5.007347345381727e-05, "loss": 0.1886, "step": 6589 }, { "epoch": 1.3337381096943939, "grad_norm": 0.30053645372390747, "learning_rate": 5.004591669659462e-05, "loss": 0.2086, "step": 6590 }, { "epoch": 1.333940497874924, "grad_norm": 0.30931761860847473, "learning_rate": 5.001836499320195e-05, "loss": 0.206, "step": 6591 }, { "epoch": 1.3341428860554543, "grad_norm": 0.2771763801574707, "learning_rate": 4.999081834642666e-05, "loss": 0.2223, "step": 6592 }, { "epoch": 1.3343452742359845, "grad_norm": 0.27251917123794556, "learning_rate": 4.996327675905563e-05, "loss": 0.1661, "step": 6593 }, { "epoch": 1.334547662416515, "grad_norm": 0.28505948185920715, "learning_rate": 4.9935740233875236e-05, "loss": 0.2201, "step": 6594 }, { "epoch": 1.334750050597045, "grad_norm": 0.34662339091300964, "learning_rate": 4.9908208773671315e-05, "loss": 0.2281, "step": 6595 }, { "epoch": 1.3349524387775755, "grad_norm": 0.32622262835502625, "learning_rate": 4.988068238122924e-05, "loss": 0.2139, "step": 6596 }, { "epoch": 1.3351548269581057, "grad_norm": 0.2837918996810913, "learning_rate": 4.985316105933381e-05, "loss": 0.1957, "step": 6597 }, { "epoch": 1.335357215138636, "grad_norm": 0.33972227573394775, "learning_rate": 4.9825644810769366e-05, "loss": 0.2278, 
"step": 6598 }, { "epoch": 1.3355596033191661, "grad_norm": 0.32084619998931885, "learning_rate": 4.97981336383197e-05, "loss": 0.2126, "step": 6599 }, { "epoch": 1.3357619914996963, "grad_norm": 0.27521273493766785, "learning_rate": 4.977062754476811e-05, "loss": 0.1871, "step": 6600 }, { "epoch": 1.3357619914996963, "eval_loss": 0.2676220238208771, "eval_runtime": 0.7409, "eval_samples_per_second": 6.749, "eval_steps_per_second": 1.35, "step": 6600 }, { "epoch": 1.3359643796802267, "grad_norm": 0.2413671761751175, "learning_rate": 4.974312653289739e-05, "loss": 0.172, "step": 6601 }, { "epoch": 1.336166767860757, "grad_norm": 0.3191070258617401, "learning_rate": 4.971563060548977e-05, "loss": 0.2377, "step": 6602 }, { "epoch": 1.3363691560412871, "grad_norm": 0.32372811436653137, "learning_rate": 4.968813976532707e-05, "loss": 0.1926, "step": 6603 }, { "epoch": 1.3365715442218176, "grad_norm": 0.3132017254829407, "learning_rate": 4.966065401519042e-05, "loss": 0.1901, "step": 6604 }, { "epoch": 1.3367739324023478, "grad_norm": 0.28450313210487366, "learning_rate": 4.9633173357860596e-05, "loss": 0.1886, "step": 6605 }, { "epoch": 1.336976320582878, "grad_norm": 0.27775466442108154, "learning_rate": 4.96056977961178e-05, "loss": 0.1568, "step": 6606 }, { "epoch": 1.3371787087634082, "grad_norm": 0.25570055842399597, "learning_rate": 4.957822733274172e-05, "loss": 0.1501, "step": 6607 }, { "epoch": 1.3373810969439384, "grad_norm": 0.26684972643852234, "learning_rate": 4.955076197051154e-05, "loss": 0.2062, "step": 6608 }, { "epoch": 1.3375834851244688, "grad_norm": 0.3012605309486389, "learning_rate": 4.952330171220589e-05, "loss": 0.2076, "step": 6609 }, { "epoch": 1.337785873304999, "grad_norm": 0.36997511982917786, "learning_rate": 4.949584656060293e-05, "loss": 0.1616, "step": 6610 }, { "epoch": 1.3379882614855292, "grad_norm": 0.27101650834083557, "learning_rate": 4.946839651848029e-05, "loss": 0.1829, "step": 6611 }, { "epoch": 1.3381906496660596, 
"grad_norm": 0.2873406708240509, "learning_rate": 4.9440951588615056e-05, "loss": 0.2096, "step": 6612 }, { "epoch": 1.3383930378465898, "grad_norm": 0.2543468177318573, "learning_rate": 4.9413511773783836e-05, "loss": 0.1665, "step": 6613 }, { "epoch": 1.33859542602712, "grad_norm": 0.2655819058418274, "learning_rate": 4.9386077076762695e-05, "loss": 0.1827, "step": 6614 }, { "epoch": 1.3387978142076502, "grad_norm": 0.2574715316295624, "learning_rate": 4.935864750032719e-05, "loss": 0.165, "step": 6615 }, { "epoch": 1.3390002023881804, "grad_norm": 0.3012125492095947, "learning_rate": 4.933122304725236e-05, "loss": 0.2141, "step": 6616 }, { "epoch": 1.3392025905687108, "grad_norm": 0.31119707226753235, "learning_rate": 4.930380372031273e-05, "loss": 0.2039, "step": 6617 }, { "epoch": 1.339404978749241, "grad_norm": 0.2763644754886627, "learning_rate": 4.9276389522282275e-05, "loss": 0.1877, "step": 6618 }, { "epoch": 1.3396073669297712, "grad_norm": 0.3010362982749939, "learning_rate": 4.924898045593449e-05, "loss": 0.1879, "step": 6619 }, { "epoch": 1.3398097551103016, "grad_norm": 0.26402631402015686, "learning_rate": 4.922157652404235e-05, "loss": 0.1837, "step": 6620 }, { "epoch": 1.3400121432908318, "grad_norm": 0.24701833724975586, "learning_rate": 4.9194177729378236e-05, "loss": 0.1591, "step": 6621 }, { "epoch": 1.340214531471362, "grad_norm": 0.356270432472229, "learning_rate": 4.916678407471417e-05, "loss": 0.2013, "step": 6622 }, { "epoch": 1.3404169196518922, "grad_norm": 0.36151617765426636, "learning_rate": 4.913939556282149e-05, "loss": 0.217, "step": 6623 }, { "epoch": 1.3406193078324227, "grad_norm": 0.31240034103393555, "learning_rate": 4.91120121964711e-05, "loss": 0.1944, "step": 6624 }, { "epoch": 1.3408216960129529, "grad_norm": 0.2937697172164917, "learning_rate": 4.9084633978433356e-05, "loss": 0.1846, "step": 6625 }, { "epoch": 1.341024084193483, "grad_norm": 0.26070502400398254, "learning_rate": 4.9057260911478134e-05, "loss": 0.1798, 
"step": 6626 }, { "epoch": 1.3412264723740135, "grad_norm": 0.28170114755630493, "learning_rate": 4.902989299837467e-05, "loss": 0.1827, "step": 6627 }, { "epoch": 1.3414288605545437, "grad_norm": 0.24363872408866882, "learning_rate": 4.900253024189182e-05, "loss": 0.1703, "step": 6628 }, { "epoch": 1.341631248735074, "grad_norm": 0.2929212749004364, "learning_rate": 4.897517264479785e-05, "loss": 0.2417, "step": 6629 }, { "epoch": 1.341833636915604, "grad_norm": 0.2667793035507202, "learning_rate": 4.894782020986052e-05, "loss": 0.1864, "step": 6630 }, { "epoch": 1.3420360250961343, "grad_norm": 0.2632579207420349, "learning_rate": 4.892047293984704e-05, "loss": 0.1878, "step": 6631 }, { "epoch": 1.3422384132766647, "grad_norm": 0.28002333641052246, "learning_rate": 4.8893130837524145e-05, "loss": 0.182, "step": 6632 }, { "epoch": 1.342440801457195, "grad_norm": 0.28146490454673767, "learning_rate": 4.886579390565802e-05, "loss": 0.1922, "step": 6633 }, { "epoch": 1.342643189637725, "grad_norm": 0.2770502269268036, "learning_rate": 4.883846214701431e-05, "loss": 0.1968, "step": 6634 }, { "epoch": 1.3428455778182555, "grad_norm": 0.3004387319087982, "learning_rate": 4.881113556435818e-05, "loss": 0.1821, "step": 6635 }, { "epoch": 1.3430479659987857, "grad_norm": 0.30087587237358093, "learning_rate": 4.878381416045422e-05, "loss": 0.2104, "step": 6636 }, { "epoch": 1.343250354179316, "grad_norm": 0.2681400775909424, "learning_rate": 4.8756497938066544e-05, "loss": 0.1663, "step": 6637 }, { "epoch": 1.3434527423598461, "grad_norm": 0.2617656886577606, "learning_rate": 4.8729186899958726e-05, "loss": 0.2128, "step": 6638 }, { "epoch": 1.3436551305403763, "grad_norm": 0.27163711190223694, "learning_rate": 4.8701881048893794e-05, "loss": 0.2162, "step": 6639 }, { "epoch": 1.3438575187209068, "grad_norm": 0.29744192957878113, "learning_rate": 4.867458038763426e-05, "loss": 0.223, "step": 6640 }, { "epoch": 1.344059906901437, "grad_norm": 0.22820989787578583, 
"learning_rate": 4.864728491894215e-05, "loss": 0.1541, "step": 6641 }, { "epoch": 1.3442622950819672, "grad_norm": 0.26483702659606934, "learning_rate": 4.86199946455789e-05, "loss": 0.1691, "step": 6642 }, { "epoch": 1.3444646832624976, "grad_norm": 0.24761466681957245, "learning_rate": 4.859270957030547e-05, "loss": 0.1563, "step": 6643 }, { "epoch": 1.3446670714430278, "grad_norm": 0.286159873008728, "learning_rate": 4.856542969588228e-05, "loss": 0.1792, "step": 6644 }, { "epoch": 1.344869459623558, "grad_norm": 0.29326891899108887, "learning_rate": 4.8538155025069206e-05, "loss": 0.2011, "step": 6645 }, { "epoch": 1.3450718478040882, "grad_norm": 0.278120219707489, "learning_rate": 4.851088556062563e-05, "loss": 0.1963, "step": 6646 }, { "epoch": 1.3452742359846184, "grad_norm": 0.2773338854312897, "learning_rate": 4.848362130531039e-05, "loss": 0.1859, "step": 6647 }, { "epoch": 1.3454766241651488, "grad_norm": 0.2770738899707794, "learning_rate": 4.845636226188183e-05, "loss": 0.2035, "step": 6648 }, { "epoch": 1.345679012345679, "grad_norm": 0.27181872725486755, "learning_rate": 4.8429108433097645e-05, "loss": 0.2015, "step": 6649 }, { "epoch": 1.3458814005262092, "grad_norm": 0.3191307783126831, "learning_rate": 4.840185982171514e-05, "loss": 0.1757, "step": 6650 }, { "epoch": 1.3458814005262092, "eval_loss": 0.26896873116493225, "eval_runtime": 0.7368, "eval_samples_per_second": 6.786, "eval_steps_per_second": 1.357, "step": 6650 }, { "epoch": 1.3460837887067396, "grad_norm": 0.2746305465698242, "learning_rate": 4.837461643049106e-05, "loss": 0.1954, "step": 6651 }, { "epoch": 1.3462861768872698, "grad_norm": 0.25542014837265015, "learning_rate": 4.8347378262181583e-05, "loss": 0.1673, "step": 6652 }, { "epoch": 1.3464885650678, "grad_norm": 0.303438663482666, "learning_rate": 4.832014531954239e-05, "loss": 0.1968, "step": 6653 }, { "epoch": 1.3466909532483302, "grad_norm": 0.29511943459510803, "learning_rate": 4.829291760532861e-05, "loss": 0.2027, 
"step": 6654 }, { "epoch": 1.3468933414288606, "grad_norm": 0.29404184222221375, "learning_rate": 4.826569512229488e-05, "loss": 0.1458, "step": 6655 }, { "epoch": 1.3470957296093908, "grad_norm": 0.2823881506919861, "learning_rate": 4.823847787319529e-05, "loss": 0.207, "step": 6656 }, { "epoch": 1.347298117789921, "grad_norm": 0.3050212860107422, "learning_rate": 4.821126586078336e-05, "loss": 0.2397, "step": 6657 }, { "epoch": 1.3475005059704515, "grad_norm": 0.3161782920360565, "learning_rate": 4.818405908781215e-05, "loss": 0.2163, "step": 6658 }, { "epoch": 1.3477028941509817, "grad_norm": 0.2794608175754547, "learning_rate": 4.8156857557034144e-05, "loss": 0.1952, "step": 6659 }, { "epoch": 1.3479052823315119, "grad_norm": 0.2686176300048828, "learning_rate": 4.8129661271201296e-05, "loss": 0.1546, "step": 6660 }, { "epoch": 1.348107670512042, "grad_norm": 0.280245304107666, "learning_rate": 4.810247023306505e-05, "loss": 0.1811, "step": 6661 }, { "epoch": 1.3483100586925723, "grad_norm": 0.27028578519821167, "learning_rate": 4.807528444537632e-05, "loss": 0.1729, "step": 6662 }, { "epoch": 1.3485124468731027, "grad_norm": 0.3122173845767975, "learning_rate": 4.8048103910885475e-05, "loss": 0.2259, "step": 6663 }, { "epoch": 1.3487148350536329, "grad_norm": 0.3586917519569397, "learning_rate": 4.8020928632342346e-05, "loss": 0.1921, "step": 6664 }, { "epoch": 1.348917223234163, "grad_norm": 0.29752397537231445, "learning_rate": 4.799375861249624e-05, "loss": 0.1728, "step": 6665 }, { "epoch": 1.3491196114146935, "grad_norm": 0.2642748951911926, "learning_rate": 4.796659385409595e-05, "loss": 0.1949, "step": 6666 }, { "epoch": 1.3493219995952237, "grad_norm": 0.2839334309101105, "learning_rate": 4.7939434359889714e-05, "loss": 0.1828, "step": 6667 }, { "epoch": 1.349524387775754, "grad_norm": 0.32244181632995605, "learning_rate": 4.7912280132625245e-05, "loss": 0.2258, "step": 6668 }, { "epoch": 1.349726775956284, "grad_norm": 0.2739431858062744, 
"learning_rate": 4.788513117504971e-05, "loss": 0.1394, "step": 6669 }, { "epoch": 1.3499291641368143, "grad_norm": 0.31246238946914673, "learning_rate": 4.785798748990978e-05, "loss": 0.2063, "step": 6670 }, { "epoch": 1.3501315523173447, "grad_norm": 0.32760030031204224, "learning_rate": 4.783084907995156e-05, "loss": 0.1873, "step": 6671 }, { "epoch": 1.350333940497875, "grad_norm": 0.2789546549320221, "learning_rate": 4.7803715947920614e-05, "loss": 0.1671, "step": 6672 }, { "epoch": 1.3505363286784051, "grad_norm": 0.306374192237854, "learning_rate": 4.7776588096562e-05, "loss": 0.2078, "step": 6673 }, { "epoch": 1.3507387168589355, "grad_norm": 0.3102760910987854, "learning_rate": 4.774946552862023e-05, "loss": 0.1982, "step": 6674 }, { "epoch": 1.3509411050394657, "grad_norm": 0.2927134037017822, "learning_rate": 4.7722348246839285e-05, "loss": 0.2124, "step": 6675 }, { "epoch": 1.351143493219996, "grad_norm": 0.3680955767631531, "learning_rate": 4.769523625396259e-05, "loss": 0.1934, "step": 6676 }, { "epoch": 1.3513458814005261, "grad_norm": 0.3348569869995117, "learning_rate": 4.7668129552733076e-05, "loss": 0.2212, "step": 6677 }, { "epoch": 1.3515482695810563, "grad_norm": 0.2908865213394165, "learning_rate": 4.7641028145893094e-05, "loss": 0.2018, "step": 6678 }, { "epoch": 1.3517506577615868, "grad_norm": 0.27470603585243225, "learning_rate": 4.761393203618449e-05, "loss": 0.2029, "step": 6679 }, { "epoch": 1.351953045942117, "grad_norm": 0.33673080801963806, "learning_rate": 4.7586841226348564e-05, "loss": 0.2223, "step": 6680 }, { "epoch": 1.3521554341226472, "grad_norm": 0.23110777139663696, "learning_rate": 4.7559755719126075e-05, "loss": 0.1587, "step": 6681 }, { "epoch": 1.3523578223031776, "grad_norm": 0.3230266571044922, "learning_rate": 4.7532675517257256e-05, "loss": 0.2, "step": 6682 }, { "epoch": 1.3525602104837078, "grad_norm": 0.3509168326854706, "learning_rate": 4.7505600623481784e-05, "loss": 0.2157, "step": 6683 }, { "epoch": 
1.352762598664238, "grad_norm": 0.3701566755771637, "learning_rate": 4.747853104053883e-05, "loss": 0.1997, "step": 6684 }, { "epoch": 1.3529649868447682, "grad_norm": 0.2779744267463684, "learning_rate": 4.745146677116701e-05, "loss": 0.1959, "step": 6685 }, { "epoch": 1.3531673750252986, "grad_norm": 0.29480859637260437, "learning_rate": 4.7424407818104385e-05, "loss": 0.2142, "step": 6686 }, { "epoch": 1.3533697632058288, "grad_norm": 0.31336894631385803, "learning_rate": 4.739735418408852e-05, "loss": 0.2362, "step": 6687 }, { "epoch": 1.353572151386359, "grad_norm": 0.27973970770835876, "learning_rate": 4.73703058718564e-05, "loss": 0.2053, "step": 6688 }, { "epoch": 1.3537745395668894, "grad_norm": 0.25769269466400146, "learning_rate": 4.734326288414449e-05, "loss": 0.1937, "step": 6689 }, { "epoch": 1.3539769277474196, "grad_norm": 0.252174973487854, "learning_rate": 4.7316225223688724e-05, "loss": 0.1599, "step": 6690 }, { "epoch": 1.3541793159279498, "grad_norm": 0.26635318994522095, "learning_rate": 4.7289192893224486e-05, "loss": 0.1944, "step": 6691 }, { "epoch": 1.35438170410848, "grad_norm": 0.43550267815589905, "learning_rate": 4.726216589548667e-05, "loss": 0.2181, "step": 6692 }, { "epoch": 1.3545840922890102, "grad_norm": 0.25361430644989014, "learning_rate": 4.723514423320948e-05, "loss": 0.1711, "step": 6693 }, { "epoch": 1.3547864804695406, "grad_norm": 0.31329795718193054, "learning_rate": 4.720812790912675e-05, "loss": 0.203, "step": 6694 }, { "epoch": 1.3549888686500708, "grad_norm": 0.27394962310791016, "learning_rate": 4.71811169259717e-05, "loss": 0.1971, "step": 6695 }, { "epoch": 1.355191256830601, "grad_norm": 0.3030480742454529, "learning_rate": 4.715411128647702e-05, "loss": 0.2048, "step": 6696 }, { "epoch": 1.3553936450111315, "grad_norm": 0.30011892318725586, "learning_rate": 4.7127110993374855e-05, "loss": 0.1889, "step": 6697 }, { "epoch": 1.3555960331916617, "grad_norm": 0.292767733335495, "learning_rate": 
4.7100116049396804e-05, "loss": 0.1699, "step": 6698 }, { "epoch": 1.3557984213721919, "grad_norm": 0.27436232566833496, "learning_rate": 4.707312645727394e-05, "loss": 0.1829, "step": 6699 }, { "epoch": 1.356000809552722, "grad_norm": 0.2540614902973175, "learning_rate": 4.70461422197368e-05, "loss": 0.2047, "step": 6700 }, { "epoch": 1.356000809552722, "eval_loss": 0.26921871304512024, "eval_runtime": 0.7381, "eval_samples_per_second": 6.774, "eval_steps_per_second": 1.355, "step": 6700 }, { "epoch": 1.3562031977332523, "grad_norm": 0.28804129362106323, "learning_rate": 4.701916333951534e-05, "loss": 0.1905, "step": 6701 }, { "epoch": 1.3564055859137827, "grad_norm": 0.2757605314254761, "learning_rate": 4.6992189819339006e-05, "loss": 0.1719, "step": 6702 }, { "epoch": 1.3566079740943129, "grad_norm": 0.25101038813591003, "learning_rate": 4.696522166193671e-05, "loss": 0.1795, "step": 6703 }, { "epoch": 1.356810362274843, "grad_norm": 0.2832792103290558, "learning_rate": 4.6938258870036786e-05, "loss": 0.1698, "step": 6704 }, { "epoch": 1.3570127504553735, "grad_norm": 0.2435804158449173, "learning_rate": 4.691130144636707e-05, "loss": 0.181, "step": 6705 }, { "epoch": 1.3572151386359037, "grad_norm": 0.28305795788764954, "learning_rate": 4.6884349393654823e-05, "loss": 0.2256, "step": 6706 }, { "epoch": 1.357417526816434, "grad_norm": 0.28656500577926636, "learning_rate": 4.6857402714626765e-05, "loss": 0.1737, "step": 6707 }, { "epoch": 1.357619914996964, "grad_norm": 0.2786801755428314, "learning_rate": 4.683046141200909e-05, "loss": 0.2059, "step": 6708 }, { "epoch": 1.3578223031774943, "grad_norm": 0.2742007374763489, "learning_rate": 4.680352548852741e-05, "loss": 0.2013, "step": 6709 }, { "epoch": 1.3580246913580247, "grad_norm": 0.2749464511871338, "learning_rate": 4.677659494690685e-05, "loss": 0.1908, "step": 6710 }, { "epoch": 1.358227079538555, "grad_norm": 0.2884829342365265, "learning_rate": 4.6749669789871944e-05, "loss": 0.1536, "step": 6711 }, { 
"epoch": 1.3584294677190851, "grad_norm": 0.303640216588974, "learning_rate": 4.672275002014669e-05, "loss": 0.2158, "step": 6712 }, { "epoch": 1.3586318558996155, "grad_norm": 0.29193374514579773, "learning_rate": 4.6695835640454564e-05, "loss": 0.1926, "step": 6713 }, { "epoch": 1.3588342440801457, "grad_norm": 0.27257031202316284, "learning_rate": 4.666892665351847e-05, "loss": 0.1714, "step": 6714 }, { "epoch": 1.359036632260676, "grad_norm": 0.3507302701473236, "learning_rate": 4.6642023062060825e-05, "loss": 0.2356, "step": 6715 }, { "epoch": 1.3592390204412061, "grad_norm": 0.2812184989452362, "learning_rate": 4.6615124868803326e-05, "loss": 0.2025, "step": 6716 }, { "epoch": 1.3594414086217366, "grad_norm": 0.36313724517822266, "learning_rate": 4.658823207646737e-05, "loss": 0.2103, "step": 6717 }, { "epoch": 1.3596437968022668, "grad_norm": 0.2924157977104187, "learning_rate": 4.6561344687773655e-05, "loss": 0.2042, "step": 6718 }, { "epoch": 1.359846184982797, "grad_norm": 0.293690949678421, "learning_rate": 4.653446270544236e-05, "loss": 0.1875, "step": 6719 }, { "epoch": 1.3600485731633274, "grad_norm": 0.3482380509376526, "learning_rate": 4.6507586132193115e-05, "loss": 0.2127, "step": 6720 }, { "epoch": 1.3602509613438576, "grad_norm": 0.3009283244609833, "learning_rate": 4.648071497074502e-05, "loss": 0.2143, "step": 6721 }, { "epoch": 1.3604533495243878, "grad_norm": 0.31680983304977417, "learning_rate": 4.6453849223816604e-05, "loss": 0.2419, "step": 6722 }, { "epoch": 1.360655737704918, "grad_norm": 0.3034413456916809, "learning_rate": 4.642698889412588e-05, "loss": 0.2122, "step": 6723 }, { "epoch": 1.3608581258854482, "grad_norm": 0.297990083694458, "learning_rate": 4.6400133984390283e-05, "loss": 0.1998, "step": 6724 }, { "epoch": 1.3610605140659786, "grad_norm": 0.2984466552734375, "learning_rate": 4.637328449732671e-05, "loss": 0.2176, "step": 6725 }, { "epoch": 1.3612629022465088, "grad_norm": 0.31007546186447144, "learning_rate": 
4.6346440435651505e-05, "loss": 0.2072, "step": 6726 }, { "epoch": 1.361465290427039, "grad_norm": 0.313833087682724, "learning_rate": 4.6319601802080494e-05, "loss": 0.1845, "step": 6727 }, { "epoch": 1.3616676786075694, "grad_norm": 0.293378084897995, "learning_rate": 4.629276859932889e-05, "loss": 0.2082, "step": 6728 }, { "epoch": 1.3618700667880996, "grad_norm": 0.302562415599823, "learning_rate": 4.6265940830111434e-05, "loss": 0.1976, "step": 6729 }, { "epoch": 1.3620724549686298, "grad_norm": 0.31816911697387695, "learning_rate": 4.6239118497142256e-05, "loss": 0.2028, "step": 6730 }, { "epoch": 1.36227484314916, "grad_norm": 0.305603951215744, "learning_rate": 4.6212301603134954e-05, "loss": 0.193, "step": 6731 }, { "epoch": 1.3624772313296902, "grad_norm": 0.3000797927379608, "learning_rate": 4.618549015080259e-05, "loss": 0.194, "step": 6732 }, { "epoch": 1.3626796195102207, "grad_norm": 0.32458996772766113, "learning_rate": 4.615868414285767e-05, "loss": 0.2204, "step": 6733 }, { "epoch": 1.3628820076907509, "grad_norm": 0.27205538749694824, "learning_rate": 4.6131883582012125e-05, "loss": 0.176, "step": 6734 }, { "epoch": 1.363084395871281, "grad_norm": 0.2538856863975525, "learning_rate": 4.6105088470977384e-05, "loss": 0.1954, "step": 6735 }, { "epoch": 1.3632867840518115, "grad_norm": 0.3089398741722107, "learning_rate": 4.607829881246427e-05, "loss": 0.1953, "step": 6736 }, { "epoch": 1.3634891722323417, "grad_norm": 0.31733807921409607, "learning_rate": 4.6051514609183124e-05, "loss": 0.2068, "step": 6737 }, { "epoch": 1.3636915604128719, "grad_norm": 0.3041749894618988, "learning_rate": 4.602473586384361e-05, "loss": 0.2036, "step": 6738 }, { "epoch": 1.363893948593402, "grad_norm": 0.34960976243019104, "learning_rate": 4.599796257915499e-05, "loss": 0.2228, "step": 6739 }, { "epoch": 1.3640963367739323, "grad_norm": 0.2810072600841522, "learning_rate": 4.597119475782585e-05, "loss": 0.226, "step": 6740 }, { "epoch": 1.3642987249544627, 
"grad_norm": 0.2816447615623474, "learning_rate": 4.594443240256433e-05, "loss": 0.2007, "step": 6741 }, { "epoch": 1.364501113134993, "grad_norm": 0.31261932849884033, "learning_rate": 4.591767551607793e-05, "loss": 0.168, "step": 6742 }, { "epoch": 1.364703501315523, "grad_norm": 0.24643424153327942, "learning_rate": 4.589092410107364e-05, "loss": 0.1709, "step": 6743 }, { "epoch": 1.3649058894960535, "grad_norm": 0.25190091133117676, "learning_rate": 4.5864178160257895e-05, "loss": 0.1536, "step": 6744 }, { "epoch": 1.3651082776765837, "grad_norm": 0.2703429162502289, "learning_rate": 4.583743769633656e-05, "loss": 0.1802, "step": 6745 }, { "epoch": 1.365310665857114, "grad_norm": 0.30754923820495605, "learning_rate": 4.5810702712014964e-05, "loss": 0.1942, "step": 6746 }, { "epoch": 1.3655130540376441, "grad_norm": 0.2669358253479004, "learning_rate": 4.578397320999785e-05, "loss": 0.1915, "step": 6747 }, { "epoch": 1.3657154422181745, "grad_norm": 0.27001726627349854, "learning_rate": 4.575724919298946e-05, "loss": 0.1949, "step": 6748 }, { "epoch": 1.3659178303987047, "grad_norm": 0.27014675736427307, "learning_rate": 4.5730530663693425e-05, "loss": 0.2015, "step": 6749 }, { "epoch": 1.366120218579235, "grad_norm": 0.2672886550426483, "learning_rate": 4.570381762481286e-05, "loss": 0.1998, "step": 6750 }, { "epoch": 1.366120218579235, "eval_loss": 0.2655577063560486, "eval_runtime": 0.7398, "eval_samples_per_second": 6.759, "eval_steps_per_second": 1.352, "step": 6750 }, { "epoch": 1.3663226067597654, "grad_norm": 0.2848818004131317, "learning_rate": 4.56771100790503e-05, "loss": 0.2016, "step": 6751 }, { "epoch": 1.3665249949402956, "grad_norm": 0.2925681471824646, "learning_rate": 4.5650408029107746e-05, "loss": 0.1869, "step": 6752 }, { "epoch": 1.3667273831208258, "grad_norm": 0.27403298020362854, "learning_rate": 4.5623711477686614e-05, "loss": 0.19, "step": 6753 }, { "epoch": 1.366929771301356, "grad_norm": 0.2719515860080719, "learning_rate": 
4.55970204274878e-05, "loss": 0.2171, "step": 6754 }, { "epoch": 1.3671321594818862, "grad_norm": 0.3114962875843048, "learning_rate": 4.557033488121161e-05, "loss": 0.1991, "step": 6755 }, { "epoch": 1.3673345476624166, "grad_norm": 0.24869102239608765, "learning_rate": 4.554365484155782e-05, "loss": 0.1907, "step": 6756 }, { "epoch": 1.3675369358429468, "grad_norm": 0.2927948534488678, "learning_rate": 4.551698031122561e-05, "loss": 0.2102, "step": 6757 }, { "epoch": 1.367739324023477, "grad_norm": 0.2724990248680115, "learning_rate": 4.549031129291367e-05, "loss": 0.2288, "step": 6758 }, { "epoch": 1.3679417122040074, "grad_norm": 0.26832640171051025, "learning_rate": 4.54636477893201e-05, "loss": 0.1698, "step": 6759 }, { "epoch": 1.3681441003845376, "grad_norm": 0.2984505891799927, "learning_rate": 4.543698980314236e-05, "loss": 0.1881, "step": 6760 }, { "epoch": 1.3683464885650678, "grad_norm": 0.2443763017654419, "learning_rate": 4.541033733707747e-05, "loss": 0.181, "step": 6761 }, { "epoch": 1.368548876745598, "grad_norm": 0.32323718070983887, "learning_rate": 4.538369039382184e-05, "loss": 0.2107, "step": 6762 }, { "epoch": 1.3687512649261282, "grad_norm": 0.24224111437797546, "learning_rate": 4.535704897607135e-05, "loss": 0.172, "step": 6763 }, { "epoch": 1.3689536531066586, "grad_norm": 0.2844039499759674, "learning_rate": 4.5330413086521276e-05, "loss": 0.1602, "step": 6764 }, { "epoch": 1.3691560412871888, "grad_norm": 0.31434163451194763, "learning_rate": 4.530378272786635e-05, "loss": 0.1955, "step": 6765 }, { "epoch": 1.369358429467719, "grad_norm": 0.28526541590690613, "learning_rate": 4.527715790280078e-05, "loss": 0.1656, "step": 6766 }, { "epoch": 1.3695608176482494, "grad_norm": 0.27655383944511414, "learning_rate": 4.525053861401818e-05, "loss": 0.1804, "step": 6767 }, { "epoch": 1.3697632058287796, "grad_norm": 0.48966366052627563, "learning_rate": 4.52239248642116e-05, "loss": 0.2221, "step": 6768 }, { "epoch": 1.3699655940093098, 
"grad_norm": 0.3002183139324188, "learning_rate": 4.519731665607353e-05, "loss": 0.2067, "step": 6769 }, { "epoch": 1.37016798218984, "grad_norm": 0.2847645878791809, "learning_rate": 4.517071399229593e-05, "loss": 0.2084, "step": 6770 }, { "epoch": 1.3703703703703702, "grad_norm": 0.2950843274593353, "learning_rate": 4.5144116875570176e-05, "loss": 0.1991, "step": 6771 }, { "epoch": 1.3705727585509007, "grad_norm": 0.29930636286735535, "learning_rate": 4.511752530858707e-05, "loss": 0.1767, "step": 6772 }, { "epoch": 1.3707751467314309, "grad_norm": 0.28075480461120605, "learning_rate": 4.5090939294036895e-05, "loss": 0.1898, "step": 6773 }, { "epoch": 1.370977534911961, "grad_norm": 0.2594175636768341, "learning_rate": 4.506435883460928e-05, "loss": 0.2015, "step": 6774 }, { "epoch": 1.3711799230924915, "grad_norm": 0.3175654113292694, "learning_rate": 4.503778393299344e-05, "loss": 0.2128, "step": 6775 }, { "epoch": 1.3713823112730217, "grad_norm": 0.3042403757572174, "learning_rate": 4.501121459187792e-05, "loss": 0.2206, "step": 6776 }, { "epoch": 1.3715846994535519, "grad_norm": 0.31100788712501526, "learning_rate": 4.49846508139507e-05, "loss": 0.1891, "step": 6777 }, { "epoch": 1.371787087634082, "grad_norm": 0.30918845534324646, "learning_rate": 4.495809260189925e-05, "loss": 0.173, "step": 6778 }, { "epoch": 1.3719894758146125, "grad_norm": 0.29384666681289673, "learning_rate": 4.493153995841045e-05, "loss": 0.1626, "step": 6779 }, { "epoch": 1.3721918639951427, "grad_norm": 0.27222007513046265, "learning_rate": 4.4904992886170595e-05, "loss": 0.1754, "step": 6780 }, { "epoch": 1.372394252175673, "grad_norm": 0.30638864636421204, "learning_rate": 4.487845138786551e-05, "loss": 0.2065, "step": 6781 }, { "epoch": 1.3725966403562033, "grad_norm": 0.30102983117103577, "learning_rate": 4.4851915466180274e-05, "loss": 0.1956, "step": 6782 }, { "epoch": 1.3727990285367335, "grad_norm": 0.30725544691085815, "learning_rate": 4.4825385123799576e-05, "loss": 0.2123, 
"step": 6783 }, { "epoch": 1.3730014167172637, "grad_norm": 0.25662949681282043, "learning_rate": 4.479886036340748e-05, "loss": 0.1818, "step": 6784 }, { "epoch": 1.373203804897794, "grad_norm": 0.2699354887008667, "learning_rate": 4.477234118768746e-05, "loss": 0.17, "step": 6785 }, { "epoch": 1.3734061930783241, "grad_norm": 0.26643893122673035, "learning_rate": 4.4745827599322466e-05, "loss": 0.1903, "step": 6786 }, { "epoch": 1.3736085812588545, "grad_norm": 0.3266531527042389, "learning_rate": 4.4719319600994856e-05, "loss": 0.2274, "step": 6787 }, { "epoch": 1.3738109694393847, "grad_norm": 0.2596859633922577, "learning_rate": 4.469281719538645e-05, "loss": 0.1752, "step": 6788 }, { "epoch": 1.374013357619915, "grad_norm": 0.25986337661743164, "learning_rate": 4.466632038517845e-05, "loss": 0.16, "step": 6789 }, { "epoch": 1.3742157458004454, "grad_norm": 0.2841913402080536, "learning_rate": 4.4639829173051554e-05, "loss": 0.2121, "step": 6790 }, { "epoch": 1.3744181339809756, "grad_norm": 0.3008646070957184, "learning_rate": 4.461334356168585e-05, "loss": 0.1924, "step": 6791 }, { "epoch": 1.3746205221615058, "grad_norm": 0.29514750838279724, "learning_rate": 4.4586863553760885e-05, "loss": 0.199, "step": 6792 }, { "epoch": 1.374822910342036, "grad_norm": 0.2603028416633606, "learning_rate": 4.4560389151955615e-05, "loss": 0.1645, "step": 6793 }, { "epoch": 1.3750252985225662, "grad_norm": 0.3041701018810272, "learning_rate": 4.453392035894846e-05, "loss": 0.1889, "step": 6794 }, { "epoch": 1.3752276867030966, "grad_norm": 0.26599910855293274, "learning_rate": 4.4507457177417234e-05, "loss": 0.1522, "step": 6795 }, { "epoch": 1.3754300748836268, "grad_norm": 0.3250159025192261, "learning_rate": 4.448099961003922e-05, "loss": 0.1977, "step": 6796 }, { "epoch": 1.375632463064157, "grad_norm": 0.2732198238372803, "learning_rate": 4.4454547659491106e-05, "loss": 0.1951, "step": 6797 }, { "epoch": 1.3758348512446874, "grad_norm": 0.30923816561698914, 
"learning_rate": 4.442810132844903e-05, "loss": 0.231, "step": 6798 }, { "epoch": 1.3760372394252176, "grad_norm": 0.28462839126586914, "learning_rate": 4.440166061958856e-05, "loss": 0.1964, "step": 6799 }, { "epoch": 1.3762396276057478, "grad_norm": 0.210264652967453, "learning_rate": 4.437522553558466e-05, "loss": 0.1613, "step": 6800 }, { "epoch": 1.3762396276057478, "eval_loss": 0.2648058235645294, "eval_runtime": 0.7403, "eval_samples_per_second": 6.754, "eval_steps_per_second": 1.351, "step": 6800 }, { "epoch": 1.376442015786278, "grad_norm": 0.2972264885902405, "learning_rate": 4.43487960791118e-05, "loss": 0.2084, "step": 6801 }, { "epoch": 1.3766444039668082, "grad_norm": 0.23235231637954712, "learning_rate": 4.4322372252843805e-05, "loss": 0.1628, "step": 6802 }, { "epoch": 1.3768467921473386, "grad_norm": 0.28614068031311035, "learning_rate": 4.429595405945399e-05, "loss": 0.215, "step": 6803 }, { "epoch": 1.3770491803278688, "grad_norm": 0.35497477650642395, "learning_rate": 4.426954150161503e-05, "loss": 0.1987, "step": 6804 }, { "epoch": 1.377251568508399, "grad_norm": 0.28614556789398193, "learning_rate": 4.424313458199908e-05, "loss": 0.1982, "step": 6805 }, { "epoch": 1.3774539566889294, "grad_norm": 0.25958994030952454, "learning_rate": 4.421673330327771e-05, "loss": 0.1862, "step": 6806 }, { "epoch": 1.3776563448694596, "grad_norm": 0.2792111039161682, "learning_rate": 4.419033766812196e-05, "loss": 0.2023, "step": 6807 }, { "epoch": 1.3778587330499898, "grad_norm": 0.29506009817123413, "learning_rate": 4.416394767920222e-05, "loss": 0.1735, "step": 6808 }, { "epoch": 1.37806112123052, "grad_norm": 0.28253570199012756, "learning_rate": 4.4137563339188395e-05, "loss": 0.204, "step": 6809 }, { "epoch": 1.3782635094110505, "grad_norm": 0.2398274689912796, "learning_rate": 4.411118465074974e-05, "loss": 0.1415, "step": 6810 }, { "epoch": 1.3784658975915807, "grad_norm": 0.3018699884414673, "learning_rate": 4.408481161655499e-05, "loss": 0.2001, 
"step": 6811 }, { "epoch": 1.3786682857721109, "grad_norm": 0.26964306831359863, "learning_rate": 4.405844423927228e-05, "loss": 0.1626, "step": 6812 }, { "epoch": 1.3788706739526413, "grad_norm": 0.25589293241500854, "learning_rate": 4.403208252156921e-05, "loss": 0.1736, "step": 6813 }, { "epoch": 1.3790730621331715, "grad_norm": 0.28545406460762024, "learning_rate": 4.400572646611275e-05, "loss": 0.1692, "step": 6814 }, { "epoch": 1.3792754503137017, "grad_norm": 0.29041236639022827, "learning_rate": 4.3979376075569354e-05, "loss": 0.2007, "step": 6815 }, { "epoch": 1.379477838494232, "grad_norm": 0.26712194085121155, "learning_rate": 4.395303135260487e-05, "loss": 0.1918, "step": 6816 }, { "epoch": 1.379680226674762, "grad_norm": 0.3227420449256897, "learning_rate": 4.3926692299884573e-05, "loss": 0.1999, "step": 6817 }, { "epoch": 1.3798826148552925, "grad_norm": 0.310242235660553, "learning_rate": 4.3900358920073184e-05, "loss": 0.1989, "step": 6818 }, { "epoch": 1.3800850030358227, "grad_norm": 0.3100515305995941, "learning_rate": 4.387403121583482e-05, "loss": 0.2101, "step": 6819 }, { "epoch": 1.380287391216353, "grad_norm": 0.2882237434387207, "learning_rate": 4.3847709189833075e-05, "loss": 0.2105, "step": 6820 }, { "epoch": 1.3804897793968833, "grad_norm": 0.4425124526023865, "learning_rate": 4.38213928447309e-05, "loss": 0.1808, "step": 6821 }, { "epoch": 1.3806921675774135, "grad_norm": 0.2675884962081909, "learning_rate": 4.379508218319073e-05, "loss": 0.2059, "step": 6822 }, { "epoch": 1.3808945557579437, "grad_norm": 0.30350470542907715, "learning_rate": 4.376877720787439e-05, "loss": 0.2115, "step": 6823 }, { "epoch": 1.381096943938474, "grad_norm": 0.27757778763771057, "learning_rate": 4.374247792144314e-05, "loss": 0.1884, "step": 6824 }, { "epoch": 1.3812993321190041, "grad_norm": 0.3709230422973633, "learning_rate": 4.371618432655767e-05, "loss": 0.1827, "step": 6825 }, { "epoch": 1.3815017202995346, "grad_norm": 0.5349687933921814, 
"learning_rate": 4.3689896425878095e-05, "loss": 0.2039, "step": 6826 }, { "epoch": 1.3817041084800648, "grad_norm": 0.2831704914569855, "learning_rate": 4.3663614222063956e-05, "loss": 0.1719, "step": 6827 }, { "epoch": 1.381906496660595, "grad_norm": 0.2922341227531433, "learning_rate": 4.3637337717774186e-05, "loss": 0.2089, "step": 6828 }, { "epoch": 1.3821088848411254, "grad_norm": 0.3860246241092682, "learning_rate": 4.3611066915667173e-05, "loss": 0.2074, "step": 6829 }, { "epoch": 1.3823112730216556, "grad_norm": 0.3138883113861084, "learning_rate": 4.3584801818400746e-05, "loss": 0.2413, "step": 6830 }, { "epoch": 1.3825136612021858, "grad_norm": 0.27164164185523987, "learning_rate": 4.35585424286321e-05, "loss": 0.1726, "step": 6831 }, { "epoch": 1.382716049382716, "grad_norm": 0.32853418588638306, "learning_rate": 4.353228874901789e-05, "loss": 0.1984, "step": 6832 }, { "epoch": 1.3829184375632462, "grad_norm": 0.24393995106220245, "learning_rate": 4.350604078221421e-05, "loss": 0.1721, "step": 6833 }, { "epoch": 1.3831208257437766, "grad_norm": 0.26866650581359863, "learning_rate": 4.3479798530876535e-05, "loss": 0.135, "step": 6834 }, { "epoch": 1.3833232139243068, "grad_norm": 0.25781649351119995, "learning_rate": 4.3453561997659786e-05, "loss": 0.2081, "step": 6835 }, { "epoch": 1.3835256021048372, "grad_norm": 0.2890598177909851, "learning_rate": 4.342733118521829e-05, "loss": 0.1822, "step": 6836 }, { "epoch": 1.3837279902853674, "grad_norm": 0.25596973299980164, "learning_rate": 4.340110609620582e-05, "loss": 0.1756, "step": 6837 }, { "epoch": 1.3839303784658976, "grad_norm": 0.3108643889427185, "learning_rate": 4.3374886733275554e-05, "loss": 0.2116, "step": 6838 }, { "epoch": 1.3841327666464278, "grad_norm": 0.2812763452529907, "learning_rate": 4.3348673099080087e-05, "loss": 0.1977, "step": 6839 }, { "epoch": 1.384335154826958, "grad_norm": 0.27367645502090454, "learning_rate": 4.3322465196271434e-05, "loss": 0.2071, "step": 6840 }, { "epoch": 
1.3845375430074884, "grad_norm": 0.3385069668292999, "learning_rate": 4.329626302750105e-05, "loss": 0.1943, "step": 6841 }, { "epoch": 1.3847399311880186, "grad_norm": 0.25230950117111206, "learning_rate": 4.327006659541979e-05, "loss": 0.1838, "step": 6842 }, { "epoch": 1.3849423193685488, "grad_norm": 0.28919148445129395, "learning_rate": 4.324387590267792e-05, "loss": 0.1669, "step": 6843 }, { "epoch": 1.3851447075490793, "grad_norm": 0.2459256798028946, "learning_rate": 4.321769095192516e-05, "loss": 0.1675, "step": 6844 }, { "epoch": 1.3853470957296095, "grad_norm": 0.2925647795200348, "learning_rate": 4.319151174581061e-05, "loss": 0.195, "step": 6845 }, { "epoch": 1.3855494839101397, "grad_norm": 0.27055978775024414, "learning_rate": 4.316533828698283e-05, "loss": 0.2061, "step": 6846 }, { "epoch": 1.3857518720906699, "grad_norm": 0.27059707045555115, "learning_rate": 4.313917057808975e-05, "loss": 0.1772, "step": 6847 }, { "epoch": 1.3859542602712, "grad_norm": 0.3185412883758545, "learning_rate": 4.311300862177879e-05, "loss": 0.1912, "step": 6848 }, { "epoch": 1.3861566484517305, "grad_norm": 0.3338720202445984, "learning_rate": 4.3086852420696685e-05, "loss": 0.2449, "step": 6849 }, { "epoch": 1.3863590366322607, "grad_norm": 0.3972804844379425, "learning_rate": 4.306070197748967e-05, "loss": 0.2011, "step": 6850 }, { "epoch": 1.3863590366322607, "eval_loss": 0.2659958302974701, "eval_runtime": 0.7397, "eval_samples_per_second": 6.759, "eval_steps_per_second": 1.352, "step": 6850 }, { "epoch": 1.3865614248127909, "grad_norm": 0.2555653750896454, "learning_rate": 4.3034557294803365e-05, "loss": 0.1863, "step": 6851 }, { "epoch": 1.3867638129933213, "grad_norm": 0.27162060141563416, "learning_rate": 4.300841837528282e-05, "loss": 0.2004, "step": 6852 }, { "epoch": 1.3869662011738515, "grad_norm": 0.3083297908306122, "learning_rate": 4.2982285221572505e-05, "loss": 0.2093, "step": 6853 }, { "epoch": 1.3871685893543817, "grad_norm": 0.3471545875072479, 
"learning_rate": 4.295615783631629e-05, "loss": 0.1907, "step": 6854 }, { "epoch": 1.387370977534912, "grad_norm": 0.2562052309513092, "learning_rate": 4.2930036222157466e-05, "loss": 0.1747, "step": 6855 }, { "epoch": 1.387573365715442, "grad_norm": 0.27602705359458923, "learning_rate": 4.290392038173875e-05, "loss": 0.2041, "step": 6856 }, { "epoch": 1.3877757538959725, "grad_norm": 0.32171010971069336, "learning_rate": 4.287781031770227e-05, "loss": 0.2447, "step": 6857 }, { "epoch": 1.3879781420765027, "grad_norm": 0.27243730425834656, "learning_rate": 4.285170603268957e-05, "loss": 0.177, "step": 6858 }, { "epoch": 1.388180530257033, "grad_norm": 0.3044010400772095, "learning_rate": 4.28256075293416e-05, "loss": 0.2262, "step": 6859 }, { "epoch": 1.3883829184375633, "grad_norm": 0.25574493408203125, "learning_rate": 4.279951481029872e-05, "loss": 0.1701, "step": 6860 }, { "epoch": 1.3885853066180935, "grad_norm": 0.3542279899120331, "learning_rate": 4.277342787820076e-05, "loss": 0.2242, "step": 6861 }, { "epoch": 1.3887876947986237, "grad_norm": 0.3481219708919525, "learning_rate": 4.274734673568688e-05, "loss": 0.1787, "step": 6862 }, { "epoch": 1.388990082979154, "grad_norm": 0.3042903542518616, "learning_rate": 4.272127138539571e-05, "loss": 0.1845, "step": 6863 }, { "epoch": 1.3891924711596841, "grad_norm": 0.3019198775291443, "learning_rate": 4.269520182996528e-05, "loss": 0.206, "step": 6864 }, { "epoch": 1.3893948593402146, "grad_norm": 0.2840752601623535, "learning_rate": 4.2669138072033056e-05, "loss": 0.1861, "step": 6865 }, { "epoch": 1.3895972475207448, "grad_norm": 0.25696811079978943, "learning_rate": 4.2643080114235854e-05, "loss": 0.143, "step": 6866 }, { "epoch": 1.3897996357012752, "grad_norm": 0.26223090291023254, "learning_rate": 4.2617027959209975e-05, "loss": 0.1767, "step": 6867 }, { "epoch": 1.3900020238818054, "grad_norm": 0.277608722448349, "learning_rate": 4.259098160959109e-05, "loss": 0.1934, "step": 6868 }, { "epoch": 
1.3902044120623356, "grad_norm": 0.3046850562095642, "learning_rate": 4.256494106801432e-05, "loss": 0.2011, "step": 6869 }, { "epoch": 1.3904068002428658, "grad_norm": 0.2791365087032318, "learning_rate": 4.2538906337114136e-05, "loss": 0.2009, "step": 6870 }, { "epoch": 1.390609188423396, "grad_norm": 0.245823934674263, "learning_rate": 4.2512877419524476e-05, "loss": 0.1618, "step": 6871 }, { "epoch": 1.3908115766039264, "grad_norm": 0.33263981342315674, "learning_rate": 4.2486854317878674e-05, "loss": 0.1809, "step": 6872 }, { "epoch": 1.3910139647844566, "grad_norm": 0.3047584295272827, "learning_rate": 4.246083703480949e-05, "loss": 0.1679, "step": 6873 }, { "epoch": 1.3912163529649868, "grad_norm": 0.2599356174468994, "learning_rate": 4.243482557294904e-05, "loss": 0.1797, "step": 6874 }, { "epoch": 1.3914187411455172, "grad_norm": 0.28328827023506165, "learning_rate": 4.2408819934928924e-05, "loss": 0.217, "step": 6875 }, { "epoch": 1.3916211293260474, "grad_norm": 0.275232195854187, "learning_rate": 4.2382820123380105e-05, "loss": 0.2133, "step": 6876 }, { "epoch": 1.3918235175065776, "grad_norm": 0.3404455780982971, "learning_rate": 4.235682614093298e-05, "loss": 0.1836, "step": 6877 }, { "epoch": 1.3920259056871078, "grad_norm": 0.24205805361270905, "learning_rate": 4.233083799021734e-05, "loss": 0.1525, "step": 6878 }, { "epoch": 1.392228293867638, "grad_norm": 0.30682337284088135, "learning_rate": 4.230485567386241e-05, "loss": 0.2042, "step": 6879 }, { "epoch": 1.3924306820481684, "grad_norm": 0.2682032585144043, "learning_rate": 4.227887919449678e-05, "loss": 0.1837, "step": 6880 }, { "epoch": 1.3926330702286986, "grad_norm": 0.27204856276512146, "learning_rate": 4.225290855474849e-05, "loss": 0.1906, "step": 6881 }, { "epoch": 1.3928354584092288, "grad_norm": 0.31881260871887207, "learning_rate": 4.2226943757245e-05, "loss": 0.2215, "step": 6882 }, { "epoch": 1.3930378465897593, "grad_norm": 0.3069551885128021, "learning_rate": 
4.220098480461311e-05, "loss": 0.2167, "step": 6883 }, { "epoch": 1.3932402347702895, "grad_norm": 0.3175070285797119, "learning_rate": 4.217503169947912e-05, "loss": 0.1998, "step": 6884 }, { "epoch": 1.3934426229508197, "grad_norm": 0.3131294250488281, "learning_rate": 4.2149084444468656e-05, "loss": 0.1858, "step": 6885 }, { "epoch": 1.3936450111313499, "grad_norm": 0.3579147458076477, "learning_rate": 4.212314304220681e-05, "loss": 0.2275, "step": 6886 }, { "epoch": 1.39384739931188, "grad_norm": 0.3155602812767029, "learning_rate": 4.209720749531806e-05, "loss": 0.1723, "step": 6887 }, { "epoch": 1.3940497874924105, "grad_norm": 0.30170321464538574, "learning_rate": 4.207127780642628e-05, "loss": 0.1445, "step": 6888 }, { "epoch": 1.3942521756729407, "grad_norm": 0.28568869829177856, "learning_rate": 4.204535397815478e-05, "loss": 0.1945, "step": 6889 }, { "epoch": 1.3944545638534709, "grad_norm": 0.2988249957561493, "learning_rate": 4.2019436013126244e-05, "loss": 0.2014, "step": 6890 }, { "epoch": 1.3946569520340013, "grad_norm": 0.3752059042453766, "learning_rate": 4.199352391396281e-05, "loss": 0.2106, "step": 6891 }, { "epoch": 1.3948593402145315, "grad_norm": 0.28706473112106323, "learning_rate": 4.196761768328599e-05, "loss": 0.1889, "step": 6892 }, { "epoch": 1.3950617283950617, "grad_norm": 0.2898298501968384, "learning_rate": 4.1941717323716645e-05, "loss": 0.2069, "step": 6893 }, { "epoch": 1.395264116575592, "grad_norm": 0.3084106743335724, "learning_rate": 4.191582283787515e-05, "loss": 0.2127, "step": 6894 }, { "epoch": 1.395466504756122, "grad_norm": 0.29156970977783203, "learning_rate": 4.188993422838123e-05, "loss": 0.2036, "step": 6895 }, { "epoch": 1.3956688929366525, "grad_norm": 0.2639651596546173, "learning_rate": 4.186405149785403e-05, "loss": 0.2066, "step": 6896 }, { "epoch": 1.3958712811171827, "grad_norm": 0.3042910397052765, "learning_rate": 4.1838174648912074e-05, "loss": 0.2027, "step": 6897 }, { "epoch": 1.3960736692977131, 
"grad_norm": 0.28110945224761963, "learning_rate": 4.1812303684173334e-05, "loss": 0.1957, "step": 6898 }, { "epoch": 1.3962760574782433, "grad_norm": 0.3179463744163513, "learning_rate": 4.178643860625514e-05, "loss": 0.2002, "step": 6899 }, { "epoch": 1.3964784456587735, "grad_norm": 0.24411508440971375, "learning_rate": 4.176057941777427e-05, "loss": 0.1795, "step": 6900 }, { "epoch": 1.3964784456587735, "eval_loss": 0.2689473032951355, "eval_runtime": 0.7378, "eval_samples_per_second": 6.777, "eval_steps_per_second": 1.355, "step": 6900 }, { "epoch": 1.3966808338393037, "grad_norm": 0.27842822670936584, "learning_rate": 4.1734726121346865e-05, "loss": 0.2144, "step": 6901 }, { "epoch": 1.396883222019834, "grad_norm": 0.3558849096298218, "learning_rate": 4.170887871958851e-05, "loss": 0.1779, "step": 6902 }, { "epoch": 1.3970856102003644, "grad_norm": 0.33816561102867126, "learning_rate": 4.168303721511415e-05, "loss": 0.1732, "step": 6903 }, { "epoch": 1.3972879983808946, "grad_norm": 0.3181740641593933, "learning_rate": 4.1657201610538185e-05, "loss": 0.2123, "step": 6904 }, { "epoch": 1.3974903865614248, "grad_norm": 0.2679811120033264, "learning_rate": 4.163137190847437e-05, "loss": 0.2069, "step": 6905 }, { "epoch": 1.3976927747419552, "grad_norm": 0.2734729051589966, "learning_rate": 4.1605548111535894e-05, "loss": 0.1787, "step": 6906 }, { "epoch": 1.3978951629224854, "grad_norm": 0.28582727909088135, "learning_rate": 4.1579730222335333e-05, "loss": 0.2116, "step": 6907 }, { "epoch": 1.3980975511030156, "grad_norm": 0.3257206976413727, "learning_rate": 4.155391824348467e-05, "loss": 0.2168, "step": 6908 }, { "epoch": 1.3982999392835458, "grad_norm": 0.4075670540332794, "learning_rate": 4.152811217759529e-05, "loss": 0.2054, "step": 6909 }, { "epoch": 1.398502327464076, "grad_norm": 0.2531040906906128, "learning_rate": 4.150231202727799e-05, "loss": 0.1747, "step": 6910 }, { "epoch": 1.3987047156446064, "grad_norm": 0.2485847920179367, "learning_rate": 
4.1476517795142945e-05, "loss": 0.1639, "step": 6911 }, { "epoch": 1.3989071038251366, "grad_norm": 0.33335885405540466, "learning_rate": 4.1450729483799746e-05, "loss": 0.1984, "step": 6912 }, { "epoch": 1.3991094920056668, "grad_norm": 0.29418519139289856, "learning_rate": 4.142494709585739e-05, "loss": 0.2102, "step": 6913 }, { "epoch": 1.3993118801861972, "grad_norm": 0.31125608086586, "learning_rate": 4.139917063392427e-05, "loss": 0.2247, "step": 6914 }, { "epoch": 1.3995142683667274, "grad_norm": 0.29453811049461365, "learning_rate": 4.1373400100608194e-05, "loss": 0.201, "step": 6915 }, { "epoch": 1.3997166565472576, "grad_norm": 0.2884596586227417, "learning_rate": 4.1347635498516314e-05, "loss": 0.2132, "step": 6916 }, { "epoch": 1.3999190447277878, "grad_norm": 0.2621874511241913, "learning_rate": 4.132187683025523e-05, "loss": 0.1808, "step": 6917 }, { "epoch": 1.400121432908318, "grad_norm": 0.26058146357536316, "learning_rate": 4.129612409843095e-05, "loss": 0.173, "step": 6918 }, { "epoch": 1.4003238210888485, "grad_norm": 0.2764558494091034, "learning_rate": 4.127037730564888e-05, "loss": 0.1854, "step": 6919 }, { "epoch": 1.4005262092693787, "grad_norm": 0.3047637641429901, "learning_rate": 4.1244636454513766e-05, "loss": 0.1963, "step": 6920 }, { "epoch": 1.4007285974499089, "grad_norm": 0.2553144693374634, "learning_rate": 4.121890154762983e-05, "loss": 0.156, "step": 6921 }, { "epoch": 1.4009309856304393, "grad_norm": 0.3238953948020935, "learning_rate": 4.119317258760066e-05, "loss": 0.1743, "step": 6922 }, { "epoch": 1.4011333738109695, "grad_norm": 0.2943544089794159, "learning_rate": 4.1167449577029224e-05, "loss": 0.2146, "step": 6923 }, { "epoch": 1.4013357619914997, "grad_norm": 0.33927851915359497, "learning_rate": 4.114173251851793e-05, "loss": 0.1974, "step": 6924 }, { "epoch": 1.4015381501720299, "grad_norm": 0.2715267241001129, "learning_rate": 4.1116021414668525e-05, "loss": 0.2103, "step": 6925 }, { "epoch": 1.40174053835256, 
"grad_norm": 0.29221105575561523, "learning_rate": 4.109031626808223e-05, "loss": 0.1773, "step": 6926 }, { "epoch": 1.4019429265330905, "grad_norm": 0.33900484442710876, "learning_rate": 4.106461708135956e-05, "loss": 0.1881, "step": 6927 }, { "epoch": 1.4021453147136207, "grad_norm": 0.27142077684402466, "learning_rate": 4.1038923857100565e-05, "loss": 0.18, "step": 6928 }, { "epoch": 1.4023477028941511, "grad_norm": 0.2642443776130676, "learning_rate": 4.101323659790459e-05, "loss": 0.1691, "step": 6929 }, { "epoch": 1.4025500910746813, "grad_norm": 0.2583783268928528, "learning_rate": 4.09875553063704e-05, "loss": 0.1571, "step": 6930 }, { "epoch": 1.4027524792552115, "grad_norm": 0.3036493957042694, "learning_rate": 4.096187998509614e-05, "loss": 0.1887, "step": 6931 }, { "epoch": 1.4029548674357417, "grad_norm": 0.2827909588813782, "learning_rate": 4.0936210636679386e-05, "loss": 0.1945, "step": 6932 }, { "epoch": 1.403157255616272, "grad_norm": 0.34129467606544495, "learning_rate": 4.091054726371709e-05, "loss": 0.2067, "step": 6933 }, { "epoch": 1.4033596437968023, "grad_norm": 0.2848436236381531, "learning_rate": 4.0884889868805606e-05, "loss": 0.2043, "step": 6934 }, { "epoch": 1.4035620319773325, "grad_norm": 0.28599414229393005, "learning_rate": 4.085923845454067e-05, "loss": 0.198, "step": 6935 }, { "epoch": 1.4037644201578627, "grad_norm": 0.3087374269962311, "learning_rate": 4.0833593023517445e-05, "loss": 0.1792, "step": 6936 }, { "epoch": 1.4039668083383932, "grad_norm": 0.30320149660110474, "learning_rate": 4.080795357833047e-05, "loss": 0.1712, "step": 6937 }, { "epoch": 1.4041691965189234, "grad_norm": 0.3309793770313263, "learning_rate": 4.0782320121573635e-05, "loss": 0.2074, "step": 6938 }, { "epoch": 1.4043715846994536, "grad_norm": 0.30699872970581055, "learning_rate": 4.075669265584028e-05, "loss": 0.2281, "step": 6939 }, { "epoch": 1.4045739728799838, "grad_norm": 0.36575666069984436, "learning_rate": 4.0731071183723135e-05, "loss": 
0.1978, "step": 6940 }, { "epoch": 1.404776361060514, "grad_norm": 0.2952874004840851, "learning_rate": 4.07054557078143e-05, "loss": 0.2132, "step": 6941 }, { "epoch": 1.4049787492410444, "grad_norm": 0.29006901383399963, "learning_rate": 4.067984623070529e-05, "loss": 0.1625, "step": 6942 }, { "epoch": 1.4051811374215746, "grad_norm": 0.286811888217926, "learning_rate": 4.065424275498699e-05, "loss": 0.1725, "step": 6943 }, { "epoch": 1.4053835256021048, "grad_norm": 0.25828805565834045, "learning_rate": 4.062864528324971e-05, "loss": 0.1738, "step": 6944 }, { "epoch": 1.4055859137826352, "grad_norm": 0.267084538936615, "learning_rate": 4.0603053818083125e-05, "loss": 0.1879, "step": 6945 }, { "epoch": 1.4057883019631654, "grad_norm": 0.39066869020462036, "learning_rate": 4.0577468362076297e-05, "loss": 0.1714, "step": 6946 }, { "epoch": 1.4059906901436956, "grad_norm": 0.2583625614643097, "learning_rate": 4.0551888917817716e-05, "loss": 0.1542, "step": 6947 }, { "epoch": 1.4061930783242258, "grad_norm": 0.28583186864852905, "learning_rate": 4.052631548789524e-05, "loss": 0.1832, "step": 6948 }, { "epoch": 1.406395466504756, "grad_norm": 0.3042111098766327, "learning_rate": 4.05007480748961e-05, "loss": 0.2508, "step": 6949 }, { "epoch": 1.4065978546852864, "grad_norm": 0.2741240859031677, "learning_rate": 4.0475186681406954e-05, "loss": 0.1845, "step": 6950 }, { "epoch": 1.4065978546852864, "eval_loss": 0.26788368821144104, "eval_runtime": 0.7398, "eval_samples_per_second": 6.758, "eval_steps_per_second": 1.352, "step": 6950 }, { "epoch": 1.4068002428658166, "grad_norm": 0.2780517339706421, "learning_rate": 4.044963131001383e-05, "loss": 0.194, "step": 6951 }, { "epoch": 1.4070026310463468, "grad_norm": 0.3128349781036377, "learning_rate": 4.0424081963302164e-05, "loss": 0.2026, "step": 6952 }, { "epoch": 1.4072050192268772, "grad_norm": 0.2936166524887085, "learning_rate": 4.0398538643856754e-05, "loss": 0.2069, "step": 6953 }, { "epoch": 1.4074074074074074, 
"grad_norm": 0.2642104923725128, "learning_rate": 4.037300135426182e-05, "loss": 0.1818, "step": 6954 }, { "epoch": 1.4076097955879376, "grad_norm": 0.3181673586368561, "learning_rate": 4.0347470097100934e-05, "loss": 0.2027, "step": 6955 }, { "epoch": 1.4078121837684678, "grad_norm": 0.26954516768455505, "learning_rate": 4.032194487495712e-05, "loss": 0.1967, "step": 6956 }, { "epoch": 1.4080145719489983, "grad_norm": 0.2782799005508423, "learning_rate": 4.029642569041271e-05, "loss": 0.2049, "step": 6957 }, { "epoch": 1.4082169601295285, "grad_norm": 0.2677285969257355, "learning_rate": 4.02709125460495e-05, "loss": 0.1798, "step": 6958 }, { "epoch": 1.4084193483100587, "grad_norm": 0.29425325989723206, "learning_rate": 4.024540544444865e-05, "loss": 0.1793, "step": 6959 }, { "epoch": 1.408621736490589, "grad_norm": 0.2720067799091339, "learning_rate": 4.0219904388190655e-05, "loss": 0.1665, "step": 6960 }, { "epoch": 1.4088241246711193, "grad_norm": 0.28150567412376404, "learning_rate": 4.0194409379855456e-05, "loss": 0.1955, "step": 6961 }, { "epoch": 1.4090265128516495, "grad_norm": 0.29516106843948364, "learning_rate": 4.016892042202239e-05, "loss": 0.1957, "step": 6962 }, { "epoch": 1.4092289010321797, "grad_norm": 0.299935519695282, "learning_rate": 4.014343751727017e-05, "loss": 0.2035, "step": 6963 }, { "epoch": 1.4094312892127099, "grad_norm": 0.3131525218486786, "learning_rate": 4.011796066817686e-05, "loss": 0.1969, "step": 6964 }, { "epoch": 1.4096336773932403, "grad_norm": 0.25779253244400024, "learning_rate": 4.009248987731995e-05, "loss": 0.1746, "step": 6965 }, { "epoch": 1.4098360655737705, "grad_norm": 0.24830158054828644, "learning_rate": 4.006702514727632e-05, "loss": 0.2183, "step": 6966 }, { "epoch": 1.4100384537543007, "grad_norm": 0.29047366976737976, "learning_rate": 4.0041566480622215e-05, "loss": 0.1979, "step": 6967 }, { "epoch": 1.4102408419348311, "grad_norm": 0.27898839116096497, "learning_rate": 4.001611387993327e-05, "loss": 
0.2051, "step": 6968 }, { "epoch": 1.4104432301153613, "grad_norm": 0.30387625098228455, "learning_rate": 3.9990667347784525e-05, "loss": 0.2038, "step": 6969 }, { "epoch": 1.4106456182958915, "grad_norm": 0.29681432247161865, "learning_rate": 3.996522688675038e-05, "loss": 0.2124, "step": 6970 }, { "epoch": 1.4108480064764217, "grad_norm": 0.3277590274810791, "learning_rate": 3.993979249940465e-05, "loss": 0.2206, "step": 6971 }, { "epoch": 1.411050394656952, "grad_norm": 0.3136327266693115, "learning_rate": 3.991436418832051e-05, "loss": 0.2164, "step": 6972 }, { "epoch": 1.4112527828374823, "grad_norm": 0.25054970383644104, "learning_rate": 3.9888941956070525e-05, "loss": 0.1874, "step": 6973 }, { "epoch": 1.4114551710180125, "grad_norm": 0.3261873424053192, "learning_rate": 3.9863525805226664e-05, "loss": 0.1866, "step": 6974 }, { "epoch": 1.4116575591985427, "grad_norm": 0.26633456349372864, "learning_rate": 3.983811573836025e-05, "loss": 0.1779, "step": 6975 }, { "epoch": 1.4118599473790732, "grad_norm": 0.3039325773715973, "learning_rate": 3.981271175804201e-05, "loss": 0.2131, "step": 6976 }, { "epoch": 1.4120623355596034, "grad_norm": 0.2937398850917816, "learning_rate": 3.978731386684206e-05, "loss": 0.1881, "step": 6977 }, { "epoch": 1.4122647237401336, "grad_norm": 0.2729499936103821, "learning_rate": 3.976192206732989e-05, "loss": 0.1745, "step": 6978 }, { "epoch": 1.4124671119206638, "grad_norm": 0.2721683979034424, "learning_rate": 3.973653636207437e-05, "loss": 0.2057, "step": 6979 }, { "epoch": 1.412669500101194, "grad_norm": 0.267595112323761, "learning_rate": 3.971115675364378e-05, "loss": 0.1943, "step": 6980 }, { "epoch": 1.4128718882817244, "grad_norm": 0.30413955450057983, "learning_rate": 3.9685783244605726e-05, "loss": 0.2195, "step": 6981 }, { "epoch": 1.4130742764622546, "grad_norm": 0.23452375829219818, "learning_rate": 3.966041583752726e-05, "loss": 0.1611, "step": 6982 }, { "epoch": 1.4132766646427848, "grad_norm": 0.2777538597583771, 
"learning_rate": 3.963505453497478e-05, "loss": 0.1663, "step": 6983 }, { "epoch": 1.4134790528233152, "grad_norm": 0.2605833411216736, "learning_rate": 3.960969933951409e-05, "loss": 0.1797, "step": 6984 }, { "epoch": 1.4136814410038454, "grad_norm": 0.2730015814304352, "learning_rate": 3.9584350253710345e-05, "loss": 0.205, "step": 6985 }, { "epoch": 1.4138838291843756, "grad_norm": 0.3188944160938263, "learning_rate": 3.9559007280128105e-05, "loss": 0.2173, "step": 6986 }, { "epoch": 1.4140862173649058, "grad_norm": 0.27802667021751404, "learning_rate": 3.9533670421331314e-05, "loss": 0.1775, "step": 6987 }, { "epoch": 1.4142886055454362, "grad_norm": 0.27725279331207275, "learning_rate": 3.9508339679883276e-05, "loss": 0.1848, "step": 6988 }, { "epoch": 1.4144909937259664, "grad_norm": 0.25943297147750854, "learning_rate": 3.948301505834671e-05, "loss": 0.1987, "step": 6989 }, { "epoch": 1.4146933819064966, "grad_norm": 0.3306175768375397, "learning_rate": 3.9457696559283674e-05, "loss": 0.186, "step": 6990 }, { "epoch": 1.414895770087027, "grad_norm": 0.2895580530166626, "learning_rate": 3.9432384185255635e-05, "loss": 0.1855, "step": 6991 }, { "epoch": 1.4150981582675572, "grad_norm": 0.26445573568344116, "learning_rate": 3.940707793882344e-05, "loss": 0.1952, "step": 6992 }, { "epoch": 1.4153005464480874, "grad_norm": 0.28116846084594727, "learning_rate": 3.9381777822547305e-05, "loss": 0.1798, "step": 6993 }, { "epoch": 1.4155029346286176, "grad_norm": 0.2855150103569031, "learning_rate": 3.935648383898683e-05, "loss": 0.1667, "step": 6994 }, { "epoch": 1.4157053228091478, "grad_norm": 0.25917181372642517, "learning_rate": 3.9331195990701e-05, "loss": 0.1764, "step": 6995 }, { "epoch": 1.4159077109896783, "grad_norm": 0.3372366726398468, "learning_rate": 3.930591428024816e-05, "loss": 0.203, "step": 6996 }, { "epoch": 1.4161100991702085, "grad_norm": 0.32218167185783386, "learning_rate": 3.9280638710186056e-05, "loss": 0.2263, "step": 6997 }, { "epoch": 
1.4163124873507387, "grad_norm": 0.2682129442691803, "learning_rate": 3.925536928307181e-05, "loss": 0.1852, "step": 6998 }, { "epoch": 1.416514875531269, "grad_norm": 0.25663620233535767, "learning_rate": 3.923010600146192e-05, "loss": 0.1841, "step": 6999 }, { "epoch": 1.4167172637117993, "grad_norm": 0.2717934548854828, "learning_rate": 3.920484886791225e-05, "loss": 0.1916, "step": 7000 }, { "epoch": 1.4167172637117993, "eval_loss": 0.2678060531616211, "eval_runtime": 0.7392, "eval_samples_per_second": 6.764, "eval_steps_per_second": 1.353, "step": 7000 }, { "epoch": 1.4169196518923295, "grad_norm": 0.25787022709846497, "learning_rate": 3.917959788497805e-05, "loss": 0.1891, "step": 7001 }, { "epoch": 1.4171220400728597, "grad_norm": 0.2321719080209732, "learning_rate": 3.9154353055213955e-05, "loss": 0.1634, "step": 7002 }, { "epoch": 1.41732442825339, "grad_norm": 0.2759595513343811, "learning_rate": 3.912911438117397e-05, "loss": 0.1692, "step": 7003 }, { "epoch": 1.4175268164339203, "grad_norm": 0.26630786061286926, "learning_rate": 3.910388186541153e-05, "loss": 0.1929, "step": 7004 }, { "epoch": 1.4177292046144505, "grad_norm": 0.28287503123283386, "learning_rate": 3.90786555104793e-05, "loss": 0.2087, "step": 7005 }, { "epoch": 1.4179315927949807, "grad_norm": 0.2573223114013672, "learning_rate": 3.9053435318929464e-05, "loss": 0.1451, "step": 7006 }, { "epoch": 1.4181339809755111, "grad_norm": 0.23673537373542786, "learning_rate": 3.902822129331355e-05, "loss": 0.1607, "step": 7007 }, { "epoch": 1.4183363691560413, "grad_norm": 0.26111093163490295, "learning_rate": 3.900301343618242e-05, "loss": 0.2106, "step": 7008 }, { "epoch": 1.4185387573365715, "grad_norm": 0.27173036336898804, "learning_rate": 3.897781175008637e-05, "loss": 0.1738, "step": 7009 }, { "epoch": 1.4187411455171017, "grad_norm": 0.300983190536499, "learning_rate": 3.895261623757502e-05, "loss": 0.1811, "step": 7010 }, { "epoch": 1.418943533697632, "grad_norm": 0.2922830581665039, 
"learning_rate": 3.89274269011974e-05, "loss": 0.2008, "step": 7011 }, { "epoch": 1.4191459218781624, "grad_norm": 0.2900276482105255, "learning_rate": 3.89022437435019e-05, "loss": 0.187, "step": 7012 }, { "epoch": 1.4193483100586926, "grad_norm": 0.30275261402130127, "learning_rate": 3.887706676703628e-05, "loss": 0.1927, "step": 7013 }, { "epoch": 1.4195506982392228, "grad_norm": 0.24182263016700745, "learning_rate": 3.88518959743477e-05, "loss": 0.1538, "step": 7014 }, { "epoch": 1.4197530864197532, "grad_norm": 0.2707059383392334, "learning_rate": 3.882673136798265e-05, "loss": 0.1697, "step": 7015 }, { "epoch": 1.4199554746002834, "grad_norm": 0.25626611709594727, "learning_rate": 3.880157295048704e-05, "loss": 0.183, "step": 7016 }, { "epoch": 1.4201578627808136, "grad_norm": 0.3871743083000183, "learning_rate": 3.8776420724406136e-05, "loss": 0.1843, "step": 7017 }, { "epoch": 1.4203602509613438, "grad_norm": 0.29843243956565857, "learning_rate": 3.875127469228458e-05, "loss": 0.2119, "step": 7018 }, { "epoch": 1.4205626391418742, "grad_norm": 0.2737175524234772, "learning_rate": 3.872613485666636e-05, "loss": 0.1581, "step": 7019 }, { "epoch": 1.4207650273224044, "grad_norm": 0.2567245364189148, "learning_rate": 3.870100122009488e-05, "loss": 0.1756, "step": 7020 }, { "epoch": 1.4209674155029346, "grad_norm": 0.2890596389770508, "learning_rate": 3.867587378511291e-05, "loss": 0.1906, "step": 7021 }, { "epoch": 1.421169803683465, "grad_norm": 0.3265067934989929, "learning_rate": 3.8650752554262536e-05, "loss": 0.2011, "step": 7022 }, { "epoch": 1.4213721918639952, "grad_norm": 0.30416449904441833, "learning_rate": 3.86256375300853e-05, "loss": 0.2116, "step": 7023 }, { "epoch": 1.4215745800445254, "grad_norm": 0.2644321024417877, "learning_rate": 3.8600528715122074e-05, "loss": 0.2059, "step": 7024 }, { "epoch": 1.4217769682250556, "grad_norm": 0.2900888919830322, "learning_rate": 3.8575426111913084e-05, "loss": 0.1599, "step": 7025 }, { "epoch": 
1.4219793564055858, "grad_norm": 0.2844480574131012, "learning_rate": 3.855032972299797e-05, "loss": 0.2083, "step": 7026 }, { "epoch": 1.4221817445861162, "grad_norm": 0.3461550176143646, "learning_rate": 3.852523955091569e-05, "loss": 0.2172, "step": 7027 }, { "epoch": 1.4223841327666464, "grad_norm": 0.28394609689712524, "learning_rate": 3.8500155598204644e-05, "loss": 0.2022, "step": 7028 }, { "epoch": 1.4225865209471766, "grad_norm": 0.3086096942424774, "learning_rate": 3.847507786740254e-05, "loss": 0.2031, "step": 7029 }, { "epoch": 1.422788909127707, "grad_norm": 0.31477683782577515, "learning_rate": 3.845000636104649e-05, "loss": 0.1966, "step": 7030 }, { "epoch": 1.4229912973082373, "grad_norm": 0.27445054054260254, "learning_rate": 3.842494108167294e-05, "loss": 0.1893, "step": 7031 }, { "epoch": 1.4231936854887675, "grad_norm": 0.29484322667121887, "learning_rate": 3.839988203181777e-05, "loss": 0.1978, "step": 7032 }, { "epoch": 1.4233960736692977, "grad_norm": 0.33337923884391785, "learning_rate": 3.837482921401616e-05, "loss": 0.1961, "step": 7033 }, { "epoch": 1.4235984618498279, "grad_norm": 0.2541923224925995, "learning_rate": 3.834978263080271e-05, "loss": 0.1719, "step": 7034 }, { "epoch": 1.4238008500303583, "grad_norm": 0.28883957862854004, "learning_rate": 3.8324742284711366e-05, "loss": 0.1913, "step": 7035 }, { "epoch": 1.4240032382108885, "grad_norm": 0.31055620312690735, "learning_rate": 3.829970817827545e-05, "loss": 0.2066, "step": 7036 }, { "epoch": 1.4242056263914187, "grad_norm": 0.2655206620693207, "learning_rate": 3.8274680314027646e-05, "loss": 0.1796, "step": 7037 }, { "epoch": 1.424408014571949, "grad_norm": 0.28076305985450745, "learning_rate": 3.824965869450001e-05, "loss": 0.1964, "step": 7038 }, { "epoch": 1.4246104027524793, "grad_norm": 0.314505398273468, "learning_rate": 3.822464332222396e-05, "loss": 0.2218, "step": 7039 }, { "epoch": 1.4248127909330095, "grad_norm": 0.30106157064437866, "learning_rate": 
3.819963419973031e-05, "loss": 0.2023, "step": 7040 }, { "epoch": 1.4250151791135397, "grad_norm": 0.2859259247779846, "learning_rate": 3.8174631329549203e-05, "loss": 0.1928, "step": 7041 }, { "epoch": 1.42521756729407, "grad_norm": 0.2816571295261383, "learning_rate": 3.814963471421017e-05, "loss": 0.1849, "step": 7042 }, { "epoch": 1.4254199554746003, "grad_norm": 0.27684688568115234, "learning_rate": 3.812464435624211e-05, "loss": 0.1767, "step": 7043 }, { "epoch": 1.4256223436551305, "grad_norm": 0.2891367971897125, "learning_rate": 3.8099660258173285e-05, "loss": 0.1881, "step": 7044 }, { "epoch": 1.4258247318356607, "grad_norm": 0.30000269412994385, "learning_rate": 3.8074682422531314e-05, "loss": 0.1921, "step": 7045 }, { "epoch": 1.4260271200161911, "grad_norm": 0.2510431706905365, "learning_rate": 3.804971085184321e-05, "loss": 0.1945, "step": 7046 }, { "epoch": 1.4262295081967213, "grad_norm": 0.3182806968688965, "learning_rate": 3.802474554863532e-05, "loss": 0.1756, "step": 7047 }, { "epoch": 1.4264318963772515, "grad_norm": 0.29959988594055176, "learning_rate": 3.799978651543341e-05, "loss": 0.2023, "step": 7048 }, { "epoch": 1.4266342845577817, "grad_norm": 0.3611461818218231, "learning_rate": 3.797483375476251e-05, "loss": 0.2177, "step": 7049 }, { "epoch": 1.4268366727383122, "grad_norm": 0.28023532032966614, "learning_rate": 3.79498872691471e-05, "loss": 0.2006, "step": 7050 }, { "epoch": 1.4268366727383122, "eval_loss": 0.26923832297325134, "eval_runtime": 0.7415, "eval_samples_per_second": 6.743, "eval_steps_per_second": 1.349, "step": 7050 }, { "epoch": 1.4270390609188424, "grad_norm": 0.3193739652633667, "learning_rate": 3.792494706111102e-05, "loss": 0.2008, "step": 7051 }, { "epoch": 1.4272414490993726, "grad_norm": 0.29817917943000793, "learning_rate": 3.790001313317745e-05, "loss": 0.1809, "step": 7052 }, { "epoch": 1.427443837279903, "grad_norm": 0.2573106288909912, "learning_rate": 3.787508548786893e-05, "loss": 0.189, "step": 7053 }, { 
"epoch": 1.4276462254604332, "grad_norm": 0.2731283903121948, "learning_rate": 3.785016412770741e-05, "loss": 0.1973, "step": 7054 }, { "epoch": 1.4278486136409634, "grad_norm": 0.3203198313713074, "learning_rate": 3.782524905521414e-05, "loss": 0.2265, "step": 7055 }, { "epoch": 1.4280510018214936, "grad_norm": 0.2824562191963196, "learning_rate": 3.780034027290978e-05, "loss": 0.1963, "step": 7056 }, { "epoch": 1.4282533900020238, "grad_norm": 0.2550790309906006, "learning_rate": 3.777543778331435e-05, "loss": 0.1694, "step": 7057 }, { "epoch": 1.4284557781825542, "grad_norm": 0.30352532863616943, "learning_rate": 3.7750541588947195e-05, "loss": 0.1808, "step": 7058 }, { "epoch": 1.4286581663630844, "grad_norm": 0.2527769207954407, "learning_rate": 3.772565169232707e-05, "loss": 0.1661, "step": 7059 }, { "epoch": 1.4288605545436146, "grad_norm": 0.30140969157218933, "learning_rate": 3.7700768095972074e-05, "loss": 0.176, "step": 7060 }, { "epoch": 1.429062942724145, "grad_norm": 0.318043977022171, "learning_rate": 3.767589080239966e-05, "loss": 0.189, "step": 7061 }, { "epoch": 1.4292653309046752, "grad_norm": 0.32130885124206543, "learning_rate": 3.7651019814126654e-05, "loss": 0.2084, "step": 7062 }, { "epoch": 1.4294677190852054, "grad_norm": 0.30957019329071045, "learning_rate": 3.762615513366925e-05, "loss": 0.2183, "step": 7063 }, { "epoch": 1.4296701072657356, "grad_norm": 0.31902942061424255, "learning_rate": 3.760129676354298e-05, "loss": 0.1922, "step": 7064 }, { "epoch": 1.4298724954462658, "grad_norm": 0.26623645424842834, "learning_rate": 3.757644470626276e-05, "loss": 0.1567, "step": 7065 }, { "epoch": 1.4300748836267962, "grad_norm": 0.2927635610103607, "learning_rate": 3.755159896434287e-05, "loss": 0.1668, "step": 7066 }, { "epoch": 1.4302772718073264, "grad_norm": 0.28568488359451294, "learning_rate": 3.752675954029693e-05, "loss": 0.1999, "step": 7067 }, { "epoch": 1.4304796599878566, "grad_norm": 0.25858959555625916, "learning_rate": 
3.7501926436637934e-05, "loss": 0.1602, "step": 7068 }, { "epoch": 1.430682048168387, "grad_norm": 0.28040850162506104, "learning_rate": 3.7477099655878236e-05, "loss": 0.1841, "step": 7069 }, { "epoch": 1.4308844363489173, "grad_norm": 0.27585768699645996, "learning_rate": 3.7452279200529585e-05, "loss": 0.1719, "step": 7070 }, { "epoch": 1.4310868245294475, "grad_norm": 0.3029848337173462, "learning_rate": 3.742746507310299e-05, "loss": 0.1921, "step": 7071 }, { "epoch": 1.4312892127099777, "grad_norm": 0.308578759431839, "learning_rate": 3.74026572761089e-05, "loss": 0.2057, "step": 7072 }, { "epoch": 1.4314916008905079, "grad_norm": 0.3463033139705658, "learning_rate": 3.737785581205713e-05, "loss": 0.1898, "step": 7073 }, { "epoch": 1.4316939890710383, "grad_norm": 0.3103969693183899, "learning_rate": 3.735306068345681e-05, "loss": 0.1944, "step": 7074 }, { "epoch": 1.4318963772515685, "grad_norm": 0.2698296904563904, "learning_rate": 3.732827189281647e-05, "loss": 0.1753, "step": 7075 }, { "epoch": 1.4320987654320987, "grad_norm": 0.2952782213687897, "learning_rate": 3.730348944264398e-05, "loss": 0.2036, "step": 7076 }, { "epoch": 1.432301153612629, "grad_norm": 0.24207934737205505, "learning_rate": 3.7278713335446557e-05, "loss": 0.1865, "step": 7077 }, { "epoch": 1.4325035417931593, "grad_norm": 0.26546210050582886, "learning_rate": 3.7253943573730784e-05, "loss": 0.1585, "step": 7078 }, { "epoch": 1.4327059299736895, "grad_norm": 0.3293583393096924, "learning_rate": 3.722918016000263e-05, "loss": 0.1835, "step": 7079 }, { "epoch": 1.4329083181542197, "grad_norm": 0.319829523563385, "learning_rate": 3.720442309676733e-05, "loss": 0.1847, "step": 7080 }, { "epoch": 1.4331107063347501, "grad_norm": 0.2597779929637909, "learning_rate": 3.717967238652964e-05, "loss": 0.1772, "step": 7081 }, { "epoch": 1.4333130945152803, "grad_norm": 0.23758916556835175, "learning_rate": 3.7154928031793526e-05, "loss": 0.1486, "step": 7082 }, { "epoch": 1.4335154826958105, 
"grad_norm": 0.3300015926361084, "learning_rate": 3.713019003506237e-05, "loss": 0.1764, "step": 7083 }, { "epoch": 1.433717870876341, "grad_norm": 0.3280850350856781, "learning_rate": 3.71054583988389e-05, "loss": 0.192, "step": 7084 }, { "epoch": 1.4339202590568711, "grad_norm": 0.241069957613945, "learning_rate": 3.70807331256252e-05, "loss": 0.1598, "step": 7085 }, { "epoch": 1.4341226472374013, "grad_norm": 0.29670462012290955, "learning_rate": 3.705601421792273e-05, "loss": 0.2229, "step": 7086 }, { "epoch": 1.4343250354179315, "grad_norm": 0.2805311977863312, "learning_rate": 3.7031301678232266e-05, "loss": 0.1959, "step": 7087 }, { "epoch": 1.4345274235984617, "grad_norm": 0.2845379114151001, "learning_rate": 3.700659550905398e-05, "loss": 0.1719, "step": 7088 }, { "epoch": 1.4347298117789922, "grad_norm": 0.27402937412261963, "learning_rate": 3.698189571288737e-05, "loss": 0.1479, "step": 7089 }, { "epoch": 1.4349321999595224, "grad_norm": 0.37877580523490906, "learning_rate": 3.695720229223132e-05, "loss": 0.2259, "step": 7090 }, { "epoch": 1.4351345881400526, "grad_norm": 0.32928740978240967, "learning_rate": 3.6932515249584045e-05, "loss": 0.2107, "step": 7091 }, { "epoch": 1.435336976320583, "grad_norm": 0.29471495747566223, "learning_rate": 3.690783458744311e-05, "loss": 0.2261, "step": 7092 }, { "epoch": 1.4355393645011132, "grad_norm": 0.31165096163749695, "learning_rate": 3.688316030830549e-05, "loss": 0.1891, "step": 7093 }, { "epoch": 1.4357417526816434, "grad_norm": 0.4806102514266968, "learning_rate": 3.685849241466739e-05, "loss": 0.2203, "step": 7094 }, { "epoch": 1.4359441408621736, "grad_norm": 0.25433045625686646, "learning_rate": 3.6833830909024505e-05, "loss": 0.1395, "step": 7095 }, { "epoch": 1.4361465290427038, "grad_norm": 0.2732140123844147, "learning_rate": 3.680917579387181e-05, "loss": 0.1718, "step": 7096 }, { "epoch": 1.4363489172232342, "grad_norm": 0.3305196166038513, "learning_rate": 3.678452707170364e-05, "loss": 0.2393, 
"step": 7097 }, { "epoch": 1.4365513054037644, "grad_norm": 0.2531237304210663, "learning_rate": 3.675988474501373e-05, "loss": 0.1803, "step": 7098 }, { "epoch": 1.4367536935842946, "grad_norm": 0.2620668113231659, "learning_rate": 3.6735248816295096e-05, "loss": 0.1948, "step": 7099 }, { "epoch": 1.436956081764825, "grad_norm": 0.2772290110588074, "learning_rate": 3.671061928804016e-05, "loss": 0.1965, "step": 7100 }, { "epoch": 1.436956081764825, "eval_loss": 0.2654963731765747, "eval_runtime": 0.7381, "eval_samples_per_second": 6.775, "eval_steps_per_second": 1.355, "step": 7100 }, { "epoch": 1.4371584699453552, "grad_norm": 0.3738824427127838, "learning_rate": 3.6685996162740674e-05, "loss": 0.2033, "step": 7101 }, { "epoch": 1.4373608581258854, "grad_norm": 0.29637742042541504, "learning_rate": 3.6661379442887755e-05, "loss": 0.197, "step": 7102 }, { "epoch": 1.4375632463064156, "grad_norm": 0.284424364566803, "learning_rate": 3.663676913097186e-05, "loss": 0.1912, "step": 7103 }, { "epoch": 1.4377656344869458, "grad_norm": 0.3330419957637787, "learning_rate": 3.66121652294828e-05, "loss": 0.1742, "step": 7104 }, { "epoch": 1.4379680226674763, "grad_norm": 0.2758113741874695, "learning_rate": 3.6587567740909746e-05, "loss": 0.1907, "step": 7105 }, { "epoch": 1.4381704108480065, "grad_norm": 0.2782094478607178, "learning_rate": 3.65629766677412e-05, "loss": 0.1927, "step": 7106 }, { "epoch": 1.4383727990285367, "grad_norm": 0.3029071092605591, "learning_rate": 3.653839201246504e-05, "loss": 0.2061, "step": 7107 }, { "epoch": 1.438575187209067, "grad_norm": 0.3084353506565094, "learning_rate": 3.6513813777568485e-05, "loss": 0.2026, "step": 7108 }, { "epoch": 1.4387775753895973, "grad_norm": 0.31316351890563965, "learning_rate": 3.648924196553809e-05, "loss": 0.202, "step": 7109 }, { "epoch": 1.4389799635701275, "grad_norm": 0.31355270743370056, "learning_rate": 3.646467657885979e-05, "loss": 0.1952, "step": 7110 }, { "epoch": 1.4391823517506577, "grad_norm": 
0.29359108209609985, "learning_rate": 3.6440117620018844e-05, "loss": 0.1893, "step": 7111 }, { "epoch": 1.439384739931188, "grad_norm": 0.27208781242370605, "learning_rate": 3.641556509149987e-05, "loss": 0.1721, "step": 7112 }, { "epoch": 1.4395871281117183, "grad_norm": 0.27961140871047974, "learning_rate": 3.639101899578684e-05, "loss": 0.1964, "step": 7113 }, { "epoch": 1.4397895162922485, "grad_norm": 0.2883656322956085, "learning_rate": 3.636647933536306e-05, "loss": 0.1949, "step": 7114 }, { "epoch": 1.439991904472779, "grad_norm": 0.29237887263298035, "learning_rate": 3.634194611271124e-05, "loss": 0.222, "step": 7115 }, { "epoch": 1.4401942926533091, "grad_norm": 0.5512852072715759, "learning_rate": 3.6317419330313316e-05, "loss": 0.201, "step": 7116 }, { "epoch": 1.4403966808338393, "grad_norm": 0.3003794550895691, "learning_rate": 3.6292898990650704e-05, "loss": 0.1926, "step": 7117 }, { "epoch": 1.4405990690143695, "grad_norm": 0.30134889483451843, "learning_rate": 3.62683850962041e-05, "loss": 0.214, "step": 7118 }, { "epoch": 1.4408014571948997, "grad_norm": 0.31887561082839966, "learning_rate": 3.624387764945355e-05, "loss": 0.1939, "step": 7119 }, { "epoch": 1.4410038453754301, "grad_norm": 0.30428701639175415, "learning_rate": 3.6219376652878476e-05, "loss": 0.2034, "step": 7120 }, { "epoch": 1.4412062335559603, "grad_norm": 0.2859341502189636, "learning_rate": 3.619488210895763e-05, "loss": 0.1735, "step": 7121 }, { "epoch": 1.4414086217364905, "grad_norm": 0.3013683259487152, "learning_rate": 3.617039402016912e-05, "loss": 0.2124, "step": 7122 }, { "epoch": 1.441611009917021, "grad_norm": 0.28692567348480225, "learning_rate": 3.614591238899039e-05, "loss": 0.2221, "step": 7123 }, { "epoch": 1.4418133980975512, "grad_norm": 0.3107813894748688, "learning_rate": 3.612143721789821e-05, "loss": 0.1963, "step": 7124 }, { "epoch": 1.4420157862780814, "grad_norm": 0.2541584074497223, "learning_rate": 3.609696850936877e-05, "loss": 0.1875, "step": 7125 
}, { "epoch": 1.4422181744586116, "grad_norm": 0.2540287673473358, "learning_rate": 3.607250626587752e-05, "loss": 0.1891, "step": 7126 }, { "epoch": 1.4424205626391418, "grad_norm": 0.2570984661579132, "learning_rate": 3.604805048989929e-05, "loss": 0.1861, "step": 7127 }, { "epoch": 1.4426229508196722, "grad_norm": 0.23033881187438965, "learning_rate": 3.602360118390828e-05, "loss": 0.1512, "step": 7128 }, { "epoch": 1.4428253390002024, "grad_norm": 0.25252917408943176, "learning_rate": 3.5999158350378e-05, "loss": 0.1603, "step": 7129 }, { "epoch": 1.4430277271807326, "grad_norm": 0.276273250579834, "learning_rate": 3.5974721991781334e-05, "loss": 0.2062, "step": 7130 }, { "epoch": 1.443230115361263, "grad_norm": 0.3179681897163391, "learning_rate": 3.595029211059049e-05, "loss": 0.2001, "step": 7131 }, { "epoch": 1.4434325035417932, "grad_norm": 0.25583258271217346, "learning_rate": 3.592586870927701e-05, "loss": 0.1672, "step": 7132 }, { "epoch": 1.4436348917223234, "grad_norm": 0.2345746010541916, "learning_rate": 3.590145179031183e-05, "loss": 0.1596, "step": 7133 }, { "epoch": 1.4438372799028536, "grad_norm": 0.26826149225234985, "learning_rate": 3.5877041356165165e-05, "loss": 0.156, "step": 7134 }, { "epoch": 1.4440396680833838, "grad_norm": 0.2847994863986969, "learning_rate": 3.585263740930662e-05, "loss": 0.1912, "step": 7135 }, { "epoch": 1.4442420562639142, "grad_norm": 0.2812890410423279, "learning_rate": 3.5828239952205136e-05, "loss": 0.1913, "step": 7136 }, { "epoch": 1.4444444444444444, "grad_norm": 0.26837995648384094, "learning_rate": 3.580384898732899e-05, "loss": 0.1794, "step": 7137 }, { "epoch": 1.4446468326249746, "grad_norm": 0.28359875082969666, "learning_rate": 3.57794645171458e-05, "loss": 0.2171, "step": 7138 }, { "epoch": 1.444849220805505, "grad_norm": 0.2712598443031311, "learning_rate": 3.575508654412253e-05, "loss": 0.1673, "step": 7139 }, { "epoch": 1.4450516089860352, "grad_norm": 0.31262731552124023, "learning_rate": 
3.5730715070725483e-05, "loss": 0.195, "step": 7140 }, { "epoch": 1.4452539971665654, "grad_norm": 0.2699423134326935, "learning_rate": 3.570635009942033e-05, "loss": 0.1895, "step": 7141 }, { "epoch": 1.4454563853470956, "grad_norm": 0.35048073530197144, "learning_rate": 3.568199163267203e-05, "loss": 0.2072, "step": 7142 }, { "epoch": 1.445658773527626, "grad_norm": 0.3334614038467407, "learning_rate": 3.565763967294495e-05, "loss": 0.2004, "step": 7143 }, { "epoch": 1.4458611617081563, "grad_norm": 0.3023090362548828, "learning_rate": 3.563329422270274e-05, "loss": 0.2038, "step": 7144 }, { "epoch": 1.4460635498886865, "grad_norm": 0.2510114908218384, "learning_rate": 3.5608955284408443e-05, "loss": 0.1772, "step": 7145 }, { "epoch": 1.4462659380692169, "grad_norm": 0.2934434115886688, "learning_rate": 3.5584622860524385e-05, "loss": 0.2137, "step": 7146 }, { "epoch": 1.446468326249747, "grad_norm": 0.2533113360404968, "learning_rate": 3.5560296953512295e-05, "loss": 0.1719, "step": 7147 }, { "epoch": 1.4466707144302773, "grad_norm": 0.2543116807937622, "learning_rate": 3.55359775658332e-05, "loss": 0.1822, "step": 7148 }, { "epoch": 1.4468731026108075, "grad_norm": 0.3006589710712433, "learning_rate": 3.551166469994748e-05, "loss": 0.1703, "step": 7149 }, { "epoch": 1.4470754907913377, "grad_norm": 0.28558582067489624, "learning_rate": 3.548735835831486e-05, "loss": 0.1909, "step": 7150 }, { "epoch": 1.4470754907913377, "eval_loss": 0.2670600414276123, "eval_runtime": 0.7391, "eval_samples_per_second": 6.765, "eval_steps_per_second": 1.353, "step": 7150 }, { "epoch": 1.447277878971868, "grad_norm": 0.3041171431541443, "learning_rate": 3.546305854339439e-05, "loss": 0.2061, "step": 7151 }, { "epoch": 1.4474802671523983, "grad_norm": 0.2890252470970154, "learning_rate": 3.543876525764449e-05, "loss": 0.1843, "step": 7152 }, { "epoch": 1.4476826553329285, "grad_norm": 0.40670862793922424, "learning_rate": 3.5414478503522873e-05, "loss": 0.2172, "step": 7153 }, { 
"epoch": 1.447885043513459, "grad_norm": 0.25703248381614685, "learning_rate": 3.5390198283486654e-05, "loss": 0.187, "step": 7154 }, { "epoch": 1.4480874316939891, "grad_norm": 0.31184110045433044, "learning_rate": 3.536592459999221e-05, "loss": 0.2156, "step": 7155 }, { "epoch": 1.4482898198745193, "grad_norm": 0.26967254281044006, "learning_rate": 3.5341657455495325e-05, "loss": 0.1816, "step": 7156 }, { "epoch": 1.4484922080550495, "grad_norm": 0.3085002601146698, "learning_rate": 3.531739685245109e-05, "loss": 0.2036, "step": 7157 }, { "epoch": 1.4486945962355797, "grad_norm": 0.25523659586906433, "learning_rate": 3.5293142793313925e-05, "loss": 0.1647, "step": 7158 }, { "epoch": 1.4488969844161101, "grad_norm": 0.281577467918396, "learning_rate": 3.526889528053765e-05, "loss": 0.2121, "step": 7159 }, { "epoch": 1.4490993725966403, "grad_norm": 0.2778119444847107, "learning_rate": 3.52446543165753e-05, "loss": 0.1907, "step": 7160 }, { "epoch": 1.4493017607771705, "grad_norm": 0.3081774413585663, "learning_rate": 3.522041990387935e-05, "loss": 0.1941, "step": 7161 }, { "epoch": 1.449504148957701, "grad_norm": 0.44540053606033325, "learning_rate": 3.519619204490161e-05, "loss": 0.1423, "step": 7162 }, { "epoch": 1.4497065371382312, "grad_norm": 0.31192874908447266, "learning_rate": 3.517197074209316e-05, "loss": 0.193, "step": 7163 }, { "epoch": 1.4499089253187614, "grad_norm": 0.3415822982788086, "learning_rate": 3.514775599790448e-05, "loss": 0.2248, "step": 7164 }, { "epoch": 1.4501113134992916, "grad_norm": 0.2829976975917816, "learning_rate": 3.512354781478537e-05, "loss": 0.197, "step": 7165 }, { "epoch": 1.4503137016798218, "grad_norm": 0.2971900701522827, "learning_rate": 3.509934619518494e-05, "loss": 0.193, "step": 7166 }, { "epoch": 1.4505160898603522, "grad_norm": 0.2710355818271637, "learning_rate": 3.5075151141551686e-05, "loss": 0.1712, "step": 7167 }, { "epoch": 1.4507184780408824, "grad_norm": 0.27177128195762634, "learning_rate": 
3.5050962656333376e-05, "loss": 0.1729, "step": 7168 }, { "epoch": 1.4509208662214128, "grad_norm": 0.290659099817276, "learning_rate": 3.502678074197716e-05, "loss": 0.177, "step": 7169 }, { "epoch": 1.451123254401943, "grad_norm": 0.29393497109413147, "learning_rate": 3.500260540092952e-05, "loss": 0.2, "step": 7170 }, { "epoch": 1.4513256425824732, "grad_norm": 0.2999376952648163, "learning_rate": 3.497843663563626e-05, "loss": 0.1784, "step": 7171 }, { "epoch": 1.4515280307630034, "grad_norm": 0.2778777778148651, "learning_rate": 3.49542744485425e-05, "loss": 0.1785, "step": 7172 }, { "epoch": 1.4517304189435336, "grad_norm": 0.28878605365753174, "learning_rate": 3.493011884209275e-05, "loss": 0.1847, "step": 7173 }, { "epoch": 1.451932807124064, "grad_norm": 0.3202642798423767, "learning_rate": 3.49059698187308e-05, "loss": 0.1723, "step": 7174 }, { "epoch": 1.4521351953045942, "grad_norm": 0.26450544595718384, "learning_rate": 3.48818273808998e-05, "loss": 0.1764, "step": 7175 }, { "epoch": 1.4523375834851244, "grad_norm": 0.3330189883708954, "learning_rate": 3.485769153104222e-05, "loss": 0.2002, "step": 7176 }, { "epoch": 1.4525399716656548, "grad_norm": 0.2631731927394867, "learning_rate": 3.4833562271599896e-05, "loss": 0.176, "step": 7177 }, { "epoch": 1.452742359846185, "grad_norm": 0.2750820219516754, "learning_rate": 3.480943960501395e-05, "loss": 0.1472, "step": 7178 }, { "epoch": 1.4529447480267152, "grad_norm": 0.26182010769844055, "learning_rate": 3.478532353372487e-05, "loss": 0.1669, "step": 7179 }, { "epoch": 1.4531471362072454, "grad_norm": 0.3083231747150421, "learning_rate": 3.476121406017246e-05, "loss": 0.2078, "step": 7180 }, { "epoch": 1.4533495243877756, "grad_norm": 0.2545689344406128, "learning_rate": 3.473711118679587e-05, "loss": 0.156, "step": 7181 }, { "epoch": 1.453551912568306, "grad_norm": 0.2764873206615448, "learning_rate": 3.471301491603358e-05, "loss": 0.1938, "step": 7182 }, { "epoch": 1.4537543007488363, "grad_norm": 
0.3199172616004944, "learning_rate": 3.468892525032339e-05, "loss": 0.2067, "step": 7183 }, { "epoch": 1.4539566889293665, "grad_norm": 0.27605000138282776, "learning_rate": 3.466484219210244e-05, "loss": 0.1767, "step": 7184 }, { "epoch": 1.454159077109897, "grad_norm": 0.2564321458339691, "learning_rate": 3.46407657438072e-05, "loss": 0.1897, "step": 7185 }, { "epoch": 1.454361465290427, "grad_norm": 0.28144845366477966, "learning_rate": 3.461669590787348e-05, "loss": 0.2016, "step": 7186 }, { "epoch": 1.4545638534709573, "grad_norm": 0.317074716091156, "learning_rate": 3.4592632686736406e-05, "loss": 0.2276, "step": 7187 }, { "epoch": 1.4547662416514875, "grad_norm": 0.2932002544403076, "learning_rate": 3.456857608283045e-05, "loss": 0.2215, "step": 7188 }, { "epoch": 1.4549686298320177, "grad_norm": 0.2682000994682312, "learning_rate": 3.454452609858939e-05, "loss": 0.201, "step": 7189 }, { "epoch": 1.4551710180125481, "grad_norm": 0.27240079641342163, "learning_rate": 3.452048273644638e-05, "loss": 0.1955, "step": 7190 }, { "epoch": 1.4553734061930783, "grad_norm": 0.2567487359046936, "learning_rate": 3.449644599883385e-05, "loss": 0.1824, "step": 7191 }, { "epoch": 1.4555757943736085, "grad_norm": 0.30283913016319275, "learning_rate": 3.447241588818358e-05, "loss": 0.2202, "step": 7192 }, { "epoch": 1.455778182554139, "grad_norm": 0.2953677773475647, "learning_rate": 3.444839240692671e-05, "loss": 0.1909, "step": 7193 }, { "epoch": 1.4559805707346691, "grad_norm": 0.28766822814941406, "learning_rate": 3.4424375557493674e-05, "loss": 0.1699, "step": 7194 }, { "epoch": 1.4561829589151993, "grad_norm": 0.267906129360199, "learning_rate": 3.4400365342314245e-05, "loss": 0.199, "step": 7195 }, { "epoch": 1.4563853470957295, "grad_norm": 0.25349098443984985, "learning_rate": 3.437636176381751e-05, "loss": 0.1693, "step": 7196 }, { "epoch": 1.4565877352762597, "grad_norm": 0.23480501770973206, "learning_rate": 3.4352364824431914e-05, "loss": 0.1416, "step": 7197 }, 
{ "epoch": 1.4567901234567902, "grad_norm": 0.3415687680244446, "learning_rate": 3.4328374526585215e-05, "loss": 0.2447, "step": 7198 }, { "epoch": 1.4569925116373204, "grad_norm": 0.26703810691833496, "learning_rate": 3.430439087270449e-05, "loss": 0.1838, "step": 7199 }, { "epoch": 1.4571948998178508, "grad_norm": 0.378103107213974, "learning_rate": 3.428041386521618e-05, "loss": 0.1944, "step": 7200 }, { "epoch": 1.4571948998178508, "eval_loss": 0.265423983335495, "eval_runtime": 0.7377, "eval_samples_per_second": 6.778, "eval_steps_per_second": 1.356, "step": 7200 }, { "epoch": 1.457397287998381, "grad_norm": 0.31857749819755554, "learning_rate": 3.425644350654599e-05, "loss": 0.1793, "step": 7201 }, { "epoch": 1.4575996761789112, "grad_norm": 0.3083222806453705, "learning_rate": 3.4232479799119e-05, "loss": 0.1843, "step": 7202 }, { "epoch": 1.4578020643594414, "grad_norm": 0.29255810379981995, "learning_rate": 3.420852274535963e-05, "loss": 0.1863, "step": 7203 }, { "epoch": 1.4580044525399716, "grad_norm": 0.2882266342639923, "learning_rate": 3.418457234769161e-05, "loss": 0.2145, "step": 7204 }, { "epoch": 1.458206840720502, "grad_norm": 0.2893981337547302, "learning_rate": 3.4160628608537935e-05, "loss": 0.1946, "step": 7205 }, { "epoch": 1.4584092289010322, "grad_norm": 0.3118157386779785, "learning_rate": 3.4136691530321016e-05, "loss": 0.2245, "step": 7206 }, { "epoch": 1.4586116170815624, "grad_norm": 0.33280250430107117, "learning_rate": 3.411276111546254e-05, "loss": 0.1947, "step": 7207 }, { "epoch": 1.4588140052620928, "grad_norm": 0.2869172990322113, "learning_rate": 3.4088837366383565e-05, "loss": 0.1946, "step": 7208 }, { "epoch": 1.459016393442623, "grad_norm": 0.315771222114563, "learning_rate": 3.406492028550442e-05, "loss": 0.2258, "step": 7209 }, { "epoch": 1.4592187816231532, "grad_norm": 0.3115472197532654, "learning_rate": 3.404100987524479e-05, "loss": 0.1907, "step": 7210 }, { "epoch": 1.4594211698036834, "grad_norm": 
0.345337450504303, "learning_rate": 3.401710613802368e-05, "loss": 0.2074, "step": 7211 }, { "epoch": 1.4596235579842136, "grad_norm": 0.3084411919116974, "learning_rate": 3.399320907625942e-05, "loss": 0.1961, "step": 7212 }, { "epoch": 1.459825946164744, "grad_norm": 0.31509360671043396, "learning_rate": 3.396931869236967e-05, "loss": 0.218, "step": 7213 }, { "epoch": 1.4600283343452742, "grad_norm": 0.25666624307632446, "learning_rate": 3.39454349887714e-05, "loss": 0.1809, "step": 7214 }, { "epoch": 1.4602307225258044, "grad_norm": 0.3312767446041107, "learning_rate": 3.392155796788091e-05, "loss": 0.2241, "step": 7215 }, { "epoch": 1.4604331107063349, "grad_norm": 0.28062111139297485, "learning_rate": 3.389768763211384e-05, "loss": 0.1524, "step": 7216 }, { "epoch": 1.460635498886865, "grad_norm": 0.24783366918563843, "learning_rate": 3.387382398388513e-05, "loss": 0.168, "step": 7217 }, { "epoch": 1.4608378870673953, "grad_norm": 0.23416613042354584, "learning_rate": 3.384996702560905e-05, "loss": 0.1463, "step": 7218 }, { "epoch": 1.4610402752479255, "grad_norm": 0.3014012277126312, "learning_rate": 3.382611675969921e-05, "loss": 0.1943, "step": 7219 }, { "epoch": 1.4612426634284557, "grad_norm": 0.30629321932792664, "learning_rate": 3.3802273188568514e-05, "loss": 0.1925, "step": 7220 }, { "epoch": 1.461445051608986, "grad_norm": 0.2842909097671509, "learning_rate": 3.3778436314629216e-05, "loss": 0.2063, "step": 7221 }, { "epoch": 1.4616474397895163, "grad_norm": 0.28999006748199463, "learning_rate": 3.3754606140292875e-05, "loss": 0.1774, "step": 7222 }, { "epoch": 1.4618498279700465, "grad_norm": 0.30419832468032837, "learning_rate": 3.3730782667970375e-05, "loss": 0.1816, "step": 7223 }, { "epoch": 1.462052216150577, "grad_norm": 0.2875352203845978, "learning_rate": 3.370696590007194e-05, "loss": 0.2169, "step": 7224 }, { "epoch": 1.462254604331107, "grad_norm": 0.40214404463768005, "learning_rate": 3.3683155839007086e-05, "loss": 0.206, "step": 7225 }, 
{ "epoch": 1.4624569925116373, "grad_norm": 0.2744888365268707, "learning_rate": 3.36593524871847e-05, "loss": 0.1899, "step": 7226 }, { "epoch": 1.4626593806921675, "grad_norm": 0.28675928711891174, "learning_rate": 3.363555584701289e-05, "loss": 0.1886, "step": 7227 }, { "epoch": 1.4628617688726977, "grad_norm": 0.25819188356399536, "learning_rate": 3.361176592089919e-05, "loss": 0.1784, "step": 7228 }, { "epoch": 1.4630641570532281, "grad_norm": 0.2800363600254059, "learning_rate": 3.358798271125041e-05, "loss": 0.1907, "step": 7229 }, { "epoch": 1.4632665452337583, "grad_norm": 0.3032877445220947, "learning_rate": 3.3564206220472684e-05, "loss": 0.1924, "step": 7230 }, { "epoch": 1.4634689334142887, "grad_norm": 0.27191731333732605, "learning_rate": 3.354043645097147e-05, "loss": 0.1863, "step": 7231 }, { "epoch": 1.463671321594819, "grad_norm": 0.2605868875980377, "learning_rate": 3.351667340515154e-05, "loss": 0.1899, "step": 7232 }, { "epoch": 1.4638737097753491, "grad_norm": 0.2882217466831207, "learning_rate": 3.349291708541696e-05, "loss": 0.2119, "step": 7233 }, { "epoch": 1.4640760979558793, "grad_norm": 0.26905587315559387, "learning_rate": 3.346916749417123e-05, "loss": 0.1699, "step": 7234 }, { "epoch": 1.4642784861364095, "grad_norm": 0.30613499879837036, "learning_rate": 3.344542463381701e-05, "loss": 0.2023, "step": 7235 }, { "epoch": 1.46448087431694, "grad_norm": 0.3181071877479553, "learning_rate": 3.3421688506756386e-05, "loss": 0.2077, "step": 7236 }, { "epoch": 1.4646832624974702, "grad_norm": 0.34396564960479736, "learning_rate": 3.339795911539072e-05, "loss": 0.2227, "step": 7237 }, { "epoch": 1.4648856506780004, "grad_norm": 0.26022300124168396, "learning_rate": 3.33742364621207e-05, "loss": 0.1496, "step": 7238 }, { "epoch": 1.4650880388585308, "grad_norm": 0.3025625944137573, "learning_rate": 3.335052054934634e-05, "loss": 0.2067, "step": 7239 }, { "epoch": 1.465290427039061, "grad_norm": 0.3104478120803833, "learning_rate": 
3.332681137946697e-05, "loss": 0.2209, "step": 7240 }, { "epoch": 1.4654928152195912, "grad_norm": 0.2558635175228119, "learning_rate": 3.3303108954881226e-05, "loss": 0.1905, "step": 7241 }, { "epoch": 1.4656952034001214, "grad_norm": 0.25812438130378723, "learning_rate": 3.327941327798708e-05, "loss": 0.1568, "step": 7242 }, { "epoch": 1.4658975915806516, "grad_norm": 0.2858171761035919, "learning_rate": 3.3255724351181804e-05, "loss": 0.1883, "step": 7243 }, { "epoch": 1.466099979761182, "grad_norm": 0.27138751745224, "learning_rate": 3.3232042176862e-05, "loss": 0.183, "step": 7244 }, { "epoch": 1.4663023679417122, "grad_norm": 0.2960588037967682, "learning_rate": 3.320836675742358e-05, "loss": 0.165, "step": 7245 }, { "epoch": 1.4665047561222424, "grad_norm": 0.31151270866394043, "learning_rate": 3.3184698095261766e-05, "loss": 0.195, "step": 7246 }, { "epoch": 1.4667071443027728, "grad_norm": 0.3337216079235077, "learning_rate": 3.3161036192771134e-05, "loss": 0.1813, "step": 7247 }, { "epoch": 1.466909532483303, "grad_norm": 0.2706492245197296, "learning_rate": 3.313738105234554e-05, "loss": 0.1679, "step": 7248 }, { "epoch": 1.4671119206638332, "grad_norm": 0.2831174433231354, "learning_rate": 3.311373267637813e-05, "loss": 0.2286, "step": 7249 }, { "epoch": 1.4673143088443634, "grad_norm": 0.32331645488739014, "learning_rate": 3.309009106726141e-05, "loss": 0.2308, "step": 7250 }, { "epoch": 1.4673143088443634, "eval_loss": 0.2653513550758362, "eval_runtime": 0.7361, "eval_samples_per_second": 6.793, "eval_steps_per_second": 1.359, "step": 7250 }, { "epoch": 1.4675166970248936, "grad_norm": 0.27429506182670593, "learning_rate": 3.30664562273872e-05, "loss": 0.1592, "step": 7251 }, { "epoch": 1.467719085205424, "grad_norm": 0.2732314467430115, "learning_rate": 3.304282815914662e-05, "loss": 0.2064, "step": 7252 }, { "epoch": 1.4679214733859542, "grad_norm": 0.3681529462337494, "learning_rate": 3.301920686493012e-05, "loss": 0.2259, "step": 7253 }, { 
"epoch": 1.4681238615664844, "grad_norm": 0.30304232239723206, "learning_rate": 3.299559234712745e-05, "loss": 0.22, "step": 7254 }, { "epoch": 1.4683262497470149, "grad_norm": 0.25034329295158386, "learning_rate": 3.297198460812767e-05, "loss": 0.1799, "step": 7255 }, { "epoch": 1.468528637927545, "grad_norm": 0.26537463068962097, "learning_rate": 3.294838365031917e-05, "loss": 0.1747, "step": 7256 }, { "epoch": 1.4687310261080753, "grad_norm": 0.2688654661178589, "learning_rate": 3.2924789476089644e-05, "loss": 0.1727, "step": 7257 }, { "epoch": 1.4689334142886055, "grad_norm": 0.2707471251487732, "learning_rate": 3.2901202087826124e-05, "loss": 0.1915, "step": 7258 }, { "epoch": 1.4691358024691357, "grad_norm": 0.30396607518196106, "learning_rate": 3.28776214879149e-05, "loss": 0.2073, "step": 7259 }, { "epoch": 1.469338190649666, "grad_norm": 0.29711097478866577, "learning_rate": 3.2854047678741625e-05, "loss": 0.1772, "step": 7260 }, { "epoch": 1.4695405788301963, "grad_norm": 0.3758523166179657, "learning_rate": 3.2830480662691265e-05, "loss": 0.2179, "step": 7261 }, { "epoch": 1.4697429670107267, "grad_norm": 0.2720712125301361, "learning_rate": 3.280692044214807e-05, "loss": 0.184, "step": 7262 }, { "epoch": 1.469945355191257, "grad_norm": 0.2819099724292755, "learning_rate": 3.27833670194956e-05, "loss": 0.1723, "step": 7263 }, { "epoch": 1.470147743371787, "grad_norm": 0.3061216175556183, "learning_rate": 3.2759820397116766e-05, "loss": 0.1959, "step": 7264 }, { "epoch": 1.4703501315523173, "grad_norm": 0.3212586045265198, "learning_rate": 3.273628057739378e-05, "loss": 0.1751, "step": 7265 }, { "epoch": 1.4705525197328475, "grad_norm": 0.2917107343673706, "learning_rate": 3.2712747562708115e-05, "loss": 0.1878, "step": 7266 }, { "epoch": 1.470754907913378, "grad_norm": 0.3021541237831116, "learning_rate": 3.2689221355440615e-05, "loss": 0.1802, "step": 7267 }, { "epoch": 1.4709572960939081, "grad_norm": 0.3198321461677551, "learning_rate": 
3.266570195797142e-05, "loss": 0.205, "step": 7268 }, { "epoch": 1.4711596842744383, "grad_norm": 0.254639208316803, "learning_rate": 3.264218937267996e-05, "loss": 0.1688, "step": 7269 }, { "epoch": 1.4713620724549688, "grad_norm": 0.26952847838401794, "learning_rate": 3.261868360194501e-05, "loss": 0.1957, "step": 7270 }, { "epoch": 1.471564460635499, "grad_norm": 0.3483128845691681, "learning_rate": 3.259518464814466e-05, "loss": 0.2195, "step": 7271 }, { "epoch": 1.4717668488160291, "grad_norm": 0.2697324752807617, "learning_rate": 3.2571692513656226e-05, "loss": 0.1582, "step": 7272 }, { "epoch": 1.4719692369965593, "grad_norm": 0.24079424142837524, "learning_rate": 3.254820720085643e-05, "loss": 0.1647, "step": 7273 }, { "epoch": 1.4721716251770895, "grad_norm": 0.2517320513725281, "learning_rate": 3.252472871212125e-05, "loss": 0.1839, "step": 7274 }, { "epoch": 1.47237401335762, "grad_norm": 0.2581874430179596, "learning_rate": 3.250125704982603e-05, "loss": 0.1523, "step": 7275 }, { "epoch": 1.4725764015381502, "grad_norm": 0.27980589866638184, "learning_rate": 3.247779221634535e-05, "loss": 0.2085, "step": 7276 }, { "epoch": 1.4727787897186804, "grad_norm": 0.26071032881736755, "learning_rate": 3.245433421405315e-05, "loss": 0.1746, "step": 7277 }, { "epoch": 1.4729811778992108, "grad_norm": 0.3794530928134918, "learning_rate": 3.243088304532268e-05, "loss": 0.2054, "step": 7278 }, { "epoch": 1.473183566079741, "grad_norm": 0.28098902106285095, "learning_rate": 3.240743871252646e-05, "loss": 0.1807, "step": 7279 }, { "epoch": 1.4733859542602712, "grad_norm": 0.25913292169570923, "learning_rate": 3.238400121803635e-05, "loss": 0.1559, "step": 7280 }, { "epoch": 1.4735883424408014, "grad_norm": 0.2775214910507202, "learning_rate": 3.2360570564223514e-05, "loss": 0.2044, "step": 7281 }, { "epoch": 1.4737907306213316, "grad_norm": 0.31280606985092163, "learning_rate": 3.233714675345841e-05, "loss": 0.2048, "step": 7282 }, { "epoch": 1.473993118801862, 
"grad_norm": 0.337022602558136, "learning_rate": 3.231372978811082e-05, "loss": 0.1898, "step": 7283 }, { "epoch": 1.4741955069823922, "grad_norm": 0.2549079954624176, "learning_rate": 3.229031967054983e-05, "loss": 0.1852, "step": 7284 }, { "epoch": 1.4743978951629224, "grad_norm": 0.2797609865665436, "learning_rate": 3.226691640314382e-05, "loss": 0.1924, "step": 7285 }, { "epoch": 1.4746002833434528, "grad_norm": 0.29348504543304443, "learning_rate": 3.2243519988260495e-05, "loss": 0.198, "step": 7286 }, { "epoch": 1.474802671523983, "grad_norm": 0.3945770561695099, "learning_rate": 3.2220130428266874e-05, "loss": 0.2054, "step": 7287 }, { "epoch": 1.4750050597045132, "grad_norm": 0.2882711887359619, "learning_rate": 3.2196747725529234e-05, "loss": 0.1845, "step": 7288 }, { "epoch": 1.4752074478850434, "grad_norm": 0.25591593980789185, "learning_rate": 3.217337188241321e-05, "loss": 0.1597, "step": 7289 }, { "epoch": 1.4754098360655736, "grad_norm": 0.31044450402259827, "learning_rate": 3.2150002901283714e-05, "loss": 0.1921, "step": 7290 }, { "epoch": 1.475612224246104, "grad_norm": 0.2782125473022461, "learning_rate": 3.2126640784504956e-05, "loss": 0.1686, "step": 7291 }, { "epoch": 1.4758146124266343, "grad_norm": 0.28518062829971313, "learning_rate": 3.210328553444053e-05, "loss": 0.2184, "step": 7292 }, { "epoch": 1.4760170006071647, "grad_norm": 0.3196036219596863, "learning_rate": 3.207993715345328e-05, "loss": 0.2072, "step": 7293 }, { "epoch": 1.4762193887876949, "grad_norm": 0.28817251324653625, "learning_rate": 3.205659564390527e-05, "loss": 0.1819, "step": 7294 }, { "epoch": 1.476421776968225, "grad_norm": 0.27735039591789246, "learning_rate": 3.203326100815799e-05, "loss": 0.2075, "step": 7295 }, { "epoch": 1.4766241651487553, "grad_norm": 0.2327238917350769, "learning_rate": 3.2009933248572196e-05, "loss": 0.1262, "step": 7296 }, { "epoch": 1.4768265533292855, "grad_norm": 0.2707853615283966, "learning_rate": 3.1986612367507954e-05, "loss": 
0.2087, "step": 7297 }, { "epoch": 1.477028941509816, "grad_norm": 0.272924542427063, "learning_rate": 3.1963298367324613e-05, "loss": 0.1893, "step": 7298 }, { "epoch": 1.477231329690346, "grad_norm": 0.2852308750152588, "learning_rate": 3.193999125038083e-05, "loss": 0.1875, "step": 7299 }, { "epoch": 1.4774337178708763, "grad_norm": 0.28374990820884705, "learning_rate": 3.191669101903459e-05, "loss": 0.2186, "step": 7300 }, { "epoch": 1.4774337178708763, "eval_loss": 0.26597627997398376, "eval_runtime": 0.737, "eval_samples_per_second": 6.784, "eval_steps_per_second": 1.357, "step": 7300 }, { "epoch": 1.4776361060514067, "grad_norm": 0.24514828622341156, "learning_rate": 3.1893397675643176e-05, "loss": 0.1843, "step": 7301 }, { "epoch": 1.477838494231937, "grad_norm": 0.29631340503692627, "learning_rate": 3.187011122256314e-05, "loss": 0.1901, "step": 7302 }, { "epoch": 1.4780408824124671, "grad_norm": 0.313414603471756, "learning_rate": 3.184683166215038e-05, "loss": 0.1935, "step": 7303 }, { "epoch": 1.4782432705929973, "grad_norm": 0.289034366607666, "learning_rate": 3.1823558996760064e-05, "loss": 0.1774, "step": 7304 }, { "epoch": 1.4784456587735275, "grad_norm": 0.25100448727607727, "learning_rate": 3.180029322874668e-05, "loss": 0.1883, "step": 7305 }, { "epoch": 1.478648046954058, "grad_norm": 0.2747223675251007, "learning_rate": 3.177703436046401e-05, "loss": 0.2212, "step": 7306 }, { "epoch": 1.4788504351345881, "grad_norm": 0.31175172328948975, "learning_rate": 3.175378239426515e-05, "loss": 0.2123, "step": 7307 }, { "epoch": 1.4790528233151183, "grad_norm": 0.3189109265804291, "learning_rate": 3.17305373325025e-05, "loss": 0.1731, "step": 7308 }, { "epoch": 1.4792552114956488, "grad_norm": 0.285255491733551, "learning_rate": 3.170729917752773e-05, "loss": 0.1806, "step": 7309 }, { "epoch": 1.479457599676179, "grad_norm": 0.26634594798088074, "learning_rate": 3.1684067931691844e-05, "loss": 0.1765, "step": 7310 }, { "epoch": 1.4796599878567092, 
"grad_norm": 0.2741389274597168, "learning_rate": 3.1660843597345135e-05, "loss": 0.2044, "step": 7311 }, { "epoch": 1.4798623760372394, "grad_norm": 0.34997615218162537, "learning_rate": 3.16376261768372e-05, "loss": 0.2245, "step": 7312 }, { "epoch": 1.4800647642177696, "grad_norm": 0.3089703917503357, "learning_rate": 3.1614415672516914e-05, "loss": 0.206, "step": 7313 }, { "epoch": 1.4802671523983, "grad_norm": 0.2964693009853363, "learning_rate": 3.1591212086732504e-05, "loss": 0.1781, "step": 7314 }, { "epoch": 1.4804695405788302, "grad_norm": 0.3101029098033905, "learning_rate": 3.1568015421831475e-05, "loss": 0.1944, "step": 7315 }, { "epoch": 1.4806719287593604, "grad_norm": 0.3056146502494812, "learning_rate": 3.154482568016057e-05, "loss": 0.1865, "step": 7316 }, { "epoch": 1.4808743169398908, "grad_norm": 0.2606217861175537, "learning_rate": 3.15216428640659e-05, "loss": 0.1797, "step": 7317 }, { "epoch": 1.481076705120421, "grad_norm": 0.24992996454238892, "learning_rate": 3.149846697589288e-05, "loss": 0.1795, "step": 7318 }, { "epoch": 1.4812790933009512, "grad_norm": 0.2621716558933258, "learning_rate": 3.14752980179862e-05, "loss": 0.2016, "step": 7319 }, { "epoch": 1.4814814814814814, "grad_norm": 0.2822463810443878, "learning_rate": 3.1452135992689836e-05, "loss": 0.2244, "step": 7320 }, { "epoch": 1.4816838696620118, "grad_norm": 0.29446855187416077, "learning_rate": 3.1428980902347084e-05, "loss": 0.1815, "step": 7321 }, { "epoch": 1.481886257842542, "grad_norm": 0.24313370883464813, "learning_rate": 3.140583274930055e-05, "loss": 0.1586, "step": 7322 }, { "epoch": 1.4820886460230722, "grad_norm": 0.31904950737953186, "learning_rate": 3.1382691535892086e-05, "loss": 0.1789, "step": 7323 }, { "epoch": 1.4822910342036026, "grad_norm": 0.3379060626029968, "learning_rate": 3.135955726446291e-05, "loss": 0.2011, "step": 7324 }, { "epoch": 1.4824934223841328, "grad_norm": 0.29935017228126526, "learning_rate": 3.133642993735349e-05, "loss": 0.2265, 
"step": 7325 }, { "epoch": 1.482695810564663, "grad_norm": 0.30864277482032776, "learning_rate": 3.1313309556903626e-05, "loss": 0.2143, "step": 7326 }, { "epoch": 1.4828981987451932, "grad_norm": 0.2781793475151062, "learning_rate": 3.1290196125452366e-05, "loss": 0.1825, "step": 7327 }, { "epoch": 1.4831005869257234, "grad_norm": 0.2661908268928528, "learning_rate": 3.12670896453381e-05, "loss": 0.1537, "step": 7328 }, { "epoch": 1.4833029751062539, "grad_norm": 0.26809945702552795, "learning_rate": 3.12439901188985e-05, "loss": 0.1801, "step": 7329 }, { "epoch": 1.483505363286784, "grad_norm": 0.24053886532783508, "learning_rate": 3.1220897548470526e-05, "loss": 0.1554, "step": 7330 }, { "epoch": 1.4837077514673143, "grad_norm": 0.7539495229721069, "learning_rate": 3.1197811936390456e-05, "loss": 0.2089, "step": 7331 }, { "epoch": 1.4839101396478447, "grad_norm": 0.2663455009460449, "learning_rate": 3.117473328499384e-05, "loss": 0.1806, "step": 7332 }, { "epoch": 1.4841125278283749, "grad_norm": 0.3148098587989807, "learning_rate": 3.115166159661553e-05, "loss": 0.2188, "step": 7333 }, { "epoch": 1.484314916008905, "grad_norm": 0.2693808078765869, "learning_rate": 3.112859687358969e-05, "loss": 0.2097, "step": 7334 }, { "epoch": 1.4845173041894353, "grad_norm": 0.23006850481033325, "learning_rate": 3.110553911824975e-05, "loss": 0.1786, "step": 7335 }, { "epoch": 1.4847196923699655, "grad_norm": 0.3721303939819336, "learning_rate": 3.108248833292846e-05, "loss": 0.2057, "step": 7336 }, { "epoch": 1.484922080550496, "grad_norm": 0.2560385763645172, "learning_rate": 3.105944451995786e-05, "loss": 0.1713, "step": 7337 }, { "epoch": 1.485124468731026, "grad_norm": 0.28081420063972473, "learning_rate": 3.103640768166928e-05, "loss": 0.1893, "step": 7338 }, { "epoch": 1.4853268569115563, "grad_norm": 0.2766065001487732, "learning_rate": 3.101337782039334e-05, "loss": 0.1871, "step": 7339 }, { "epoch": 1.4855292450920867, "grad_norm": 0.2626553475856781, 
"learning_rate": 3.0990354938459964e-05, "loss": 0.1924, "step": 7340 }, { "epoch": 1.485731633272617, "grad_norm": 0.3107207417488098, "learning_rate": 3.096733903819837e-05, "loss": 0.1927, "step": 7341 }, { "epoch": 1.4859340214531471, "grad_norm": 0.2315855324268341, "learning_rate": 3.094433012193706e-05, "loss": 0.1639, "step": 7342 }, { "epoch": 1.4861364096336773, "grad_norm": 0.24030828475952148, "learning_rate": 3.092132819200383e-05, "loss": 0.1924, "step": 7343 }, { "epoch": 1.4863387978142075, "grad_norm": 0.30377745628356934, "learning_rate": 3.089833325072578e-05, "loss": 0.1586, "step": 7344 }, { "epoch": 1.486541185994738, "grad_norm": 0.2831876873970032, "learning_rate": 3.08753453004293e-05, "loss": 0.2345, "step": 7345 }, { "epoch": 1.4867435741752681, "grad_norm": 0.24797627329826355, "learning_rate": 3.085236434344008e-05, "loss": 0.1671, "step": 7346 }, { "epoch": 1.4869459623557983, "grad_norm": 0.27497178316116333, "learning_rate": 3.082939038208306e-05, "loss": 0.183, "step": 7347 }, { "epoch": 1.4871483505363288, "grad_norm": 0.3301697075366974, "learning_rate": 3.080642341868252e-05, "loss": 0.2028, "step": 7348 }, { "epoch": 1.487350738716859, "grad_norm": 0.2448139786720276, "learning_rate": 3.078346345556202e-05, "loss": 0.1587, "step": 7349 }, { "epoch": 1.4875531268973892, "grad_norm": 0.2885264754295349, "learning_rate": 3.0760510495044413e-05, "loss": 0.1975, "step": 7350 }, { "epoch": 1.4875531268973892, "eval_loss": 0.26434725522994995, "eval_runtime": 0.7415, "eval_samples_per_second": 6.743, "eval_steps_per_second": 1.349, "step": 7350 }, { "epoch": 1.4877555150779194, "grad_norm": 0.2968370318412781, "learning_rate": 3.0737564539451835e-05, "loss": 0.1849, "step": 7351 }, { "epoch": 1.4879579032584498, "grad_norm": 0.33912134170532227, "learning_rate": 3.0714625591105704e-05, "loss": 0.2101, "step": 7352 }, { "epoch": 1.48816029143898, "grad_norm": 0.25544485449790955, "learning_rate": 3.069169365232676e-05, "loss": 0.1996, 
"step": 7353 }, { "epoch": 1.4883626796195102, "grad_norm": 0.3111973702907562, "learning_rate": 3.0668768725435004e-05, "loss": 0.2041, "step": 7354 }, { "epoch": 1.4885650678000406, "grad_norm": 0.2621065378189087, "learning_rate": 3.0645850812749743e-05, "loss": 0.17, "step": 7355 }, { "epoch": 1.4887674559805708, "grad_norm": 0.2855996787548065, "learning_rate": 3.062293991658958e-05, "loss": 0.2126, "step": 7356 }, { "epoch": 1.488969844161101, "grad_norm": 0.28797435760498047, "learning_rate": 3.060003603927238e-05, "loss": 0.1567, "step": 7357 }, { "epoch": 1.4891722323416312, "grad_norm": 0.2602500915527344, "learning_rate": 3.0577139183115346e-05, "loss": 0.1521, "step": 7358 }, { "epoch": 1.4893746205221614, "grad_norm": 0.2664164900779724, "learning_rate": 3.0554249350434905e-05, "loss": 0.1871, "step": 7359 }, { "epoch": 1.4895770087026918, "grad_norm": 0.3115113377571106, "learning_rate": 3.053136654354687e-05, "loss": 0.2127, "step": 7360 }, { "epoch": 1.489779396883222, "grad_norm": 0.2899424731731415, "learning_rate": 3.0508490764766208e-05, "loss": 0.1914, "step": 7361 }, { "epoch": 1.4899817850637522, "grad_norm": 0.29101699590682983, "learning_rate": 3.0485622016407277e-05, "loss": 0.1894, "step": 7362 }, { "epoch": 1.4901841732442827, "grad_norm": 0.3022737205028534, "learning_rate": 3.046276030078371e-05, "loss": 0.1906, "step": 7363 }, { "epoch": 1.4903865614248129, "grad_norm": 0.2707030773162842, "learning_rate": 3.043990562020842e-05, "loss": 0.1956, "step": 7364 }, { "epoch": 1.490588949605343, "grad_norm": 0.2698463201522827, "learning_rate": 3.0417057976993578e-05, "loss": 0.1597, "step": 7365 }, { "epoch": 1.4907913377858732, "grad_norm": 0.278828501701355, "learning_rate": 3.0394217373450695e-05, "loss": 0.1911, "step": 7366 }, { "epoch": 1.4909937259664034, "grad_norm": 0.25586068630218506, "learning_rate": 3.037138381189053e-05, "loss": 0.1566, "step": 7367 }, { "epoch": 1.4911961141469339, "grad_norm": 0.3123515546321869, 
"learning_rate": 3.0348557294623136e-05, "loss": 0.2018, "step": 7368 }, { "epoch": 1.491398502327464, "grad_norm": 0.3360455334186554, "learning_rate": 3.032573782395789e-05, "loss": 0.1848, "step": 7369 }, { "epoch": 1.4916008905079943, "grad_norm": 0.32559528946876526, "learning_rate": 3.0302925402203396e-05, "loss": 0.1947, "step": 7370 }, { "epoch": 1.4918032786885247, "grad_norm": 0.2493031769990921, "learning_rate": 3.028012003166758e-05, "loss": 0.1699, "step": 7371 }, { "epoch": 1.492005666869055, "grad_norm": 0.2718972861766815, "learning_rate": 3.0257321714657673e-05, "loss": 0.205, "step": 7372 }, { "epoch": 1.492208055049585, "grad_norm": 0.3305763900279999, "learning_rate": 3.0234530453480137e-05, "loss": 0.1773, "step": 7373 }, { "epoch": 1.4924104432301153, "grad_norm": 0.23928435146808624, "learning_rate": 3.0211746250440775e-05, "loss": 0.1497, "step": 7374 }, { "epoch": 1.4926128314106455, "grad_norm": 0.29377129673957825, "learning_rate": 3.0188969107844655e-05, "loss": 0.199, "step": 7375 }, { "epoch": 1.492815219591176, "grad_norm": 0.31887951493263245, "learning_rate": 3.0166199027996113e-05, "loss": 0.1961, "step": 7376 }, { "epoch": 1.4930176077717061, "grad_norm": 0.3151382505893707, "learning_rate": 3.01434360131988e-05, "loss": 0.1694, "step": 7377 }, { "epoch": 1.4932199959522363, "grad_norm": 0.2822098135948181, "learning_rate": 3.0120680065755635e-05, "loss": 0.2054, "step": 7378 }, { "epoch": 1.4934223841327667, "grad_norm": 0.2944871485233307, "learning_rate": 3.009793118796882e-05, "loss": 0.1962, "step": 7379 }, { "epoch": 1.493624772313297, "grad_norm": 0.314345121383667, "learning_rate": 3.0075189382139856e-05, "loss": 0.1739, "step": 7380 }, { "epoch": 1.4938271604938271, "grad_norm": 0.3003896176815033, "learning_rate": 3.0052454650569528e-05, "loss": 0.1954, "step": 7381 }, { "epoch": 1.4940295486743573, "grad_norm": 0.29708629846572876, "learning_rate": 3.0029726995557904e-05, "loss": 0.1756, "step": 7382 }, { "epoch": 
1.4942319368548878, "grad_norm": 0.30133432149887085, "learning_rate": 3.0007006419404283e-05, "loss": 0.1944, "step": 7383 }, { "epoch": 1.494434325035418, "grad_norm": 0.2833723723888397, "learning_rate": 2.9984292924407332e-05, "loss": 0.2044, "step": 7384 }, { "epoch": 1.4946367132159482, "grad_norm": 0.2950754761695862, "learning_rate": 2.9961586512864947e-05, "loss": 0.1766, "step": 7385 }, { "epoch": 1.4948391013964786, "grad_norm": 0.2856682240962982, "learning_rate": 2.9938887187074314e-05, "loss": 0.1868, "step": 7386 }, { "epoch": 1.4950414895770088, "grad_norm": 0.23965318500995636, "learning_rate": 2.9916194949331956e-05, "loss": 0.1623, "step": 7387 }, { "epoch": 1.495243877757539, "grad_norm": 0.30067017674446106, "learning_rate": 2.9893509801933615e-05, "loss": 0.198, "step": 7388 }, { "epoch": 1.4954462659380692, "grad_norm": 0.2467557042837143, "learning_rate": 2.9870831747174333e-05, "loss": 0.17, "step": 7389 }, { "epoch": 1.4956486541185994, "grad_norm": 0.2811994254589081, "learning_rate": 2.9848160787348435e-05, "loss": 0.2015, "step": 7390 }, { "epoch": 1.4958510422991298, "grad_norm": 0.27143871784210205, "learning_rate": 2.982549692474954e-05, "loss": 0.1726, "step": 7391 }, { "epoch": 1.49605343047966, "grad_norm": 0.297503799200058, "learning_rate": 2.980284016167053e-05, "loss": 0.2209, "step": 7392 }, { "epoch": 1.4962558186601902, "grad_norm": 0.27570927143096924, "learning_rate": 2.978019050040358e-05, "loss": 0.2072, "step": 7393 }, { "epoch": 1.4964582068407206, "grad_norm": 0.2997628450393677, "learning_rate": 2.975754794324015e-05, "loss": 0.2031, "step": 7394 }, { "epoch": 1.4966605950212508, "grad_norm": 0.27644211053848267, "learning_rate": 2.9734912492470968e-05, "loss": 0.2034, "step": 7395 }, { "epoch": 1.496862983201781, "grad_norm": 0.25040486454963684, "learning_rate": 2.971228415038606e-05, "loss": 0.1694, "step": 7396 }, { "epoch": 1.4970653713823112, "grad_norm": 0.30418914556503296, "learning_rate": 
2.9689662919274718e-05, "loss": 0.2076, "step": 7397 }, { "epoch": 1.4972677595628414, "grad_norm": 0.3006996512413025, "learning_rate": 2.9667048801425536e-05, "loss": 0.1918, "step": 7398 }, { "epoch": 1.4974701477433718, "grad_norm": 0.276265412569046, "learning_rate": 2.9644441799126345e-05, "loss": 0.1718, "step": 7399 }, { "epoch": 1.497672535923902, "grad_norm": 0.2626464068889618, "learning_rate": 2.9621841914664307e-05, "loss": 0.2176, "step": 7400 }, { "epoch": 1.497672535923902, "eval_loss": 0.2646274268627167, "eval_runtime": 0.74, "eval_samples_per_second": 6.756, "eval_steps_per_second": 1.351, "step": 7400 }, { "epoch": 1.4978749241044322, "grad_norm": 0.24904023110866547, "learning_rate": 2.9599249150325838e-05, "loss": 0.1877, "step": 7401 }, { "epoch": 1.4980773122849627, "grad_norm": 0.2674250304698944, "learning_rate": 2.957666350839663e-05, "loss": 0.1725, "step": 7402 }, { "epoch": 1.4982797004654929, "grad_norm": 0.2647687792778015, "learning_rate": 2.9554084991161666e-05, "loss": 0.1758, "step": 7403 }, { "epoch": 1.498482088646023, "grad_norm": 0.3034684658050537, "learning_rate": 2.9531513600905236e-05, "loss": 0.211, "step": 7404 }, { "epoch": 1.4986844768265533, "grad_norm": 0.2592996060848236, "learning_rate": 2.9508949339910807e-05, "loss": 0.1882, "step": 7405 }, { "epoch": 1.4988868650070835, "grad_norm": 0.34100887179374695, "learning_rate": 2.9486392210461224e-05, "loss": 0.2016, "step": 7406 }, { "epoch": 1.4990892531876139, "grad_norm": 0.29518815875053406, "learning_rate": 2.94638422148386e-05, "loss": 0.1946, "step": 7407 }, { "epoch": 1.499291641368144, "grad_norm": 0.26552388072013855, "learning_rate": 2.944129935532428e-05, "loss": 0.1635, "step": 7408 }, { "epoch": 1.4994940295486743, "grad_norm": 0.27012595534324646, "learning_rate": 2.941876363419893e-05, "loss": 0.1588, "step": 7409 }, { "epoch": 1.4996964177292047, "grad_norm": 0.29524192214012146, "learning_rate": 2.9396235053742483e-05, "loss": 0.2042, "step": 7410 }, 
{ "epoch": 1.499898805909735, "grad_norm": 0.2819133400917053, "learning_rate": 2.9373713616234133e-05, "loss": 0.1783, "step": 7411 }, { "epoch": 1.500101194090265, "grad_norm": 0.3101344704627991, "learning_rate": 2.935119932395236e-05, "loss": 0.205, "step": 7412 }, { "epoch": 1.5003035822707953, "grad_norm": 0.26377272605895996, "learning_rate": 2.9328692179174933e-05, "loss": 0.2038, "step": 7413 }, { "epoch": 1.5005059704513255, "grad_norm": 0.24763913452625275, "learning_rate": 2.9306192184178884e-05, "loss": 0.1719, "step": 7414 }, { "epoch": 1.500708358631856, "grad_norm": 0.2799322009086609, "learning_rate": 2.9283699341240534e-05, "loss": 0.1862, "step": 7415 }, { "epoch": 1.5009107468123861, "grad_norm": 0.3978762626647949, "learning_rate": 2.9261213652635466e-05, "loss": 0.2023, "step": 7416 }, { "epoch": 1.5011131349929165, "grad_norm": 0.2425425797700882, "learning_rate": 2.923873512063854e-05, "loss": 0.1713, "step": 7417 }, { "epoch": 1.5013155231734467, "grad_norm": 0.30181166529655457, "learning_rate": 2.921626374752391e-05, "loss": 0.2113, "step": 7418 }, { "epoch": 1.501517911353977, "grad_norm": 0.33585092425346375, "learning_rate": 2.9193799535564993e-05, "loss": 0.2352, "step": 7419 }, { "epoch": 1.5017202995345071, "grad_norm": 0.28944873809814453, "learning_rate": 2.917134248703447e-05, "loss": 0.1691, "step": 7420 }, { "epoch": 1.5019226877150373, "grad_norm": 0.2766030430793762, "learning_rate": 2.9148892604204325e-05, "loss": 0.1836, "step": 7421 }, { "epoch": 1.5021250758955675, "grad_norm": 0.24718813598155975, "learning_rate": 2.9126449889345787e-05, "loss": 0.1926, "step": 7422 }, { "epoch": 1.502327464076098, "grad_norm": 0.26735419034957886, "learning_rate": 2.910401434472938e-05, "loss": 0.1768, "step": 7423 }, { "epoch": 1.5025298522566282, "grad_norm": 0.2996431589126587, "learning_rate": 2.9081585972624913e-05, "loss": 0.1888, "step": 7424 }, { "epoch": 1.5027322404371586, "grad_norm": 0.27923211455345154, "learning_rate": 
2.905916477530143e-05, "loss": 0.1834, "step": 7425 }, { "epoch": 1.5029346286176888, "grad_norm": 0.2527241110801697, "learning_rate": 2.903675075502731e-05, "loss": 0.1671, "step": 7426 }, { "epoch": 1.503137016798219, "grad_norm": 0.2642647325992584, "learning_rate": 2.9014343914070108e-05, "loss": 0.1937, "step": 7427 }, { "epoch": 1.5033394049787492, "grad_norm": 0.27124494314193726, "learning_rate": 2.8991944254696746e-05, "loss": 0.195, "step": 7428 }, { "epoch": 1.5035417931592794, "grad_norm": 0.2615841031074524, "learning_rate": 2.8969551779173388e-05, "loss": 0.1582, "step": 7429 }, { "epoch": 1.5037441813398098, "grad_norm": 0.2523174285888672, "learning_rate": 2.8947166489765465e-05, "loss": 0.1935, "step": 7430 }, { "epoch": 1.50394656952034, "grad_norm": 0.2733636498451233, "learning_rate": 2.892478838873768e-05, "loss": 0.1796, "step": 7431 }, { "epoch": 1.5041489577008704, "grad_norm": 0.26868680119514465, "learning_rate": 2.8902417478354037e-05, "loss": 0.192, "step": 7432 }, { "epoch": 1.5043513458814006, "grad_norm": 0.28141334652900696, "learning_rate": 2.8880053760877767e-05, "loss": 0.203, "step": 7433 }, { "epoch": 1.5045537340619308, "grad_norm": 0.25075894594192505, "learning_rate": 2.8857697238571402e-05, "loss": 0.2052, "step": 7434 }, { "epoch": 1.504756122242461, "grad_norm": 0.282585084438324, "learning_rate": 2.883534791369674e-05, "loss": 0.1852, "step": 7435 }, { "epoch": 1.5049585104229912, "grad_norm": 0.3138660490512848, "learning_rate": 2.881300578851487e-05, "loss": 0.2139, "step": 7436 }, { "epoch": 1.5051608986035214, "grad_norm": 0.257865309715271, "learning_rate": 2.8790670865286107e-05, "loss": 0.1865, "step": 7437 }, { "epoch": 1.5053632867840518, "grad_norm": 0.2725944519042969, "learning_rate": 2.8768343146270072e-05, "loss": 0.187, "step": 7438 }, { "epoch": 1.505565674964582, "grad_norm": 0.37270545959472656, "learning_rate": 2.8746022633725656e-05, "loss": 0.2131, "step": 7439 }, { "epoch": 1.5057680631451125, 
"grad_norm": 0.2656130790710449, "learning_rate": 2.8723709329911007e-05, "loss": 0.1774, "step": 7440 }, { "epoch": 1.5059704513256427, "grad_norm": 0.25046414136886597, "learning_rate": 2.8701403237083557e-05, "loss": 0.1662, "step": 7441 }, { "epoch": 1.5061728395061729, "grad_norm": 0.27029305696487427, "learning_rate": 2.86791043575e-05, "loss": 0.1936, "step": 7442 }, { "epoch": 1.506375227686703, "grad_norm": 0.2958330512046814, "learning_rate": 2.86568126934163e-05, "loss": 0.1896, "step": 7443 }, { "epoch": 1.5065776158672333, "grad_norm": 0.2662060856819153, "learning_rate": 2.8634528247087668e-05, "loss": 0.17, "step": 7444 }, { "epoch": 1.5067800040477635, "grad_norm": 0.3227637708187103, "learning_rate": 2.8612251020768665e-05, "loss": 0.2281, "step": 7445 }, { "epoch": 1.5069823922282939, "grad_norm": 0.28649061918258667, "learning_rate": 2.858998101671305e-05, "loss": 0.1995, "step": 7446 }, { "epoch": 1.507184780408824, "grad_norm": 0.2932646870613098, "learning_rate": 2.8567718237173857e-05, "loss": 0.1908, "step": 7447 }, { "epoch": 1.5073871685893545, "grad_norm": 0.28228962421417236, "learning_rate": 2.854546268440339e-05, "loss": 0.1763, "step": 7448 }, { "epoch": 1.5075895567698847, "grad_norm": 0.2924324870109558, "learning_rate": 2.8523214360653293e-05, "loss": 0.2125, "step": 7449 }, { "epoch": 1.507791944950415, "grad_norm": 0.31800493597984314, "learning_rate": 2.8500973268174324e-05, "loss": 0.1897, "step": 7450 }, { "epoch": 1.507791944950415, "eval_loss": 0.2675067186355591, "eval_runtime": 0.7394, "eval_samples_per_second": 6.762, "eval_steps_per_second": 1.352, "step": 7450 }, { "epoch": 1.507994333130945, "grad_norm": 0.3096625804901123, "learning_rate": 2.847873940921666e-05, "loss": 0.188, "step": 7451 }, { "epoch": 1.5081967213114753, "grad_norm": 0.5369827151298523, "learning_rate": 2.8456512786029676e-05, "loss": 0.2071, "step": 7452 }, { "epoch": 1.5083991094920055, "grad_norm": 0.2631036639213562, "learning_rate": 
2.8434293400862022e-05, "loss": 0.1894, "step": 7453 }, { "epoch": 1.508601497672536, "grad_norm": 0.2811294198036194, "learning_rate": 2.8412081255961644e-05, "loss": 0.1778, "step": 7454 }, { "epoch": 1.5088038858530661, "grad_norm": 0.26449280977249146, "learning_rate": 2.8389876353575705e-05, "loss": 0.1761, "step": 7455 }, { "epoch": 1.5090062740335966, "grad_norm": 0.28584322333335876, "learning_rate": 2.8367678695950695e-05, "loss": 0.182, "step": 7456 }, { "epoch": 1.5092086622141268, "grad_norm": 0.24409325420856476, "learning_rate": 2.8345488285332324e-05, "loss": 0.17, "step": 7457 }, { "epoch": 1.509411050394657, "grad_norm": 0.2807093560695648, "learning_rate": 2.8323305123965583e-05, "loss": 0.1972, "step": 7458 }, { "epoch": 1.5096134385751871, "grad_norm": 0.2984221577644348, "learning_rate": 2.8301129214094735e-05, "loss": 0.2116, "step": 7459 }, { "epoch": 1.5098158267557173, "grad_norm": 0.27237123250961304, "learning_rate": 2.8278960557963298e-05, "loss": 0.1982, "step": 7460 }, { "epoch": 1.5100182149362478, "grad_norm": 0.2626419961452484, "learning_rate": 2.8256799157814074e-05, "loss": 0.168, "step": 7461 }, { "epoch": 1.510220603116778, "grad_norm": 0.26899945735931396, "learning_rate": 2.8234645015889127e-05, "loss": 0.2049, "step": 7462 }, { "epoch": 1.5104229912973084, "grad_norm": 0.28485849499702454, "learning_rate": 2.8212498134429766e-05, "loss": 0.2242, "step": 7463 }, { "epoch": 1.5106253794778386, "grad_norm": 0.27648624777793884, "learning_rate": 2.8190358515676584e-05, "loss": 0.1857, "step": 7464 }, { "epoch": 1.5108277676583688, "grad_norm": 0.20960475504398346, "learning_rate": 2.816822616186945e-05, "loss": 0.1252, "step": 7465 }, { "epoch": 1.511030155838899, "grad_norm": 0.2709536552429199, "learning_rate": 2.8146101075247457e-05, "loss": 0.1989, "step": 7466 }, { "epoch": 1.5112325440194292, "grad_norm": 0.2901017963886261, "learning_rate": 2.8123983258049e-05, "loss": 0.1991, "step": 7467 }, { "epoch": 
1.5114349321999594, "grad_norm": 0.28593146800994873, "learning_rate": 2.8101872712511745e-05, "loss": 0.1977, "step": 7468 }, { "epoch": 1.5116373203804898, "grad_norm": 0.321687787771225, "learning_rate": 2.8079769440872582e-05, "loss": 0.1963, "step": 7469 }, { "epoch": 1.51183970856102, "grad_norm": 0.2690439522266388, "learning_rate": 2.8057673445367694e-05, "loss": 0.1718, "step": 7470 }, { "epoch": 1.5120420967415504, "grad_norm": 0.29314038157463074, "learning_rate": 2.8035584728232557e-05, "loss": 0.176, "step": 7471 }, { "epoch": 1.5122444849220806, "grad_norm": 0.3003440499305725, "learning_rate": 2.8013503291701813e-05, "loss": 0.2047, "step": 7472 }, { "epoch": 1.5124468731026108, "grad_norm": 0.266367107629776, "learning_rate": 2.799142913800946e-05, "loss": 0.1916, "step": 7473 }, { "epoch": 1.512649261283141, "grad_norm": 0.2905641198158264, "learning_rate": 2.7969362269388732e-05, "loss": 0.1944, "step": 7474 }, { "epoch": 1.5128516494636712, "grad_norm": 0.39335983991622925, "learning_rate": 2.794730268807212e-05, "loss": 0.2294, "step": 7475 }, { "epoch": 1.5130540376442014, "grad_norm": 0.24299614131450653, "learning_rate": 2.792525039629138e-05, "loss": 0.1626, "step": 7476 }, { "epoch": 1.5132564258247319, "grad_norm": 0.35958167910575867, "learning_rate": 2.7903205396277542e-05, "loss": 0.1963, "step": 7477 }, { "epoch": 1.513458814005262, "grad_norm": 0.30342257022857666, "learning_rate": 2.7881167690260867e-05, "loss": 0.2193, "step": 7478 }, { "epoch": 1.5136612021857925, "grad_norm": 0.27842989563941956, "learning_rate": 2.7859137280470915e-05, "loss": 0.1961, "step": 7479 }, { "epoch": 1.5138635903663227, "grad_norm": 0.30261504650115967, "learning_rate": 2.783711416913649e-05, "loss": 0.208, "step": 7480 }, { "epoch": 1.5140659785468529, "grad_norm": 0.2848835289478302, "learning_rate": 2.7815098358485646e-05, "loss": 0.2145, "step": 7481 }, { "epoch": 1.514268366727383, "grad_norm": 0.29721975326538086, "learning_rate": 
2.7793089850745736e-05, "loss": 0.1871, "step": 7482 }, { "epoch": 1.5144707549079133, "grad_norm": 0.30782490968704224, "learning_rate": 2.777108864814333e-05, "loss": 0.2136, "step": 7483 }, { "epoch": 1.5146731430884435, "grad_norm": 0.23784612119197845, "learning_rate": 2.7749094752904292e-05, "loss": 0.1535, "step": 7484 }, { "epoch": 1.514875531268974, "grad_norm": 0.27230408787727356, "learning_rate": 2.7727108167253712e-05, "loss": 0.216, "step": 7485 }, { "epoch": 1.515077919449504, "grad_norm": 0.28563883900642395, "learning_rate": 2.7705128893415987e-05, "loss": 0.1972, "step": 7486 }, { "epoch": 1.5152803076300345, "grad_norm": 0.300144761800766, "learning_rate": 2.768315693361474e-05, "loss": 0.2125, "step": 7487 }, { "epoch": 1.5154826958105647, "grad_norm": 0.2823977768421173, "learning_rate": 2.7661192290072857e-05, "loss": 0.1948, "step": 7488 }, { "epoch": 1.515685083991095, "grad_norm": 0.3386089503765106, "learning_rate": 2.7639234965012505e-05, "loss": 0.2125, "step": 7489 }, { "epoch": 1.5158874721716251, "grad_norm": 0.3780275881290436, "learning_rate": 2.7617284960655075e-05, "loss": 0.2796, "step": 7490 }, { "epoch": 1.5160898603521553, "grad_norm": 0.23867158591747284, "learning_rate": 2.7595342279221258e-05, "loss": 0.1654, "step": 7491 }, { "epoch": 1.5162922485326857, "grad_norm": 0.263566255569458, "learning_rate": 2.7573406922930978e-05, "loss": 0.1919, "step": 7492 }, { "epoch": 1.516494636713216, "grad_norm": 0.28585851192474365, "learning_rate": 2.7551478894003413e-05, "loss": 0.1762, "step": 7493 }, { "epoch": 1.5166970248937464, "grad_norm": 0.2942905128002167, "learning_rate": 2.7529558194657024e-05, "loss": 0.2138, "step": 7494 }, { "epoch": 1.5168994130742766, "grad_norm": 0.285055011510849, "learning_rate": 2.7507644827109514e-05, "loss": 0.1866, "step": 7495 }, { "epoch": 1.5171018012548068, "grad_norm": 0.3072595000267029, "learning_rate": 2.748573879357784e-05, "loss": 0.2122, "step": 7496 }, { "epoch": 1.517304189435337, 
"grad_norm": 0.24192121624946594, "learning_rate": 2.7463840096278236e-05, "loss": 0.1836, "step": 7497 }, { "epoch": 1.5175065776158672, "grad_norm": 0.2530333697795868, "learning_rate": 2.7441948737426183e-05, "loss": 0.1618, "step": 7498 }, { "epoch": 1.5177089657963974, "grad_norm": 0.3010926842689514, "learning_rate": 2.7420064719236404e-05, "loss": 0.1759, "step": 7499 }, { "epoch": 1.5179113539769278, "grad_norm": 0.26380324363708496, "learning_rate": 2.7398188043922912e-05, "loss": 0.1671, "step": 7500 }, { "epoch": 1.5179113539769278, "eval_loss": 0.26698416471481323, "eval_runtime": 0.7399, "eval_samples_per_second": 6.757, "eval_steps_per_second": 1.351, "step": 7500 }, { "epoch": 1.518113742157458, "grad_norm": 0.31068721413612366, "learning_rate": 2.7376318713698957e-05, "loss": 0.208, "step": 7501 }, { "epoch": 1.5183161303379884, "grad_norm": 0.24927808344364166, "learning_rate": 2.7354456730777035e-05, "loss": 0.1741, "step": 7502 }, { "epoch": 1.5185185185185186, "grad_norm": 0.2789771258831024, "learning_rate": 2.733260209736891e-05, "loss": 0.1886, "step": 7503 }, { "epoch": 1.5187209066990488, "grad_norm": 0.28663644194602966, "learning_rate": 2.7310754815685624e-05, "loss": 0.159, "step": 7504 }, { "epoch": 1.518923294879579, "grad_norm": 0.254955917596817, "learning_rate": 2.7288914887937456e-05, "loss": 0.1687, "step": 7505 }, { "epoch": 1.5191256830601092, "grad_norm": 0.28779321908950806, "learning_rate": 2.7267082316333913e-05, "loss": 0.1851, "step": 7506 }, { "epoch": 1.5193280712406394, "grad_norm": 0.2844981551170349, "learning_rate": 2.724525710308381e-05, "loss": 0.2195, "step": 7507 }, { "epoch": 1.5195304594211698, "grad_norm": 0.3059309124946594, "learning_rate": 2.7223439250395188e-05, "loss": 0.198, "step": 7508 }, { "epoch": 1.5197328476017, "grad_norm": 0.2609426975250244, "learning_rate": 2.7201628760475352e-05, "loss": 0.1752, "step": 7509 }, { "epoch": 1.5199352357822304, "grad_norm": 0.2921501398086548, "learning_rate": 
2.717982563553084e-05, "loss": 0.1863, "step": 7510 }, { "epoch": 1.5201376239627606, "grad_norm": 0.3341974914073944, "learning_rate": 2.715802987776749e-05, "loss": 0.1851, "step": 7511 }, { "epoch": 1.5203400121432908, "grad_norm": 0.27599576115608215, "learning_rate": 2.7136241489390356e-05, "loss": 0.187, "step": 7512 }, { "epoch": 1.520542400323821, "grad_norm": 0.23091015219688416, "learning_rate": 2.7114460472603754e-05, "loss": 0.1639, "step": 7513 }, { "epoch": 1.5207447885043512, "grad_norm": 0.2507723569869995, "learning_rate": 2.709268682961126e-05, "loss": 0.1831, "step": 7514 }, { "epoch": 1.5209471766848814, "grad_norm": 0.29208138585090637, "learning_rate": 2.7070920562615733e-05, "loss": 0.2016, "step": 7515 }, { "epoch": 1.5211495648654119, "grad_norm": 0.24572382867336273, "learning_rate": 2.704916167381919e-05, "loss": 0.1622, "step": 7516 }, { "epoch": 1.521351953045942, "grad_norm": 0.2684371769428253, "learning_rate": 2.7027410165423016e-05, "loss": 0.1711, "step": 7517 }, { "epoch": 1.5215543412264725, "grad_norm": 0.2745975852012634, "learning_rate": 2.7005666039627788e-05, "loss": 0.1599, "step": 7518 }, { "epoch": 1.5217567294070027, "grad_norm": 0.2740444839000702, "learning_rate": 2.6983929298633348e-05, "loss": 0.189, "step": 7519 }, { "epoch": 1.5219591175875329, "grad_norm": 0.32473820447921753, "learning_rate": 2.6962199944638788e-05, "loss": 0.2143, "step": 7520 }, { "epoch": 1.522161505768063, "grad_norm": 0.29582443833351135, "learning_rate": 2.694047797984247e-05, "loss": 0.2101, "step": 7521 }, { "epoch": 1.5223638939485933, "grad_norm": 0.31342431902885437, "learning_rate": 2.6918763406441973e-05, "loss": 0.1809, "step": 7522 }, { "epoch": 1.5225662821291237, "grad_norm": 0.2780219614505768, "learning_rate": 2.6897056226634175e-05, "loss": 0.1788, "step": 7523 }, { "epoch": 1.522768670309654, "grad_norm": 0.27648964524269104, "learning_rate": 2.6875356442615162e-05, "loss": 0.1759, "step": 7524 }, { "epoch": 
1.5229710584901843, "grad_norm": 0.3273850083351135, "learning_rate": 2.6853664056580285e-05, "loss": 0.2186, "step": 7525 }, { "epoch": 1.5231734466707145, "grad_norm": 0.2797602117061615, "learning_rate": 2.6831979070724177e-05, "loss": 0.1941, "step": 7526 }, { "epoch": 1.5233758348512447, "grad_norm": 0.3109395503997803, "learning_rate": 2.6810301487240686e-05, "loss": 0.1836, "step": 7527 }, { "epoch": 1.523578223031775, "grad_norm": 0.25964033603668213, "learning_rate": 2.6788631308322908e-05, "loss": 0.1594, "step": 7528 }, { "epoch": 1.5237806112123051, "grad_norm": 0.38888150453567505, "learning_rate": 2.6766968536163218e-05, "loss": 0.2233, "step": 7529 }, { "epoch": 1.5239829993928353, "grad_norm": 0.3075036406517029, "learning_rate": 2.6745313172953233e-05, "loss": 0.2083, "step": 7530 }, { "epoch": 1.5241853875733657, "grad_norm": 0.2736060619354248, "learning_rate": 2.6723665220883798e-05, "loss": 0.1835, "step": 7531 }, { "epoch": 1.524387775753896, "grad_norm": 0.2539604902267456, "learning_rate": 2.6702024682145043e-05, "loss": 0.1743, "step": 7532 }, { "epoch": 1.5245901639344264, "grad_norm": 0.2822546660900116, "learning_rate": 2.6680391558926333e-05, "loss": 0.1909, "step": 7533 }, { "epoch": 1.5247925521149566, "grad_norm": 0.34847140312194824, "learning_rate": 2.6658765853416256e-05, "loss": 0.2242, "step": 7534 }, { "epoch": 1.5249949402954868, "grad_norm": 0.3049599528312683, "learning_rate": 2.663714756780269e-05, "loss": 0.196, "step": 7535 }, { "epoch": 1.525197328476017, "grad_norm": 0.2608265280723572, "learning_rate": 2.661553670427276e-05, "loss": 0.1903, "step": 7536 }, { "epoch": 1.5253997166565472, "grad_norm": 0.31801798939704895, "learning_rate": 2.6593933265012794e-05, "loss": 0.2038, "step": 7537 }, { "epoch": 1.5256021048370774, "grad_norm": 0.31101664900779724, "learning_rate": 2.6572337252208455e-05, "loss": 0.1776, "step": 7538 }, { "epoch": 1.5258044930176078, "grad_norm": 0.25785762071609497, "learning_rate": 
2.6550748668044512e-05, "loss": 0.2037, "step": 7539 }, { "epoch": 1.526006881198138, "grad_norm": 0.3033619523048401, "learning_rate": 2.6529167514705144e-05, "loss": 0.1924, "step": 7540 }, { "epoch": 1.5262092693786684, "grad_norm": 0.2987357974052429, "learning_rate": 2.6507593794373696e-05, "loss": 0.1882, "step": 7541 }, { "epoch": 1.5264116575591986, "grad_norm": 0.347606897354126, "learning_rate": 2.648602750923276e-05, "loss": 0.2135, "step": 7542 }, { "epoch": 1.5266140457397288, "grad_norm": 0.28092533349990845, "learning_rate": 2.6464468661464183e-05, "loss": 0.2041, "step": 7543 }, { "epoch": 1.526816433920259, "grad_norm": 0.2929527163505554, "learning_rate": 2.6442917253249065e-05, "loss": 0.1787, "step": 7544 }, { "epoch": 1.5270188221007892, "grad_norm": 0.30891942977905273, "learning_rate": 2.6421373286767758e-05, "loss": 0.1835, "step": 7545 }, { "epoch": 1.5272212102813194, "grad_norm": 0.331102579832077, "learning_rate": 2.6399836764199846e-05, "loss": 0.2143, "step": 7546 }, { "epoch": 1.5274235984618498, "grad_norm": 0.34742066264152527, "learning_rate": 2.637830768772418e-05, "loss": 0.2306, "step": 7547 }, { "epoch": 1.5276259866423803, "grad_norm": 0.2488187551498413, "learning_rate": 2.6356786059518833e-05, "loss": 0.1625, "step": 7548 }, { "epoch": 1.5278283748229105, "grad_norm": 0.294069766998291, "learning_rate": 2.6335271881761148e-05, "loss": 0.1875, "step": 7549 }, { "epoch": 1.5280307630034407, "grad_norm": 0.3056446611881256, "learning_rate": 2.631376515662769e-05, "loss": 0.2226, "step": 7550 }, { "epoch": 1.5280307630034407, "eval_loss": 0.2665191888809204, "eval_runtime": 0.7394, "eval_samples_per_second": 6.762, "eval_steps_per_second": 1.352, "step": 7550 }, { "epoch": 1.5282331511839709, "grad_norm": 0.3098391890525818, "learning_rate": 2.62922658862943e-05, "loss": 0.2245, "step": 7551 }, { "epoch": 1.528435539364501, "grad_norm": 0.2537216544151306, "learning_rate": 2.6270774072936033e-05, "loss": 0.1839, "step": 7552 }, 
{ "epoch": 1.5286379275450312, "grad_norm": 0.25821807980537415, "learning_rate": 2.624928971872722e-05, "loss": 0.1435, "step": 7553 }, { "epoch": 1.5288403157255617, "grad_norm": 0.2951766550540924, "learning_rate": 2.6227812825841412e-05, "loss": 0.1989, "step": 7554 }, { "epoch": 1.5290427039060919, "grad_norm": 0.2677651047706604, "learning_rate": 2.6206343396451427e-05, "loss": 0.1952, "step": 7555 }, { "epoch": 1.5292450920866223, "grad_norm": 0.25762563943862915, "learning_rate": 2.6184881432729304e-05, "loss": 0.2038, "step": 7556 }, { "epoch": 1.5294474802671525, "grad_norm": 0.2664134204387665, "learning_rate": 2.6163426936846346e-05, "loss": 0.2034, "step": 7557 }, { "epoch": 1.5296498684476827, "grad_norm": 0.26822999119758606, "learning_rate": 2.614197991097309e-05, "loss": 0.1711, "step": 7558 }, { "epoch": 1.529852256628213, "grad_norm": 0.25438639521598816, "learning_rate": 2.612054035727932e-05, "loss": 0.2009, "step": 7559 }, { "epoch": 1.530054644808743, "grad_norm": 0.2914297878742218, "learning_rate": 2.6099108277934103e-05, "loss": 0.1888, "step": 7560 }, { "epoch": 1.5302570329892733, "grad_norm": 0.2948731780052185, "learning_rate": 2.6077683675105645e-05, "loss": 0.2113, "step": 7561 }, { "epoch": 1.5304594211698037, "grad_norm": 0.28212210536003113, "learning_rate": 2.6056266550961495e-05, "loss": 0.1551, "step": 7562 }, { "epoch": 1.530661809350334, "grad_norm": 0.2819977104663849, "learning_rate": 2.6034856907668414e-05, "loss": 0.1761, "step": 7563 }, { "epoch": 1.5308641975308643, "grad_norm": 0.24788232147693634, "learning_rate": 2.6013454747392408e-05, "loss": 0.1629, "step": 7564 }, { "epoch": 1.5310665857113945, "grad_norm": 0.22148968279361725, "learning_rate": 2.599206007229872e-05, "loss": 0.1467, "step": 7565 }, { "epoch": 1.5312689738919247, "grad_norm": 0.2706809937953949, "learning_rate": 2.5970672884551826e-05, "loss": 0.1757, "step": 7566 }, { "epoch": 1.531471362072455, "grad_norm": 0.2636460065841675, "learning_rate": 
2.594929318631547e-05, "loss": 0.1862, "step": 7567 }, { "epoch": 1.5316737502529851, "grad_norm": 0.2994755804538727, "learning_rate": 2.592792097975263e-05, "loss": 0.1901, "step": 7568 }, { "epoch": 1.5318761384335153, "grad_norm": 0.28958430886268616, "learning_rate": 2.5906556267025517e-05, "loss": 0.1998, "step": 7569 }, { "epoch": 1.5320785266140458, "grad_norm": 0.24085618555545807, "learning_rate": 2.5885199050295585e-05, "loss": 0.1703, "step": 7570 }, { "epoch": 1.532280914794576, "grad_norm": 0.2558616101741791, "learning_rate": 2.5863849331723532e-05, "loss": 0.1694, "step": 7571 }, { "epoch": 1.5324833029751064, "grad_norm": 0.32883763313293457, "learning_rate": 2.5842507113469304e-05, "loss": 0.2227, "step": 7572 }, { "epoch": 1.5326856911556366, "grad_norm": 0.26645487546920776, "learning_rate": 2.5821172397692085e-05, "loss": 0.1865, "step": 7573 }, { "epoch": 1.5328880793361668, "grad_norm": 0.3352511525154114, "learning_rate": 2.5799845186550285e-05, "loss": 0.1929, "step": 7574 }, { "epoch": 1.533090467516697, "grad_norm": 0.2677745223045349, "learning_rate": 2.5778525482201575e-05, "loss": 0.1783, "step": 7575 }, { "epoch": 1.5332928556972272, "grad_norm": 0.2944205105304718, "learning_rate": 2.5757213286802873e-05, "loss": 0.1758, "step": 7576 }, { "epoch": 1.5334952438777574, "grad_norm": 0.2755123972892761, "learning_rate": 2.5735908602510294e-05, "loss": 0.1712, "step": 7577 }, { "epoch": 1.5336976320582878, "grad_norm": 0.33217665553092957, "learning_rate": 2.571461143147925e-05, "loss": 0.1853, "step": 7578 }, { "epoch": 1.5339000202388182, "grad_norm": 0.3142439126968384, "learning_rate": 2.5693321775864356e-05, "loss": 0.2122, "step": 7579 }, { "epoch": 1.5341024084193484, "grad_norm": 0.2924261689186096, "learning_rate": 2.5672039637819456e-05, "loss": 0.1793, "step": 7580 }, { "epoch": 1.5343047965998786, "grad_norm": 0.2553829550743103, "learning_rate": 2.565076501949769e-05, "loss": 0.1344, "step": 7581 }, { "epoch": 
1.5345071847804088, "grad_norm": 0.2889065444469452, "learning_rate": 2.5629497923051404e-05, "loss": 0.1833, "step": 7582 }, { "epoch": 1.534709572960939, "grad_norm": 0.2888411581516266, "learning_rate": 2.5608238350632118e-05, "loss": 0.1948, "step": 7583 }, { "epoch": 1.5349119611414692, "grad_norm": 0.3007155954837799, "learning_rate": 2.5586986304390704e-05, "loss": 0.218, "step": 7584 }, { "epoch": 1.5351143493219996, "grad_norm": 0.3101598620414734, "learning_rate": 2.5565741786477204e-05, "loss": 0.1716, "step": 7585 }, { "epoch": 1.5353167375025298, "grad_norm": 0.29315635561943054, "learning_rate": 2.5544504799040925e-05, "loss": 0.1707, "step": 7586 }, { "epoch": 1.5355191256830603, "grad_norm": 0.3432648777961731, "learning_rate": 2.552327534423039e-05, "loss": 0.2077, "step": 7587 }, { "epoch": 1.5357215138635905, "grad_norm": 0.28879377245903015, "learning_rate": 2.5502053424193384e-05, "loss": 0.1842, "step": 7588 }, { "epoch": 1.5359239020441207, "grad_norm": 0.2736664414405823, "learning_rate": 2.548083904107692e-05, "loss": 0.201, "step": 7589 }, { "epoch": 1.5361262902246509, "grad_norm": 0.24728530645370483, "learning_rate": 2.545963219702724e-05, "loss": 0.1777, "step": 7590 }, { "epoch": 1.536328678405181, "grad_norm": 0.3157190978527069, "learning_rate": 2.5438432894189824e-05, "loss": 0.2011, "step": 7591 }, { "epoch": 1.5365310665857113, "grad_norm": 0.28984534740448, "learning_rate": 2.5417241134709403e-05, "loss": 0.2377, "step": 7592 }, { "epoch": 1.5367334547662417, "grad_norm": 0.2625545263290405, "learning_rate": 2.539605692072994e-05, "loss": 0.2019, "step": 7593 }, { "epoch": 1.5369358429467719, "grad_norm": 0.2867405116558075, "learning_rate": 2.5374880254394628e-05, "loss": 0.1882, "step": 7594 }, { "epoch": 1.5371382311273023, "grad_norm": 0.3031767010688782, "learning_rate": 2.5353711137845892e-05, "loss": 0.1957, "step": 7595 }, { "epoch": 1.5373406193078325, "grad_norm": 0.254905641078949, "learning_rate": 
2.5332549573225416e-05, "loss": 0.182, "step": 7596 }, { "epoch": 1.5375430074883627, "grad_norm": 0.5628305077552795, "learning_rate": 2.5311395562674066e-05, "loss": 0.1782, "step": 7597 }, { "epoch": 1.537745395668893, "grad_norm": 0.35798123478889465, "learning_rate": 2.5290249108332042e-05, "loss": 0.2021, "step": 7598 }, { "epoch": 1.537947783849423, "grad_norm": 0.3951321840286255, "learning_rate": 2.5269110212338697e-05, "loss": 0.2244, "step": 7599 }, { "epoch": 1.5381501720299533, "grad_norm": 0.3249993622303009, "learning_rate": 2.5247978876832633e-05, "loss": 0.1882, "step": 7600 }, { "epoch": 1.5381501720299533, "eval_loss": 0.2659483253955841, "eval_runtime": 0.7356, "eval_samples_per_second": 6.797, "eval_steps_per_second": 1.359, "step": 7600 }, { "epoch": 1.5383525602104837, "grad_norm": 0.300327330827713, "learning_rate": 2.5226855103951706e-05, "loss": 0.1931, "step": 7601 }, { "epoch": 1.538554948391014, "grad_norm": 0.2503814995288849, "learning_rate": 2.5205738895832998e-05, "loss": 0.189, "step": 7602 }, { "epoch": 1.5387573365715443, "grad_norm": 0.27575522661209106, "learning_rate": 2.5184630254612817e-05, "loss": 0.2085, "step": 7603 }, { "epoch": 1.5389597247520745, "grad_norm": 0.2803657054901123, "learning_rate": 2.516352918242675e-05, "loss": 0.1885, "step": 7604 }, { "epoch": 1.5391621129326047, "grad_norm": 0.2981926500797272, "learning_rate": 2.5142435681409516e-05, "loss": 0.2364, "step": 7605 }, { "epoch": 1.539364501113135, "grad_norm": 0.2323596477508545, "learning_rate": 2.5121349753695168e-05, "loss": 0.1475, "step": 7606 }, { "epoch": 1.5395668892936651, "grad_norm": 0.27811673283576965, "learning_rate": 2.5100271401416962e-05, "loss": 0.1532, "step": 7607 }, { "epoch": 1.5397692774741953, "grad_norm": 0.3188576102256775, "learning_rate": 2.5079200626707377e-05, "loss": 0.2101, "step": 7608 }, { "epoch": 1.5399716656547258, "grad_norm": 0.2822002172470093, "learning_rate": 2.505813743169815e-05, "loss": 0.2018, "step": 7609 
}, { "epoch": 1.5401740538352562, "grad_norm": 0.26504015922546387, "learning_rate": 2.50370818185202e-05, "loss": 0.1743, "step": 7610 }, { "epoch": 1.5403764420157864, "grad_norm": 0.27246421575546265, "learning_rate": 2.501603378930375e-05, "loss": 0.2185, "step": 7611 }, { "epoch": 1.5405788301963166, "grad_norm": 0.2715609073638916, "learning_rate": 2.499499334617821e-05, "loss": 0.2015, "step": 7612 }, { "epoch": 1.5407812183768468, "grad_norm": 0.2291431874036789, "learning_rate": 2.4973960491272207e-05, "loss": 0.178, "step": 7613 }, { "epoch": 1.540983606557377, "grad_norm": 0.34131288528442383, "learning_rate": 2.495293522671366e-05, "loss": 0.2127, "step": 7614 }, { "epoch": 1.5411859947379072, "grad_norm": 0.24734556674957275, "learning_rate": 2.4931917554629656e-05, "loss": 0.1733, "step": 7615 }, { "epoch": 1.5413883829184376, "grad_norm": 0.3385285437107086, "learning_rate": 2.491090747714655e-05, "loss": 0.2291, "step": 7616 }, { "epoch": 1.5415907710989678, "grad_norm": 0.27316218614578247, "learning_rate": 2.4889904996389936e-05, "loss": 0.1838, "step": 7617 }, { "epoch": 1.5417931592794982, "grad_norm": 0.27246224880218506, "learning_rate": 2.48689101144846e-05, "loss": 0.1803, "step": 7618 }, { "epoch": 1.5419955474600284, "grad_norm": 0.27257344126701355, "learning_rate": 2.4847922833554603e-05, "loss": 0.1823, "step": 7619 }, { "epoch": 1.5421979356405586, "grad_norm": 0.2625625431537628, "learning_rate": 2.4826943155723215e-05, "loss": 0.1701, "step": 7620 }, { "epoch": 1.5424003238210888, "grad_norm": 0.3060097396373749, "learning_rate": 2.4805971083112933e-05, "loss": 0.1876, "step": 7621 }, { "epoch": 1.542602712001619, "grad_norm": 0.2705547511577606, "learning_rate": 2.4785006617845497e-05, "loss": 0.1905, "step": 7622 }, { "epoch": 1.5428051001821492, "grad_norm": 0.2615959644317627, "learning_rate": 2.4764049762041874e-05, "loss": 0.1679, "step": 7623 }, { "epoch": 1.5430074883626796, "grad_norm": 0.2921803891658783, "learning_rate": 
2.474310051782225e-05, "loss": 0.1873, "step": 7624 }, { "epoch": 1.5432098765432098, "grad_norm": 0.2853137254714966, "learning_rate": 2.4722158887306047e-05, "loss": 0.1915, "step": 7625 }, { "epoch": 1.5434122647237403, "grad_norm": 0.32528531551361084, "learning_rate": 2.470122487261194e-05, "loss": 0.2114, "step": 7626 }, { "epoch": 1.5436146529042705, "grad_norm": 0.3797064423561096, "learning_rate": 2.468029847585781e-05, "loss": 0.2072, "step": 7627 }, { "epoch": 1.5438170410848007, "grad_norm": 0.23940971493721008, "learning_rate": 2.4659379699160746e-05, "loss": 0.1513, "step": 7628 }, { "epoch": 1.5440194292653309, "grad_norm": 0.28272753953933716, "learning_rate": 2.4638468544637093e-05, "loss": 0.1674, "step": 7629 }, { "epoch": 1.544221817445861, "grad_norm": 0.28421708941459656, "learning_rate": 2.4617565014402444e-05, "loss": 0.1915, "step": 7630 }, { "epoch": 1.5444242056263913, "grad_norm": 0.2440253049135208, "learning_rate": 2.459666911057158e-05, "loss": 0.1747, "step": 7631 }, { "epoch": 1.5446265938069217, "grad_norm": 0.26639822125434875, "learning_rate": 2.4575780835258544e-05, "loss": 0.1807, "step": 7632 }, { "epoch": 1.5448289819874519, "grad_norm": 0.30751553177833557, "learning_rate": 2.455490019057658e-05, "loss": 0.2205, "step": 7633 }, { "epoch": 1.5450313701679823, "grad_norm": 0.2998270094394684, "learning_rate": 2.4534027178638184e-05, "loss": 0.1782, "step": 7634 }, { "epoch": 1.5452337583485125, "grad_norm": 0.29298698902130127, "learning_rate": 2.451316180155505e-05, "loss": 0.1996, "step": 7635 }, { "epoch": 1.5454361465290427, "grad_norm": 0.317030131816864, "learning_rate": 2.4492304061438143e-05, "loss": 0.1811, "step": 7636 }, { "epoch": 1.545638534709573, "grad_norm": 0.2533614933490753, "learning_rate": 2.4471453960397617e-05, "loss": 0.1518, "step": 7637 }, { "epoch": 1.545840922890103, "grad_norm": 0.34969180822372437, "learning_rate": 2.4450611500542864e-05, "loss": 0.2098, "step": 7638 }, { "epoch": 
1.5460433110706335, "grad_norm": 0.26460447907447815, "learning_rate": 2.442977668398251e-05, "loss": 0.1834, "step": 7639 }, { "epoch": 1.5462456992511637, "grad_norm": 0.29677653312683105, "learning_rate": 2.44089495128244e-05, "loss": 0.1944, "step": 7640 }, { "epoch": 1.5464480874316942, "grad_norm": 0.24911251664161682, "learning_rate": 2.4388129989175613e-05, "loss": 0.1702, "step": 7641 }, { "epoch": 1.5466504756122244, "grad_norm": 0.28196683526039124, "learning_rate": 2.4367318115142446e-05, "loss": 0.1501, "step": 7642 }, { "epoch": 1.5468528637927546, "grad_norm": 0.32040929794311523, "learning_rate": 2.4346513892830423e-05, "loss": 0.2017, "step": 7643 }, { "epoch": 1.5470552519732848, "grad_norm": 0.319196492433548, "learning_rate": 2.432571732434431e-05, "loss": 0.2117, "step": 7644 }, { "epoch": 1.547257640153815, "grad_norm": 0.3500312268733978, "learning_rate": 2.4304928411788064e-05, "loss": 0.2345, "step": 7645 }, { "epoch": 1.5474600283343452, "grad_norm": 0.30522117018699646, "learning_rate": 2.4284147157264913e-05, "loss": 0.2093, "step": 7646 }, { "epoch": 1.5476624165148756, "grad_norm": 0.3220088481903076, "learning_rate": 2.4263373562877278e-05, "loss": 0.1842, "step": 7647 }, { "epoch": 1.5478648046954058, "grad_norm": 0.2923586368560791, "learning_rate": 2.42426076307268e-05, "loss": 0.2043, "step": 7648 }, { "epoch": 1.5480671928759362, "grad_norm": 0.2700873911380768, "learning_rate": 2.4221849362914373e-05, "loss": 0.1741, "step": 7649 }, { "epoch": 1.5482695810564664, "grad_norm": 0.2897738516330719, "learning_rate": 2.4201098761540098e-05, "loss": 0.179, "step": 7650 }, { "epoch": 1.5482695810564664, "eval_loss": 0.26571616530418396, "eval_runtime": 0.7387, "eval_samples_per_second": 6.769, "eval_steps_per_second": 1.354, "step": 7650 }, { "epoch": 1.5484719692369966, "grad_norm": 0.2596552073955536, "learning_rate": 2.4180355828703303e-05, "loss": 0.1644, "step": 7651 }, { "epoch": 1.5486743574175268, "grad_norm": 
0.2775457203388214, "learning_rate": 2.415962056650254e-05, "loss": 0.194, "step": 7652 }, { "epoch": 1.548876745598057, "grad_norm": 0.257010281085968, "learning_rate": 2.4138892977035576e-05, "loss": 0.173, "step": 7653 }, { "epoch": 1.5490791337785872, "grad_norm": 0.2591480612754822, "learning_rate": 2.4118173062399418e-05, "loss": 0.1568, "step": 7654 }, { "epoch": 1.5492815219591176, "grad_norm": 0.2812730371952057, "learning_rate": 2.40974608246903e-05, "loss": 0.1773, "step": 7655 }, { "epoch": 1.5494839101396478, "grad_norm": 0.3284202814102173, "learning_rate": 2.4076756266003652e-05, "loss": 0.1861, "step": 7656 }, { "epoch": 1.5496862983201782, "grad_norm": 0.3464701473712921, "learning_rate": 2.405605938843416e-05, "loss": 0.1946, "step": 7657 }, { "epoch": 1.5498886865007084, "grad_norm": 0.27741116285324097, "learning_rate": 2.40353701940757e-05, "loss": 0.199, "step": 7658 }, { "epoch": 1.5500910746812386, "grad_norm": 0.2622848451137543, "learning_rate": 2.4014688685021402e-05, "loss": 0.1846, "step": 7659 }, { "epoch": 1.5502934628617688, "grad_norm": 0.29812902212142944, "learning_rate": 2.399401486336359e-05, "loss": 0.1926, "step": 7660 }, { "epoch": 1.550495851042299, "grad_norm": 0.2542051672935486, "learning_rate": 2.3973348731193834e-05, "loss": 0.1708, "step": 7661 }, { "epoch": 1.5506982392228292, "grad_norm": 0.37435442209243774, "learning_rate": 2.395269029060292e-05, "loss": 0.1908, "step": 7662 }, { "epoch": 1.5509006274033597, "grad_norm": 0.28960543870925903, "learning_rate": 2.393203954368085e-05, "loss": 0.1697, "step": 7663 }, { "epoch": 1.5511030155838899, "grad_norm": 0.30785804986953735, "learning_rate": 2.3911396492516836e-05, "loss": 0.1839, "step": 7664 }, { "epoch": 1.5513054037644203, "grad_norm": 0.31761255860328674, "learning_rate": 2.3890761139199346e-05, "loss": 0.2377, "step": 7665 }, { "epoch": 1.5515077919449505, "grad_norm": 0.28986719250679016, "learning_rate": 2.387013348581604e-05, "loss": 0.2144, "step": 7666 
}, { "epoch": 1.5517101801254807, "grad_norm": 0.25136709213256836, "learning_rate": 2.3849513534453793e-05, "loss": 0.1715, "step": 7667 }, { "epoch": 1.5519125683060109, "grad_norm": 0.2762148082256317, "learning_rate": 2.3828901287198746e-05, "loss": 0.1847, "step": 7668 }, { "epoch": 1.552114956486541, "grad_norm": 0.3051958382129669, "learning_rate": 2.3808296746136195e-05, "loss": 0.2113, "step": 7669 }, { "epoch": 1.5523173446670715, "grad_norm": 0.2701307237148285, "learning_rate": 2.3787699913350724e-05, "loss": 0.191, "step": 7670 }, { "epoch": 1.5525197328476017, "grad_norm": 0.31726446747779846, "learning_rate": 2.3767110790926107e-05, "loss": 0.2092, "step": 7671 }, { "epoch": 1.5527221210281321, "grad_norm": 0.27608180046081543, "learning_rate": 2.3746529380945292e-05, "loss": 0.156, "step": 7672 }, { "epoch": 1.5529245092086623, "grad_norm": 0.2835337519645691, "learning_rate": 2.372595568549052e-05, "loss": 0.2033, "step": 7673 }, { "epoch": 1.5531268973891925, "grad_norm": 0.2694108188152313, "learning_rate": 2.370538970664321e-05, "loss": 0.1905, "step": 7674 }, { "epoch": 1.5533292855697227, "grad_norm": 0.2937160134315491, "learning_rate": 2.3684831446484025e-05, "loss": 0.1881, "step": 7675 }, { "epoch": 1.553531673750253, "grad_norm": 0.27962058782577515, "learning_rate": 2.366428090709283e-05, "loss": 0.1992, "step": 7676 }, { "epoch": 1.5537340619307831, "grad_norm": 0.31856977939605713, "learning_rate": 2.3643738090548706e-05, "loss": 0.2171, "step": 7677 }, { "epoch": 1.5539364501113135, "grad_norm": 0.31848418712615967, "learning_rate": 2.362320299892996e-05, "loss": 0.2048, "step": 7678 }, { "epoch": 1.5541388382918437, "grad_norm": 0.2699210047721863, "learning_rate": 2.360267563431413e-05, "loss": 0.2099, "step": 7679 }, { "epoch": 1.5543412264723742, "grad_norm": 0.3069915473461151, "learning_rate": 2.3582155998777954e-05, "loss": 0.2065, "step": 7680 }, { "epoch": 1.5545436146529044, "grad_norm": 0.24691063165664673, "learning_rate": 
2.3561644094397382e-05, "loss": 0.1647, "step": 7681 }, { "epoch": 1.5547460028334346, "grad_norm": 0.29513469338417053, "learning_rate": 2.3541139923247614e-05, "loss": 0.2003, "step": 7682 }, { "epoch": 1.5549483910139648, "grad_norm": 0.2617926001548767, "learning_rate": 2.3520643487403026e-05, "loss": 0.1643, "step": 7683 }, { "epoch": 1.555150779194495, "grad_norm": 0.26496002078056335, "learning_rate": 2.3500154788937244e-05, "loss": 0.2033, "step": 7684 }, { "epoch": 1.5553531673750252, "grad_norm": 0.3225070536136627, "learning_rate": 2.347967382992309e-05, "loss": 0.184, "step": 7685 }, { "epoch": 1.5555555555555556, "grad_norm": 0.2362879514694214, "learning_rate": 2.345920061243263e-05, "loss": 0.1651, "step": 7686 }, { "epoch": 1.5557579437360858, "grad_norm": 0.2885676920413971, "learning_rate": 2.3438735138537116e-05, "loss": 0.1702, "step": 7687 }, { "epoch": 1.5559603319166162, "grad_norm": 0.28008636832237244, "learning_rate": 2.341827741030702e-05, "loss": 0.1835, "step": 7688 }, { "epoch": 1.5561627200971464, "grad_norm": 0.27259331941604614, "learning_rate": 2.339782742981207e-05, "loss": 0.1785, "step": 7689 }, { "epoch": 1.5563651082776766, "grad_norm": 0.24857011437416077, "learning_rate": 2.337738519912115e-05, "loss": 0.1748, "step": 7690 }, { "epoch": 1.5565674964582068, "grad_norm": 0.28130772709846497, "learning_rate": 2.3356950720302405e-05, "loss": 0.1955, "step": 7691 }, { "epoch": 1.556769884638737, "grad_norm": 0.3307057023048401, "learning_rate": 2.3336523995423188e-05, "loss": 0.2333, "step": 7692 }, { "epoch": 1.5569722728192672, "grad_norm": 0.2997671067714691, "learning_rate": 2.331610502655005e-05, "loss": 0.1984, "step": 7693 }, { "epoch": 1.5571746609997976, "grad_norm": 0.25136569142341614, "learning_rate": 2.3295693815748763e-05, "loss": 0.1646, "step": 7694 }, { "epoch": 1.5573770491803278, "grad_norm": 0.2711356282234192, "learning_rate": 2.3275290365084336e-05, "loss": 0.2123, "step": 7695 }, { "epoch": 
1.5575794373608582, "grad_norm": 0.2791728377342224, "learning_rate": 2.3254894676620964e-05, "loss": 0.2155, "step": 7696 }, { "epoch": 1.5577818255413884, "grad_norm": 0.24361151456832886, "learning_rate": 2.323450675242207e-05, "loss": 0.1759, "step": 7697 }, { "epoch": 1.5579842137219186, "grad_norm": 0.2774006426334381, "learning_rate": 2.321412659455029e-05, "loss": 0.1939, "step": 7698 }, { "epoch": 1.5581866019024488, "grad_norm": 0.29601287841796875, "learning_rate": 2.3193754205067475e-05, "loss": 0.2084, "step": 7699 }, { "epoch": 1.558388990082979, "grad_norm": 0.2615942656993866, "learning_rate": 2.31733895860347e-05, "loss": 0.216, "step": 7700 }, { "epoch": 1.558388990082979, "eval_loss": 0.2588045597076416, "eval_runtime": 0.7405, "eval_samples_per_second": 6.752, "eval_steps_per_second": 1.35, "step": 7700 }, { "epoch": 1.5585913782635095, "grad_norm": 0.29083025455474854, "learning_rate": 2.3153032739512226e-05, "loss": 0.196, "step": 7701 }, { "epoch": 1.5587937664440397, "grad_norm": 0.27147766947746277, "learning_rate": 2.313268366755955e-05, "loss": 0.1938, "step": 7702 }, { "epoch": 1.55899615462457, "grad_norm": 0.2409001886844635, "learning_rate": 2.3112342372235395e-05, "loss": 0.1785, "step": 7703 }, { "epoch": 1.5591985428051003, "grad_norm": 0.2982461452484131, "learning_rate": 2.3092008855597657e-05, "loss": 0.1916, "step": 7704 }, { "epoch": 1.5594009309856305, "grad_norm": 0.30464914441108704, "learning_rate": 2.307168311970347e-05, "loss": 0.2271, "step": 7705 }, { "epoch": 1.5596033191661607, "grad_norm": 0.2842462956905365, "learning_rate": 2.3051365166609197e-05, "loss": 0.1751, "step": 7706 }, { "epoch": 1.5598057073466909, "grad_norm": 0.2824763357639313, "learning_rate": 2.303105499837037e-05, "loss": 0.192, "step": 7707 }, { "epoch": 1.560008095527221, "grad_norm": 0.27954190969467163, "learning_rate": 2.3010752617041786e-05, "loss": 0.183, "step": 7708 }, { "epoch": 1.5602104837077515, "grad_norm": 0.3235276937484741, 
"learning_rate": 2.299045802467741e-05, "loss": 0.2573, "step": 7709 }, { "epoch": 1.5604128718882817, "grad_norm": 0.2770065367221832, "learning_rate": 2.2970171223330438e-05, "loss": 0.197, "step": 7710 }, { "epoch": 1.5606152600688121, "grad_norm": 0.31573495268821716, "learning_rate": 2.294989221505327e-05, "loss": 0.219, "step": 7711 }, { "epoch": 1.5608176482493423, "grad_norm": 0.31908997893333435, "learning_rate": 2.292962100189754e-05, "loss": 0.2058, "step": 7712 }, { "epoch": 1.5610200364298725, "grad_norm": 0.2873445451259613, "learning_rate": 2.290935758591406e-05, "loss": 0.1732, "step": 7713 }, { "epoch": 1.5612224246104027, "grad_norm": 0.2437688261270523, "learning_rate": 2.2889101969152882e-05, "loss": 0.1679, "step": 7714 }, { "epoch": 1.561424812790933, "grad_norm": 0.2525308132171631, "learning_rate": 2.2868854153663243e-05, "loss": 0.1584, "step": 7715 }, { "epoch": 1.5616272009714631, "grad_norm": 0.3853948414325714, "learning_rate": 2.284861414149365e-05, "loss": 0.2164, "step": 7716 }, { "epoch": 1.5618295891519935, "grad_norm": 0.2611543536186218, "learning_rate": 2.282838193469171e-05, "loss": 0.1748, "step": 7717 }, { "epoch": 1.5620319773325237, "grad_norm": 0.27541646361351013, "learning_rate": 2.280815753530433e-05, "loss": 0.1741, "step": 7718 }, { "epoch": 1.5622343655130542, "grad_norm": 0.3103647232055664, "learning_rate": 2.2787940945377604e-05, "loss": 0.2263, "step": 7719 }, { "epoch": 1.5624367536935844, "grad_norm": 0.26702842116355896, "learning_rate": 2.2767732166956834e-05, "loss": 0.1878, "step": 7720 }, { "epoch": 1.5626391418741146, "grad_norm": 0.24968096613883972, "learning_rate": 2.2747531202086537e-05, "loss": 0.1762, "step": 7721 }, { "epoch": 1.5628415300546448, "grad_norm": 0.2748836278915405, "learning_rate": 2.2727338052810433e-05, "loss": 0.1903, "step": 7722 }, { "epoch": 1.563043918235175, "grad_norm": 0.2877596914768219, "learning_rate": 2.2707152721171455e-05, "loss": 0.1981, "step": 7723 }, { "epoch": 
1.5632463064157052, "grad_norm": 0.34964072704315186, "learning_rate": 2.2686975209211737e-05, "loss": 0.1866, "step": 7724 }, { "epoch": 1.5634486945962356, "grad_norm": 0.3263644874095917, "learning_rate": 2.2666805518972633e-05, "loss": 0.2138, "step": 7725 }, { "epoch": 1.5636510827767658, "grad_norm": 0.28281792998313904, "learning_rate": 2.2646643652494692e-05, "loss": 0.2022, "step": 7726 }, { "epoch": 1.5638534709572962, "grad_norm": 0.27802562713623047, "learning_rate": 2.2626489611817692e-05, "loss": 0.1902, "step": 7727 }, { "epoch": 1.5640558591378264, "grad_norm": 0.28225937485694885, "learning_rate": 2.26063433989806e-05, "loss": 0.2079, "step": 7728 }, { "epoch": 1.5642582473183566, "grad_norm": 0.2651384472846985, "learning_rate": 2.2586205016021612e-05, "loss": 0.1713, "step": 7729 }, { "epoch": 1.5644606354988868, "grad_norm": 0.4168740212917328, "learning_rate": 2.2566074464978092e-05, "loss": 0.2111, "step": 7730 }, { "epoch": 1.564663023679417, "grad_norm": 0.25438109040260315, "learning_rate": 2.254595174788665e-05, "loss": 0.1627, "step": 7731 }, { "epoch": 1.5648654118599474, "grad_norm": 0.352674275636673, "learning_rate": 2.25258368667831e-05, "loss": 0.2266, "step": 7732 }, { "epoch": 1.5650678000404776, "grad_norm": 0.2683711647987366, "learning_rate": 2.2505729823702458e-05, "loss": 0.1713, "step": 7733 }, { "epoch": 1.565270188221008, "grad_norm": 0.2740427553653717, "learning_rate": 2.2485630620678922e-05, "loss": 0.1937, "step": 7734 }, { "epoch": 1.5654725764015383, "grad_norm": 0.2754979431629181, "learning_rate": 2.2465539259745937e-05, "loss": 0.179, "step": 7735 }, { "epoch": 1.5656749645820685, "grad_norm": 0.29458338022232056, "learning_rate": 2.244545574293613e-05, "loss": 0.2165, "step": 7736 }, { "epoch": 1.5658773527625987, "grad_norm": 0.25274816155433655, "learning_rate": 2.2425380072281332e-05, "loss": 0.1507, "step": 7737 }, { "epoch": 1.5660797409431289, "grad_norm": 0.26597270369529724, "learning_rate": 
2.240531224981264e-05, "loss": 0.1598, "step": 7738 }, { "epoch": 1.566282129123659, "grad_norm": 0.29303765296936035, "learning_rate": 2.238525227756022e-05, "loss": 0.2107, "step": 7739 }, { "epoch": 1.5664845173041895, "grad_norm": 0.28918832540512085, "learning_rate": 2.2365200157553577e-05, "loss": 0.2219, "step": 7740 }, { "epoch": 1.5666869054847197, "grad_norm": 0.2689541280269623, "learning_rate": 2.2345155891821367e-05, "loss": 0.1744, "step": 7741 }, { "epoch": 1.56688929366525, "grad_norm": 0.3268440067768097, "learning_rate": 2.2325119482391467e-05, "loss": 0.2066, "step": 7742 }, { "epoch": 1.5670916818457803, "grad_norm": 0.3139539659023285, "learning_rate": 2.230509093129095e-05, "loss": 0.1822, "step": 7743 }, { "epoch": 1.5672940700263105, "grad_norm": 0.27447032928466797, "learning_rate": 2.228507024054608e-05, "loss": 0.1818, "step": 7744 }, { "epoch": 1.5674964582068407, "grad_norm": 0.2501983940601349, "learning_rate": 2.2265057412182343e-05, "loss": 0.1698, "step": 7745 }, { "epoch": 1.567698846387371, "grad_norm": 0.240719735622406, "learning_rate": 2.2245052448224445e-05, "loss": 0.1456, "step": 7746 }, { "epoch": 1.567901234567901, "grad_norm": 0.336001992225647, "learning_rate": 2.2225055350696267e-05, "loss": 0.1782, "step": 7747 }, { "epoch": 1.5681036227484315, "grad_norm": 0.28484591841697693, "learning_rate": 2.22050661216209e-05, "loss": 0.2143, "step": 7748 }, { "epoch": 1.5683060109289617, "grad_norm": 0.3016730844974518, "learning_rate": 2.2185084763020647e-05, "loss": 0.1828, "step": 7749 }, { "epoch": 1.5685083991094921, "grad_norm": 0.30280449986457825, "learning_rate": 2.2165111276916994e-05, "loss": 0.1791, "step": 7750 }, { "epoch": 1.5685083991094921, "eval_loss": 0.2602272033691406, "eval_runtime": 0.7375, "eval_samples_per_second": 6.78, "eval_steps_per_second": 1.356, "step": 7750 }, { "epoch": 1.5687107872900223, "grad_norm": 0.27848196029663086, "learning_rate": 2.214514566533069e-05, "loss": 0.1732, "step": 7751 }, { 
"epoch": 1.5689131754705525, "grad_norm": 0.29691216349601746, "learning_rate": 2.2125187930281633e-05, "loss": 0.2041, "step": 7752 }, { "epoch": 1.5691155636510827, "grad_norm": 0.26868849992752075, "learning_rate": 2.2105238073788937e-05, "loss": 0.1715, "step": 7753 }, { "epoch": 1.569317951831613, "grad_norm": 0.26445257663726807, "learning_rate": 2.20852960978709e-05, "loss": 0.1615, "step": 7754 }, { "epoch": 1.5695203400121431, "grad_norm": 0.27450621128082275, "learning_rate": 2.2065362004545053e-05, "loss": 0.1829, "step": 7755 }, { "epoch": 1.5697227281926736, "grad_norm": 0.2744404673576355, "learning_rate": 2.2045435795828128e-05, "loss": 0.1749, "step": 7756 }, { "epoch": 1.5699251163732038, "grad_norm": 0.2578592598438263, "learning_rate": 2.2025517473736035e-05, "loss": 0.1653, "step": 7757 }, { "epoch": 1.5701275045537342, "grad_norm": 0.3351926803588867, "learning_rate": 2.2005607040283905e-05, "loss": 0.1907, "step": 7758 }, { "epoch": 1.5703298927342644, "grad_norm": 0.30829042196273804, "learning_rate": 2.198570449748608e-05, "loss": 0.1687, "step": 7759 }, { "epoch": 1.5705322809147946, "grad_norm": 0.34289824962615967, "learning_rate": 2.19658098473561e-05, "loss": 0.2137, "step": 7760 }, { "epoch": 1.5707346690953248, "grad_norm": 0.24221114814281464, "learning_rate": 2.194592309190665e-05, "loss": 0.1739, "step": 7761 }, { "epoch": 1.570937057275855, "grad_norm": 0.2792688012123108, "learning_rate": 2.1926044233149678e-05, "loss": 0.207, "step": 7762 }, { "epoch": 1.5711394454563854, "grad_norm": 0.25298550724983215, "learning_rate": 2.190617327309634e-05, "loss": 0.1634, "step": 7763 }, { "epoch": 1.5713418336369156, "grad_norm": 0.27479955554008484, "learning_rate": 2.1886310213756965e-05, "loss": 0.1746, "step": 7764 }, { "epoch": 1.571544221817446, "grad_norm": 0.2710821330547333, "learning_rate": 2.1866455057141078e-05, "loss": 0.1784, "step": 7765 }, { "epoch": 1.5717466099979762, "grad_norm": 0.3006027340888977, "learning_rate": 
2.1846607805257426e-05, "loss": 0.1959, "step": 7766 }, { "epoch": 1.5719489981785064, "grad_norm": 0.3054922819137573, "learning_rate": 2.1826768460113943e-05, "loss": 0.1932, "step": 7767 }, { "epoch": 1.5721513863590366, "grad_norm": 0.3049944043159485, "learning_rate": 2.1806937023717767e-05, "loss": 0.2066, "step": 7768 }, { "epoch": 1.5723537745395668, "grad_norm": 0.2929500639438629, "learning_rate": 2.178711349807523e-05, "loss": 0.1834, "step": 7769 }, { "epoch": 1.572556162720097, "grad_norm": 0.2628859877586365, "learning_rate": 2.1767297885191862e-05, "loss": 0.1958, "step": 7770 }, { "epoch": 1.5727585509006274, "grad_norm": 0.30760657787323, "learning_rate": 2.1747490187072418e-05, "loss": 0.1956, "step": 7771 }, { "epoch": 1.5729609390811576, "grad_norm": 0.2853303551673889, "learning_rate": 2.1727690405720814e-05, "loss": 0.1978, "step": 7772 }, { "epoch": 1.573163327261688, "grad_norm": 0.2791989743709564, "learning_rate": 2.1707898543140203e-05, "loss": 0.1951, "step": 7773 }, { "epoch": 1.5733657154422183, "grad_norm": 0.3224547207355499, "learning_rate": 2.16881146013329e-05, "loss": 0.1915, "step": 7774 }, { "epoch": 1.5735681036227485, "grad_norm": 0.23901736736297607, "learning_rate": 2.166833858230045e-05, "loss": 0.1616, "step": 7775 }, { "epoch": 1.5737704918032787, "grad_norm": 0.3068159520626068, "learning_rate": 2.1648570488043575e-05, "loss": 0.1809, "step": 7776 }, { "epoch": 1.5739728799838089, "grad_norm": 0.2887951135635376, "learning_rate": 2.162881032056221e-05, "loss": 0.2189, "step": 7777 }, { "epoch": 1.574175268164339, "grad_norm": 0.27136558294296265, "learning_rate": 2.160905808185547e-05, "loss": 0.194, "step": 7778 }, { "epoch": 1.5743776563448695, "grad_norm": 0.26406726241111755, "learning_rate": 2.1589313773921684e-05, "loss": 0.1766, "step": 7779 }, { "epoch": 1.5745800445253997, "grad_norm": 0.369335800409317, "learning_rate": 2.156957739875838e-05, "loss": 0.2335, "step": 7780 }, { "epoch": 1.57478243270593, 
"grad_norm": 0.2897554934024811, "learning_rate": 2.154984895836227e-05, "loss": 0.1566, "step": 7781 }, { "epoch": 1.5749848208864603, "grad_norm": 0.26279720664024353, "learning_rate": 2.1530128454729315e-05, "loss": 0.1961, "step": 7782 }, { "epoch": 1.5751872090669905, "grad_norm": 0.32653093338012695, "learning_rate": 2.1510415889854553e-05, "loss": 0.2299, "step": 7783 }, { "epoch": 1.5753895972475207, "grad_norm": 0.34345322847366333, "learning_rate": 2.1490711265732332e-05, "loss": 0.215, "step": 7784 }, { "epoch": 1.575591985428051, "grad_norm": 0.30096638202667236, "learning_rate": 2.147101458435615e-05, "loss": 0.1803, "step": 7785 }, { "epoch": 1.575794373608581, "grad_norm": 0.25723904371261597, "learning_rate": 2.1451325847718716e-05, "loss": 0.1529, "step": 7786 }, { "epoch": 1.5759967617891115, "grad_norm": 0.30034515261650085, "learning_rate": 2.1431645057811943e-05, "loss": 0.1939, "step": 7787 }, { "epoch": 1.5761991499696417, "grad_norm": 0.33007681369781494, "learning_rate": 2.141197221662691e-05, "loss": 0.216, "step": 7788 }, { "epoch": 1.5764015381501721, "grad_norm": 0.2949335277080536, "learning_rate": 2.1392307326153903e-05, "loss": 0.2043, "step": 7789 }, { "epoch": 1.5766039263307023, "grad_norm": 0.24349358677864075, "learning_rate": 2.137265038838243e-05, "loss": 0.158, "step": 7790 }, { "epoch": 1.5768063145112325, "grad_norm": 0.2552421987056732, "learning_rate": 2.1353001405301155e-05, "loss": 0.1746, "step": 7791 }, { "epoch": 1.5770087026917627, "grad_norm": 0.2736150324344635, "learning_rate": 2.133336037889797e-05, "loss": 0.1802, "step": 7792 }, { "epoch": 1.577211090872293, "grad_norm": 0.31855508685112, "learning_rate": 2.1313727311159948e-05, "loss": 0.2008, "step": 7793 }, { "epoch": 1.5774134790528234, "grad_norm": 0.2970321476459503, "learning_rate": 2.129410220407334e-05, "loss": 0.2028, "step": 7794 }, { "epoch": 1.5776158672333536, "grad_norm": 0.25379669666290283, "learning_rate": 2.127448505962363e-05, "loss": 
0.189, "step": 7795 }, { "epoch": 1.577818255413884, "grad_norm": 0.25099897384643555, "learning_rate": 2.1254875879795454e-05, "loss": 0.1859, "step": 7796 }, { "epoch": 1.5780206435944142, "grad_norm": 0.32417258620262146, "learning_rate": 2.123527466657268e-05, "loss": 0.2089, "step": 7797 }, { "epoch": 1.5782230317749444, "grad_norm": 0.33070147037506104, "learning_rate": 2.1215681421938338e-05, "loss": 0.1779, "step": 7798 }, { "epoch": 1.5784254199554746, "grad_norm": 0.312674343585968, "learning_rate": 2.1196096147874677e-05, "loss": 0.2029, "step": 7799 }, { "epoch": 1.5786278081360048, "grad_norm": 0.33172622323036194, "learning_rate": 2.1176518846363136e-05, "loss": 0.2064, "step": 7800 }, { "epoch": 1.5786278081360048, "eval_loss": 0.25997477769851685, "eval_runtime": 0.741, "eval_samples_per_second": 6.748, "eval_steps_per_second": 1.35, "step": 7800 }, { "epoch": 1.578830196316535, "grad_norm": 0.34337952733039856, "learning_rate": 2.1156949519384328e-05, "loss": 0.2133, "step": 7801 }, { "epoch": 1.5790325844970654, "grad_norm": 0.27992233633995056, "learning_rate": 2.113738816891808e-05, "loss": 0.1891, "step": 7802 }, { "epoch": 1.5792349726775956, "grad_norm": 0.35382652282714844, "learning_rate": 2.1117834796943392e-05, "loss": 0.1931, "step": 7803 }, { "epoch": 1.579437360858126, "grad_norm": 0.24651670455932617, "learning_rate": 2.1098289405438487e-05, "loss": 0.1603, "step": 7804 }, { "epoch": 1.5796397490386562, "grad_norm": 0.28411322832107544, "learning_rate": 2.107875199638075e-05, "loss": 0.1937, "step": 7805 }, { "epoch": 1.5798421372191864, "grad_norm": 0.26361873745918274, "learning_rate": 2.1059222571746785e-05, "loss": 0.1676, "step": 7806 }, { "epoch": 1.5800445253997166, "grad_norm": 0.2438620775938034, "learning_rate": 2.1039701133512346e-05, "loss": 0.1646, "step": 7807 }, { "epoch": 1.5802469135802468, "grad_norm": 0.27672767639160156, "learning_rate": 2.102018768365244e-05, "loss": 0.1771, "step": 7808 }, { "epoch": 
1.580449301760777, "grad_norm": 0.2786938548088074, "learning_rate": 2.100068222414121e-05, "loss": 0.1899, "step": 7809 }, { "epoch": 1.5806516899413074, "grad_norm": 0.2908424139022827, "learning_rate": 2.098118475695202e-05, "loss": 0.1776, "step": 7810 }, { "epoch": 1.5808540781218376, "grad_norm": 0.2723267078399658, "learning_rate": 2.0961695284057438e-05, "loss": 0.185, "step": 7811 }, { "epoch": 1.581056466302368, "grad_norm": 0.30924347043037415, "learning_rate": 2.0942213807429166e-05, "loss": 0.2106, "step": 7812 }, { "epoch": 1.5812588544828983, "grad_norm": 0.2639710605144501, "learning_rate": 2.092274032903817e-05, "loss": 0.1744, "step": 7813 }, { "epoch": 1.5814612426634285, "grad_norm": 0.28179532289505005, "learning_rate": 2.090327485085456e-05, "loss": 0.1761, "step": 7814 }, { "epoch": 1.5816636308439587, "grad_norm": 0.26997724175453186, "learning_rate": 2.0883817374847646e-05, "loss": 0.1628, "step": 7815 }, { "epoch": 1.5818660190244889, "grad_norm": 0.26997148990631104, "learning_rate": 2.0864367902985927e-05, "loss": 0.197, "step": 7816 }, { "epoch": 1.582068407205019, "grad_norm": 0.3067236542701721, "learning_rate": 2.0844926437237112e-05, "loss": 0.2307, "step": 7817 }, { "epoch": 1.5822707953855495, "grad_norm": 0.3065381944179535, "learning_rate": 2.082549297956806e-05, "loss": 0.2012, "step": 7818 }, { "epoch": 1.5824731835660797, "grad_norm": 0.28395718336105347, "learning_rate": 2.0806067531944874e-05, "loss": 0.1967, "step": 7819 }, { "epoch": 1.58267557174661, "grad_norm": 0.27498477697372437, "learning_rate": 2.0786650096332805e-05, "loss": 0.1584, "step": 7820 }, { "epoch": 1.5828779599271403, "grad_norm": 0.3125120997428894, "learning_rate": 2.0767240674696297e-05, "loss": 0.1938, "step": 7821 }, { "epoch": 1.5830803481076705, "grad_norm": 0.3111341595649719, "learning_rate": 2.0747839268998994e-05, "loss": 0.1993, "step": 7822 }, { "epoch": 1.5832827362882007, "grad_norm": 0.28946933150291443, "learning_rate": 
2.072844588120374e-05, "loss": 0.1821, "step": 7823 }, { "epoch": 1.583485124468731, "grad_norm": 0.25271058082580566, "learning_rate": 2.070906051327254e-05, "loss": 0.1729, "step": 7824 }, { "epoch": 1.5836875126492613, "grad_norm": 0.293929785490036, "learning_rate": 2.0689683167166597e-05, "loss": 0.178, "step": 7825 }, { "epoch": 1.5838899008297915, "grad_norm": 0.36047351360321045, "learning_rate": 2.0670313844846335e-05, "loss": 0.179, "step": 7826 }, { "epoch": 1.584092289010322, "grad_norm": 0.2902386486530304, "learning_rate": 2.065095254827133e-05, "loss": 0.2103, "step": 7827 }, { "epoch": 1.5842946771908522, "grad_norm": 0.26246702671051025, "learning_rate": 2.0631599279400328e-05, "loss": 0.1896, "step": 7828 }, { "epoch": 1.5844970653713824, "grad_norm": 0.2978105843067169, "learning_rate": 2.0612254040191314e-05, "loss": 0.207, "step": 7829 }, { "epoch": 1.5846994535519126, "grad_norm": 0.2901962995529175, "learning_rate": 2.0592916832601428e-05, "loss": 0.1672, "step": 7830 }, { "epoch": 1.5849018417324428, "grad_norm": 0.29052335023880005, "learning_rate": 2.0573587658587002e-05, "loss": 0.1933, "step": 7831 }, { "epoch": 1.585104229912973, "grad_norm": 0.30138322710990906, "learning_rate": 2.055426652010356e-05, "loss": 0.2104, "step": 7832 }, { "epoch": 1.5853066180935034, "grad_norm": 0.29424503445625305, "learning_rate": 2.0534953419105828e-05, "loss": 0.2047, "step": 7833 }, { "epoch": 1.5855090062740336, "grad_norm": 0.25280043482780457, "learning_rate": 2.051564835754769e-05, "loss": 0.1862, "step": 7834 }, { "epoch": 1.585711394454564, "grad_norm": 0.21730902791023254, "learning_rate": 2.0496351337382224e-05, "loss": 0.1382, "step": 7835 }, { "epoch": 1.5859137826350942, "grad_norm": 0.26311731338500977, "learning_rate": 2.0477062360561716e-05, "loss": 0.1818, "step": 7836 }, { "epoch": 1.5861161708156244, "grad_norm": 0.30090418457984924, "learning_rate": 2.0457781429037604e-05, "loss": 0.2145, "step": 7837 }, { "epoch": 
1.5863185589961546, "grad_norm": 0.2743259072303772, "learning_rate": 2.043850854476055e-05, "loss": 0.1798, "step": 7838 }, { "epoch": 1.5865209471766848, "grad_norm": 0.2695198059082031, "learning_rate": 2.041924370968037e-05, "loss": 0.1826, "step": 7839 }, { "epoch": 1.586723335357215, "grad_norm": 0.25866490602493286, "learning_rate": 2.0399986925746072e-05, "loss": 0.1824, "step": 7840 }, { "epoch": 1.5869257235377454, "grad_norm": 0.28438708186149597, "learning_rate": 2.038073819490587e-05, "loss": 0.2106, "step": 7841 }, { "epoch": 1.5871281117182756, "grad_norm": 0.29071030020713806, "learning_rate": 2.0361497519107144e-05, "loss": 0.1937, "step": 7842 }, { "epoch": 1.587330499898806, "grad_norm": 0.257099986076355, "learning_rate": 2.034226490029646e-05, "loss": 0.1674, "step": 7843 }, { "epoch": 1.5875328880793362, "grad_norm": 0.31228503584861755, "learning_rate": 2.0323040340419575e-05, "loss": 0.1575, "step": 7844 }, { "epoch": 1.5877352762598664, "grad_norm": 0.2626998722553253, "learning_rate": 2.030382384142142e-05, "loss": 0.1673, "step": 7845 }, { "epoch": 1.5879376644403966, "grad_norm": 0.25823095440864563, "learning_rate": 2.0284615405246132e-05, "loss": 0.1646, "step": 7846 }, { "epoch": 1.5881400526209268, "grad_norm": 0.3161360025405884, "learning_rate": 2.026541503383702e-05, "loss": 0.2215, "step": 7847 }, { "epoch": 1.588342440801457, "grad_norm": 0.27048152685165405, "learning_rate": 2.0246222729136565e-05, "loss": 0.1853, "step": 7848 }, { "epoch": 1.5885448289819875, "grad_norm": 0.3523916006088257, "learning_rate": 2.022703849308645e-05, "loss": 0.1951, "step": 7849 }, { "epoch": 1.5887472171625177, "grad_norm": 0.2627606987953186, "learning_rate": 2.0207862327627526e-05, "loss": 0.1871, "step": 7850 }, { "epoch": 1.5887472171625177, "eval_loss": 0.2618047893047333, "eval_runtime": 0.7404, "eval_samples_per_second": 6.753, "eval_steps_per_second": 1.351, "step": 7850 }, { "epoch": 1.588949605343048, "grad_norm": 0.33208194375038147, 
"learning_rate": 2.0188694234699835e-05, "loss": 0.1907, "step": 7851 }, { "epoch": 1.5891519935235783, "grad_norm": 0.32101473212242126, "learning_rate": 2.0169534216242626e-05, "loss": 0.196, "step": 7852 }, { "epoch": 1.5893543817041085, "grad_norm": 0.3157752454280853, "learning_rate": 2.015038227419428e-05, "loss": 0.1779, "step": 7853 }, { "epoch": 1.5895567698846387, "grad_norm": 0.32359275221824646, "learning_rate": 2.0131238410492416e-05, "loss": 0.1995, "step": 7854 }, { "epoch": 1.5897591580651689, "grad_norm": 0.2650693356990814, "learning_rate": 2.011210262707379e-05, "loss": 0.1822, "step": 7855 }, { "epoch": 1.5899615462456993, "grad_norm": 0.24113725125789642, "learning_rate": 2.0092974925874365e-05, "loss": 0.1699, "step": 7856 }, { "epoch": 1.5901639344262295, "grad_norm": 0.2838568687438965, "learning_rate": 2.007385530882928e-05, "loss": 0.1952, "step": 7857 }, { "epoch": 1.59036632260676, "grad_norm": 0.2544390857219696, "learning_rate": 2.0054743777872864e-05, "loss": 0.148, "step": 7858 }, { "epoch": 1.5905687107872901, "grad_norm": 0.2745042145252228, "learning_rate": 2.003564033493862e-05, "loss": 0.1686, "step": 7859 }, { "epoch": 1.5907710989678203, "grad_norm": 0.29645898938179016, "learning_rate": 2.001654498195922e-05, "loss": 0.2024, "step": 7860 }, { "epoch": 1.5909734871483505, "grad_norm": 0.2774428129196167, "learning_rate": 1.999745772086655e-05, "loss": 0.1713, "step": 7861 }, { "epoch": 1.5911758753288807, "grad_norm": 0.2696341872215271, "learning_rate": 1.997837855359165e-05, "loss": 0.1901, "step": 7862 }, { "epoch": 1.591378263509411, "grad_norm": 0.26396724581718445, "learning_rate": 1.995930748206475e-05, "loss": 0.177, "step": 7863 }, { "epoch": 1.5915806516899413, "grad_norm": 0.2746181786060333, "learning_rate": 1.9940244508215255e-05, "loss": 0.1755, "step": 7864 }, { "epoch": 1.5917830398704715, "grad_norm": 0.22371596097946167, "learning_rate": 1.9921189633971772e-05, "loss": 0.1732, "step": 7865 }, { "epoch": 
1.591985428051002, "grad_norm": 0.25550487637519836, "learning_rate": 1.9902142861262063e-05, "loss": 0.1551, "step": 7866 }, { "epoch": 1.5921878162315322, "grad_norm": 0.27655959129333496, "learning_rate": 1.988310419201308e-05, "loss": 0.206, "step": 7867 }, { "epoch": 1.5923902044120624, "grad_norm": 0.2964775860309601, "learning_rate": 1.9864073628150958e-05, "loss": 0.1677, "step": 7868 }, { "epoch": 1.5925925925925926, "grad_norm": 0.2599323093891144, "learning_rate": 1.9845051171601005e-05, "loss": 0.1946, "step": 7869 }, { "epoch": 1.5927949807731228, "grad_norm": 0.27363482117652893, "learning_rate": 1.982603682428772e-05, "loss": 0.211, "step": 7870 }, { "epoch": 1.592997368953653, "grad_norm": 0.322819322347641, "learning_rate": 1.98070305881348e-05, "loss": 0.2174, "step": 7871 }, { "epoch": 1.5931997571341834, "grad_norm": 0.3303944170475006, "learning_rate": 1.9788032465065054e-05, "loss": 0.1916, "step": 7872 }, { "epoch": 1.5934021453147136, "grad_norm": 0.24966974556446075, "learning_rate": 1.976904245700052e-05, "loss": 0.1766, "step": 7873 }, { "epoch": 1.593604533495244, "grad_norm": 0.27133068442344666, "learning_rate": 1.9750060565862417e-05, "loss": 0.1797, "step": 7874 }, { "epoch": 1.5938069216757742, "grad_norm": 0.3636033535003662, "learning_rate": 1.973108679357113e-05, "loss": 0.2262, "step": 7875 }, { "epoch": 1.5940093098563044, "grad_norm": 0.3111708164215088, "learning_rate": 1.9712121142046237e-05, "loss": 0.1893, "step": 7876 }, { "epoch": 1.5942116980368346, "grad_norm": 0.3064495325088501, "learning_rate": 1.969316361320647e-05, "loss": 0.1886, "step": 7877 }, { "epoch": 1.5944140862173648, "grad_norm": 0.2953830361366272, "learning_rate": 1.9674214208969754e-05, "loss": 0.2083, "step": 7878 }, { "epoch": 1.594616474397895, "grad_norm": 0.29661932587623596, "learning_rate": 1.9655272931253197e-05, "loss": 0.198, "step": 7879 }, { "epoch": 1.5948188625784254, "grad_norm": 0.3036056458950043, "learning_rate": 
1.963633978197308e-05, "loss": 0.2531, "step": 7880 }, { "epoch": 1.5950212507589556, "grad_norm": 0.24966773390769958, "learning_rate": 1.961741476304486e-05, "loss": 0.1692, "step": 7881 }, { "epoch": 1.595223638939486, "grad_norm": 0.2774880528450012, "learning_rate": 1.959849787638317e-05, "loss": 0.2304, "step": 7882 }, { "epoch": 1.5954260271200162, "grad_norm": 0.2703644931316376, "learning_rate": 1.957958912390182e-05, "loss": 0.207, "step": 7883 }, { "epoch": 1.5956284153005464, "grad_norm": 0.308747798204422, "learning_rate": 1.956068850751379e-05, "loss": 0.2115, "step": 7884 }, { "epoch": 1.5958308034810766, "grad_norm": 0.2647557854652405, "learning_rate": 1.9541796029131278e-05, "loss": 0.1739, "step": 7885 }, { "epoch": 1.5960331916616068, "grad_norm": 0.3066260814666748, "learning_rate": 1.9522911690665592e-05, "loss": 0.2116, "step": 7886 }, { "epoch": 1.5962355798421373, "grad_norm": 0.2615651488304138, "learning_rate": 1.950403549402726e-05, "loss": 0.1891, "step": 7887 }, { "epoch": 1.5964379680226675, "grad_norm": 0.28497937321662903, "learning_rate": 1.9485167441125995e-05, "loss": 0.1681, "step": 7888 }, { "epoch": 1.5966403562031979, "grad_norm": 0.2624446153640747, "learning_rate": 1.9466307533870643e-05, "loss": 0.1786, "step": 7889 }, { "epoch": 1.596842744383728, "grad_norm": 0.31461378931999207, "learning_rate": 1.9447455774169276e-05, "loss": 0.1967, "step": 7890 }, { "epoch": 1.5970451325642583, "grad_norm": 0.31892645359039307, "learning_rate": 1.9428612163929093e-05, "loss": 0.1966, "step": 7891 }, { "epoch": 1.5972475207447885, "grad_norm": 0.27543261647224426, "learning_rate": 1.9409776705056516e-05, "loss": 0.1968, "step": 7892 }, { "epoch": 1.5974499089253187, "grad_norm": 0.2699550986289978, "learning_rate": 1.9390949399457104e-05, "loss": 0.2061, "step": 7893 }, { "epoch": 1.5976522971058489, "grad_norm": 0.269255667924881, "learning_rate": 1.9372130249035638e-05, "loss": 0.2118, "step": 7894 }, { "epoch": 1.5978546852863793, 
"grad_norm": 0.26168566942214966, "learning_rate": 1.935331925569599e-05, "loss": 0.1675, "step": 7895 }, { "epoch": 1.5980570734669095, "grad_norm": 0.25531288981437683, "learning_rate": 1.9334516421341276e-05, "loss": 0.1679, "step": 7896 }, { "epoch": 1.59825946164744, "grad_norm": 0.2488667219877243, "learning_rate": 1.931572174787378e-05, "loss": 0.1667, "step": 7897 }, { "epoch": 1.5984618498279701, "grad_norm": 0.2918820381164551, "learning_rate": 1.929693523719496e-05, "loss": 0.2051, "step": 7898 }, { "epoch": 1.5986642380085003, "grad_norm": 0.3507806360721588, "learning_rate": 1.927815689120541e-05, "loss": 0.2313, "step": 7899 }, { "epoch": 1.5988666261890305, "grad_norm": 0.26194295287132263, "learning_rate": 1.925938671180495e-05, "loss": 0.2001, "step": 7900 }, { "epoch": 1.5988666261890305, "eval_loss": 0.2608179450035095, "eval_runtime": 0.7369, "eval_samples_per_second": 6.785, "eval_steps_per_second": 1.357, "step": 7900 }, { "epoch": 1.5990690143695607, "grad_norm": 0.29185500741004944, "learning_rate": 1.924062470089253e-05, "loss": 0.1777, "step": 7901 }, { "epoch": 1.599271402550091, "grad_norm": 0.30840426683425903, "learning_rate": 1.922187086036632e-05, "loss": 0.2055, "step": 7902 }, { "epoch": 1.5994737907306213, "grad_norm": 0.2773885428905487, "learning_rate": 1.9203125192123584e-05, "loss": 0.203, "step": 7903 }, { "epoch": 1.5996761789111515, "grad_norm": 0.2688390910625458, "learning_rate": 1.918438769806088e-05, "loss": 0.1963, "step": 7904 }, { "epoch": 1.599878567091682, "grad_norm": 0.3111615478992462, "learning_rate": 1.9165658380073838e-05, "loss": 0.1637, "step": 7905 }, { "epoch": 1.6000809552722122, "grad_norm": 0.2527560293674469, "learning_rate": 1.9146937240057295e-05, "loss": 0.1517, "step": 7906 }, { "epoch": 1.6002833434527424, "grad_norm": 0.3315982222557068, "learning_rate": 1.912822427990526e-05, "loss": 0.1985, "step": 7907 }, { "epoch": 1.6004857316332726, "grad_norm": 0.3067420423030853, "learning_rate": 
1.9109519501510907e-05, "loss": 0.198, "step": 7908 }, { "epoch": 1.6006881198138028, "grad_norm": 0.2605978846549988, "learning_rate": 1.9090822906766616e-05, "loss": 0.1716, "step": 7909 }, { "epoch": 1.600890507994333, "grad_norm": 0.25154924392700195, "learning_rate": 1.9072134497563877e-05, "loss": 0.1828, "step": 7910 }, { "epoch": 1.6010928961748634, "grad_norm": 0.32552117109298706, "learning_rate": 1.9053454275793403e-05, "loss": 0.1962, "step": 7911 }, { "epoch": 1.6012952843553938, "grad_norm": 0.25949302315711975, "learning_rate": 1.903478224334507e-05, "loss": 0.1519, "step": 7912 }, { "epoch": 1.601497672535924, "grad_norm": 0.2832423746585846, "learning_rate": 1.9016118402107907e-05, "loss": 0.1901, "step": 7913 }, { "epoch": 1.6017000607164542, "grad_norm": 0.30006927251815796, "learning_rate": 1.899746275397014e-05, "loss": 0.2194, "step": 7914 }, { "epoch": 1.6019024488969844, "grad_norm": 0.2572685480117798, "learning_rate": 1.897881530081913e-05, "loss": 0.1503, "step": 7915 }, { "epoch": 1.6021048370775146, "grad_norm": 0.25450223684310913, "learning_rate": 1.8960176044541468e-05, "loss": 0.1649, "step": 7916 }, { "epoch": 1.6023072252580448, "grad_norm": 0.25192636251449585, "learning_rate": 1.894154498702283e-05, "loss": 0.1719, "step": 7917 }, { "epoch": 1.6025096134385752, "grad_norm": 0.2704130709171295, "learning_rate": 1.8922922130148135e-05, "loss": 0.2517, "step": 7918 }, { "epoch": 1.6027120016191054, "grad_norm": 0.24531984329223633, "learning_rate": 1.8904307475801453e-05, "loss": 0.1886, "step": 7919 }, { "epoch": 1.6029143897996359, "grad_norm": 0.25487950444221497, "learning_rate": 1.8885701025865998e-05, "loss": 0.1867, "step": 7920 }, { "epoch": 1.603116777980166, "grad_norm": 0.2579203248023987, "learning_rate": 1.88671027822242e-05, "loss": 0.1704, "step": 7921 }, { "epoch": 1.6033191661606963, "grad_norm": 0.2408556193113327, "learning_rate": 1.884851274675763e-05, "loss": 0.1505, "step": 7922 }, { "epoch": 
1.6035215543412265, "grad_norm": 0.3046492040157318, "learning_rate": 1.8829930921347016e-05, "loss": 0.1735, "step": 7923 }, { "epoch": 1.6037239425217567, "grad_norm": 0.272602915763855, "learning_rate": 1.8811357307872292e-05, "loss": 0.1881, "step": 7924 }, { "epoch": 1.6039263307022869, "grad_norm": 0.2787960469722748, "learning_rate": 1.8792791908212527e-05, "loss": 0.1472, "step": 7925 }, { "epoch": 1.6041287188828173, "grad_norm": 0.26666730642318726, "learning_rate": 1.8774234724245977e-05, "loss": 0.1689, "step": 7926 }, { "epoch": 1.6043311070633475, "grad_norm": 0.2414676994085312, "learning_rate": 1.875568575785007e-05, "loss": 0.1747, "step": 7927 }, { "epoch": 1.604533495243878, "grad_norm": 0.2558421492576599, "learning_rate": 1.8737145010901392e-05, "loss": 0.1747, "step": 7928 }, { "epoch": 1.604735883424408, "grad_norm": 0.2758699357509613, "learning_rate": 1.87186124852757e-05, "loss": 0.2173, "step": 7929 }, { "epoch": 1.6049382716049383, "grad_norm": 0.26736441254615784, "learning_rate": 1.870008818284792e-05, "loss": 0.1528, "step": 7930 }, { "epoch": 1.6051406597854685, "grad_norm": 0.310761034488678, "learning_rate": 1.868157210549215e-05, "loss": 0.2001, "step": 7931 }, { "epoch": 1.6053430479659987, "grad_norm": 0.299629807472229, "learning_rate": 1.866306425508164e-05, "loss": 0.192, "step": 7932 }, { "epoch": 1.605545436146529, "grad_norm": 0.29254329204559326, "learning_rate": 1.8644564633488836e-05, "loss": 0.1932, "step": 7933 }, { "epoch": 1.6057478243270593, "grad_norm": 0.2738923132419586, "learning_rate": 1.862607324258534e-05, "loss": 0.213, "step": 7934 }, { "epoch": 1.6059502125075895, "grad_norm": 0.26249390840530396, "learning_rate": 1.860759008424189e-05, "loss": 0.1909, "step": 7935 }, { "epoch": 1.60615260068812, "grad_norm": 0.3084731996059418, "learning_rate": 1.858911516032844e-05, "loss": 0.2246, "step": 7936 }, { "epoch": 1.6063549888686501, "grad_norm": 0.2792483866214752, "learning_rate": 1.857064847271409e-05, 
"loss": 0.1892, "step": 7937 }, { "epoch": 1.6065573770491803, "grad_norm": 0.30546650290489197, "learning_rate": 1.8552190023267112e-05, "loss": 0.1726, "step": 7938 }, { "epoch": 1.6067597652297105, "grad_norm": 0.2785407304763794, "learning_rate": 1.8533739813854912e-05, "loss": 0.1678, "step": 7939 }, { "epoch": 1.6069621534102407, "grad_norm": 0.3166220486164093, "learning_rate": 1.8515297846344093e-05, "loss": 0.2157, "step": 7940 }, { "epoch": 1.607164541590771, "grad_norm": 0.30899468064308167, "learning_rate": 1.8496864122600434e-05, "loss": 0.1936, "step": 7941 }, { "epoch": 1.6073669297713014, "grad_norm": 0.25260448455810547, "learning_rate": 1.847843864448886e-05, "loss": 0.1749, "step": 7942 }, { "epoch": 1.6075693179518318, "grad_norm": 0.2828786373138428, "learning_rate": 1.846002141387346e-05, "loss": 0.1664, "step": 7943 }, { "epoch": 1.607771706132362, "grad_norm": 0.2762792110443115, "learning_rate": 1.8441612432617517e-05, "loss": 0.1799, "step": 7944 }, { "epoch": 1.6079740943128922, "grad_norm": 0.2897892892360687, "learning_rate": 1.8423211702583442e-05, "loss": 0.2078, "step": 7945 }, { "epoch": 1.6081764824934224, "grad_norm": 0.2666051685810089, "learning_rate": 1.840481922563283e-05, "loss": 0.1653, "step": 7946 }, { "epoch": 1.6083788706739526, "grad_norm": 0.2855166792869568, "learning_rate": 1.8386435003626436e-05, "loss": 0.1844, "step": 7947 }, { "epoch": 1.6085812588544828, "grad_norm": 0.22759123146533966, "learning_rate": 1.8368059038424192e-05, "loss": 0.1561, "step": 7948 }, { "epoch": 1.6087836470350132, "grad_norm": 0.2589799761772156, "learning_rate": 1.8349691331885178e-05, "loss": 0.1857, "step": 7949 }, { "epoch": 1.6089860352155434, "grad_norm": 0.3158065974712372, "learning_rate": 1.8331331885867643e-05, "loss": 0.2109, "step": 7950 }, { "epoch": 1.6089860352155434, "eval_loss": 0.2593998312950134, "eval_runtime": 0.7395, "eval_samples_per_second": 6.762, "eval_steps_per_second": 1.352, "step": 7950 }, { "epoch": 
1.6091884233960738, "grad_norm": 0.3034374415874481, "learning_rate": 1.831298070222902e-05, "loss": 0.205, "step": 7951 }, { "epoch": 1.609390811576604, "grad_norm": 0.271475225687027, "learning_rate": 1.8294637782825875e-05, "loss": 0.1823, "step": 7952 }, { "epoch": 1.6095931997571342, "grad_norm": 0.3029545247554779, "learning_rate": 1.827630312951395e-05, "loss": 0.1872, "step": 7953 }, { "epoch": 1.6097955879376644, "grad_norm": 0.2891842722892761, "learning_rate": 1.8257976744148153e-05, "loss": 0.1942, "step": 7954 }, { "epoch": 1.6099979761181946, "grad_norm": 0.30320534110069275, "learning_rate": 1.8239658628582567e-05, "loss": 0.2053, "step": 7955 }, { "epoch": 1.6102003642987248, "grad_norm": 0.2838584780693054, "learning_rate": 1.822134878467041e-05, "loss": 0.1848, "step": 7956 }, { "epoch": 1.6104027524792552, "grad_norm": 0.3041428327560425, "learning_rate": 1.8203047214264103e-05, "loss": 0.1858, "step": 7957 }, { "epoch": 1.6106051406597854, "grad_norm": 0.2750832438468933, "learning_rate": 1.818475391921518e-05, "loss": 0.1843, "step": 7958 }, { "epoch": 1.6108075288403159, "grad_norm": 0.3491036593914032, "learning_rate": 1.816646890137439e-05, "loss": 0.1402, "step": 7959 }, { "epoch": 1.611009917020846, "grad_norm": 0.23265735805034637, "learning_rate": 1.8148192162591605e-05, "loss": 0.1547, "step": 7960 }, { "epoch": 1.6112123052013763, "grad_norm": 0.26172006130218506, "learning_rate": 1.8129923704715868e-05, "loss": 0.1739, "step": 7961 }, { "epoch": 1.6114146933819065, "grad_norm": 0.25250962376594543, "learning_rate": 1.81116635295954e-05, "loss": 0.1906, "step": 7962 }, { "epoch": 1.6116170815624367, "grad_norm": 0.3147255480289459, "learning_rate": 1.8093411639077572e-05, "loss": 0.2193, "step": 7963 }, { "epoch": 1.6118194697429669, "grad_norm": 0.2867109775543213, "learning_rate": 1.8075168035008917e-05, "loss": 0.2072, "step": 7964 }, { "epoch": 1.6120218579234973, "grad_norm": 0.28083083033561707, "learning_rate": 
1.805693271923514e-05, "loss": 0.1933, "step": 7965 }, { "epoch": 1.6122242461040275, "grad_norm": 0.28652331233024597, "learning_rate": 1.803870569360109e-05, "loss": 0.1897, "step": 7966 }, { "epoch": 1.612426634284558, "grad_norm": 0.3437660038471222, "learning_rate": 1.8020486959950777e-05, "loss": 0.1855, "step": 7967 }, { "epoch": 1.612629022465088, "grad_norm": 0.35121816396713257, "learning_rate": 1.8002276520127405e-05, "loss": 0.2342, "step": 7968 }, { "epoch": 1.6128314106456183, "grad_norm": 0.28837767243385315, "learning_rate": 1.7984074375973292e-05, "loss": 0.2186, "step": 7969 }, { "epoch": 1.6130337988261485, "grad_norm": 0.3354552984237671, "learning_rate": 1.796588052932996e-05, "loss": 0.2227, "step": 7970 }, { "epoch": 1.6132361870066787, "grad_norm": 0.3056231439113617, "learning_rate": 1.7947694982038054e-05, "loss": 0.214, "step": 7971 }, { "epoch": 1.613438575187209, "grad_norm": 0.2902771830558777, "learning_rate": 1.7929517735937405e-05, "loss": 0.17, "step": 7972 }, { "epoch": 1.6136409633677393, "grad_norm": 0.24810972809791565, "learning_rate": 1.7911348792867e-05, "loss": 0.1818, "step": 7973 }, { "epoch": 1.6138433515482697, "grad_norm": 0.303475022315979, "learning_rate": 1.7893188154664984e-05, "loss": 0.2232, "step": 7974 }, { "epoch": 1.6140457397288, "grad_norm": 0.2750754952430725, "learning_rate": 1.787503582316864e-05, "loss": 0.1794, "step": 7975 }, { "epoch": 1.6142481279093301, "grad_norm": 0.2861912250518799, "learning_rate": 1.785689180021445e-05, "loss": 0.2084, "step": 7976 }, { "epoch": 1.6144505160898603, "grad_norm": 0.3108361065387726, "learning_rate": 1.7838756087638032e-05, "loss": 0.1995, "step": 7977 }, { "epoch": 1.6146529042703905, "grad_norm": 0.24518351256847382, "learning_rate": 1.7820628687274165e-05, "loss": 0.1889, "step": 7978 }, { "epoch": 1.6148552924509207, "grad_norm": 0.2598412036895752, "learning_rate": 1.7802509600956783e-05, "loss": 0.1951, "step": 7979 }, { "epoch": 1.6150576806314512, 
"grad_norm": 0.2555294334888458, "learning_rate": 1.7784398830519e-05, "loss": 0.1794, "step": 7980 }, { "epoch": 1.6152600688119814, "grad_norm": 0.2353818565607071, "learning_rate": 1.7766296377793058e-05, "loss": 0.1348, "step": 7981 }, { "epoch": 1.6154624569925118, "grad_norm": 0.3107486069202423, "learning_rate": 1.774820224461038e-05, "loss": 0.2107, "step": 7982 }, { "epoch": 1.615664845173042, "grad_norm": 0.29552415013313293, "learning_rate": 1.773011643280157e-05, "loss": 0.1962, "step": 7983 }, { "epoch": 1.6158672333535722, "grad_norm": 0.27952340245246887, "learning_rate": 1.7712038944196296e-05, "loss": 0.205, "step": 7984 }, { "epoch": 1.6160696215341024, "grad_norm": 0.3028942346572876, "learning_rate": 1.76939697806235e-05, "loss": 0.1742, "step": 7985 }, { "epoch": 1.6162720097146326, "grad_norm": 0.29906564950942993, "learning_rate": 1.7675908943911202e-05, "loss": 0.1792, "step": 7986 }, { "epoch": 1.6164743978951628, "grad_norm": 0.285399854183197, "learning_rate": 1.7657856435886623e-05, "loss": 0.1989, "step": 7987 }, { "epoch": 1.6166767860756932, "grad_norm": 0.26634126901626587, "learning_rate": 1.763981225837612e-05, "loss": 0.1761, "step": 7988 }, { "epoch": 1.6168791742562234, "grad_norm": 0.3051709532737732, "learning_rate": 1.7621776413205225e-05, "loss": 0.2189, "step": 7989 }, { "epoch": 1.6170815624367538, "grad_norm": 0.29787564277648926, "learning_rate": 1.7603748902198604e-05, "loss": 0.1883, "step": 7990 }, { "epoch": 1.617283950617284, "grad_norm": 0.30989035964012146, "learning_rate": 1.75857297271801e-05, "loss": 0.2233, "step": 7991 }, { "epoch": 1.6174863387978142, "grad_norm": 0.256502240896225, "learning_rate": 1.7567718889972683e-05, "loss": 0.1822, "step": 7992 }, { "epoch": 1.6176887269783444, "grad_norm": 0.26667922735214233, "learning_rate": 1.754971639239853e-05, "loss": 0.177, "step": 7993 }, { "epoch": 1.6178911151588746, "grad_norm": 0.2559124827384949, "learning_rate": 1.7531722236278936e-05, "loss": 0.169, 
"step": 7994 }, { "epoch": 1.6180935033394048, "grad_norm": 0.2702696919441223, "learning_rate": 1.7513736423434345e-05, "loss": 0.1685, "step": 7995 }, { "epoch": 1.6182958915199352, "grad_norm": 0.2676244080066681, "learning_rate": 1.7495758955684392e-05, "loss": 0.1805, "step": 7996 }, { "epoch": 1.6184982797004654, "grad_norm": 0.2880995273590088, "learning_rate": 1.7477789834847837e-05, "loss": 0.1847, "step": 7997 }, { "epoch": 1.6187006678809959, "grad_norm": 0.27174052596092224, "learning_rate": 1.7459829062742605e-05, "loss": 0.182, "step": 7998 }, { "epoch": 1.618903056061526, "grad_norm": 0.2822988033294678, "learning_rate": 1.7441876641185795e-05, "loss": 0.2077, "step": 7999 }, { "epoch": 1.6191054442420563, "grad_norm": 0.30251890420913696, "learning_rate": 1.742393257199363e-05, "loss": 0.197, "step": 8000 }, { "epoch": 1.6191054442420563, "eval_loss": 0.25870126485824585, "eval_runtime": 0.7362, "eval_samples_per_second": 6.791, "eval_steps_per_second": 1.358, "step": 8000 }, { "epoch": 1.6193078324225865, "grad_norm": 0.3131512403488159, "learning_rate": 1.740599685698151e-05, "loss": 0.2221, "step": 8001 }, { "epoch": 1.6195102206031167, "grad_norm": 0.28746527433395386, "learning_rate": 1.7388069497963967e-05, "loss": 0.1911, "step": 8002 }, { "epoch": 1.619712608783647, "grad_norm": 0.30155429244041443, "learning_rate": 1.7370150496754722e-05, "loss": 0.2029, "step": 8003 }, { "epoch": 1.6199149969641773, "grad_norm": 0.2872103452682495, "learning_rate": 1.7352239855166628e-05, "loss": 0.1808, "step": 8004 }, { "epoch": 1.6201173851447077, "grad_norm": 0.32985246181488037, "learning_rate": 1.7334337575011693e-05, "loss": 0.2171, "step": 8005 }, { "epoch": 1.620319773325238, "grad_norm": 0.24914324283599854, "learning_rate": 1.731644365810108e-05, "loss": 0.1695, "step": 8006 }, { "epoch": 1.620522161505768, "grad_norm": 0.3195323646068573, "learning_rate": 1.72985581062451e-05, "loss": 0.1846, "step": 8007 }, { "epoch": 1.6207245496862983, 
"grad_norm": 0.2776571810245514, "learning_rate": 1.7280680921253244e-05, "loss": 0.1696, "step": 8008 }, { "epoch": 1.6209269378668285, "grad_norm": 0.2772268056869507, "learning_rate": 1.7262812104934124e-05, "loss": 0.2042, "step": 8009 }, { "epoch": 1.6211293260473587, "grad_norm": 0.2957817316055298, "learning_rate": 1.724495165909553e-05, "loss": 0.173, "step": 8010 }, { "epoch": 1.6213317142278891, "grad_norm": 0.27109494805336, "learning_rate": 1.7227099585544383e-05, "loss": 0.156, "step": 8011 }, { "epoch": 1.6215341024084193, "grad_norm": 0.2688787281513214, "learning_rate": 1.7209255886086772e-05, "loss": 0.2092, "step": 8012 }, { "epoch": 1.6217364905889498, "grad_norm": 0.2872391641139984, "learning_rate": 1.7191420562527937e-05, "loss": 0.2132, "step": 8013 }, { "epoch": 1.62193887876948, "grad_norm": 0.2846378684043884, "learning_rate": 1.717359361667228e-05, "loss": 0.1788, "step": 8014 }, { "epoch": 1.6221412669500102, "grad_norm": 0.2909705638885498, "learning_rate": 1.715577505032332e-05, "loss": 0.1824, "step": 8015 }, { "epoch": 1.6223436551305404, "grad_norm": 0.3115446865558624, "learning_rate": 1.7137964865283772e-05, "loss": 0.1996, "step": 8016 }, { "epoch": 1.6225460433110706, "grad_norm": 0.26595601439476013, "learning_rate": 1.7120163063355477e-05, "loss": 0.1705, "step": 8017 }, { "epoch": 1.6227484314916008, "grad_norm": 0.29795345664024353, "learning_rate": 1.7102369646339433e-05, "loss": 0.1812, "step": 8018 }, { "epoch": 1.6229508196721312, "grad_norm": 0.30830883979797363, "learning_rate": 1.7084584616035792e-05, "loss": 0.214, "step": 8019 }, { "epoch": 1.6231532078526614, "grad_norm": 0.24998906254768372, "learning_rate": 1.706680797424386e-05, "loss": 0.18, "step": 8020 }, { "epoch": 1.6233555960331918, "grad_norm": 0.25060421228408813, "learning_rate": 1.7049039722762093e-05, "loss": 0.1782, "step": 8021 }, { "epoch": 1.623557984213722, "grad_norm": 0.310921847820282, "learning_rate": 1.7031279863388083e-05, "loss": 0.2213, 
"step": 8022 }, { "epoch": 1.6237603723942522, "grad_norm": 0.2564176321029663, "learning_rate": 1.701352839791861e-05, "loss": 0.1784, "step": 8023 }, { "epoch": 1.6239627605747824, "grad_norm": 0.3001807928085327, "learning_rate": 1.699578532814955e-05, "loss": 0.222, "step": 8024 }, { "epoch": 1.6241651487553126, "grad_norm": 0.34355929493904114, "learning_rate": 1.6978050655875987e-05, "loss": 0.2239, "step": 8025 }, { "epoch": 1.6243675369358428, "grad_norm": 0.27204430103302, "learning_rate": 1.6960324382892123e-05, "loss": 0.1633, "step": 8026 }, { "epoch": 1.6245699251163732, "grad_norm": 0.2537243962287903, "learning_rate": 1.6942606510991334e-05, "loss": 0.1327, "step": 8027 }, { "epoch": 1.6247723132969034, "grad_norm": 0.2479013055562973, "learning_rate": 1.692489704196607e-05, "loss": 0.1683, "step": 8028 }, { "epoch": 1.6249747014774338, "grad_norm": 0.28489288687705994, "learning_rate": 1.6907195977608036e-05, "loss": 0.1985, "step": 8029 }, { "epoch": 1.625177089657964, "grad_norm": 0.25739508867263794, "learning_rate": 1.6889503319708032e-05, "loss": 0.1932, "step": 8030 }, { "epoch": 1.6253794778384942, "grad_norm": 0.2719423472881317, "learning_rate": 1.6871819070056017e-05, "loss": 0.1664, "step": 8031 }, { "epoch": 1.6255818660190244, "grad_norm": 0.2762310802936554, "learning_rate": 1.685414323044109e-05, "loss": 0.1725, "step": 8032 }, { "epoch": 1.6257842541995546, "grad_norm": 0.2826240658760071, "learning_rate": 1.683647580265151e-05, "loss": 0.1978, "step": 8033 }, { "epoch": 1.625986642380085, "grad_norm": 0.2921895384788513, "learning_rate": 1.681881678847468e-05, "loss": 0.162, "step": 8034 }, { "epoch": 1.6261890305606153, "grad_norm": 0.2500416338443756, "learning_rate": 1.680116618969716e-05, "loss": 0.1404, "step": 8035 }, { "epoch": 1.6263914187411457, "grad_norm": 0.3063223659992218, "learning_rate": 1.6783524008104647e-05, "loss": 0.1942, "step": 8036 }, { "epoch": 1.6265938069216759, "grad_norm": 0.30133017897605896, 
"learning_rate": 1.6765890245481997e-05, "loss": 0.1747, "step": 8037 }, { "epoch": 1.626796195102206, "grad_norm": 0.3058010935783386, "learning_rate": 1.6748264903613208e-05, "loss": 0.2044, "step": 8038 }, { "epoch": 1.6269985832827363, "grad_norm": 0.2959883213043213, "learning_rate": 1.6730647984281423e-05, "loss": 0.1969, "step": 8039 }, { "epoch": 1.6272009714632665, "grad_norm": 0.29775944352149963, "learning_rate": 1.6713039489268945e-05, "loss": 0.2063, "step": 8040 }, { "epoch": 1.6274033596437967, "grad_norm": 0.32729557156562805, "learning_rate": 1.6695439420357206e-05, "loss": 0.2266, "step": 8041 }, { "epoch": 1.627605747824327, "grad_norm": 0.2869478464126587, "learning_rate": 1.6677847779326805e-05, "loss": 0.1958, "step": 8042 }, { "epoch": 1.6278081360048573, "grad_norm": 0.2807890772819519, "learning_rate": 1.6660264567957474e-05, "loss": 0.2006, "step": 8043 }, { "epoch": 1.6280105241853877, "grad_norm": 0.278018981218338, "learning_rate": 1.66426897880281e-05, "loss": 0.1989, "step": 8044 }, { "epoch": 1.628212912365918, "grad_norm": 0.27039635181427, "learning_rate": 1.6625123441316716e-05, "loss": 0.1856, "step": 8045 }, { "epoch": 1.6284153005464481, "grad_norm": 0.3077291250228882, "learning_rate": 1.66075655296005e-05, "loss": 0.1884, "step": 8046 }, { "epoch": 1.6286176887269783, "grad_norm": 0.30340608954429626, "learning_rate": 1.6590016054655766e-05, "loss": 0.2076, "step": 8047 }, { "epoch": 1.6288200769075085, "grad_norm": 0.2628271281719208, "learning_rate": 1.6572475018258015e-05, "loss": 0.1698, "step": 8048 }, { "epoch": 1.6290224650880387, "grad_norm": 0.31509193778038025, "learning_rate": 1.6554942422181863e-05, "loss": 0.2022, "step": 8049 }, { "epoch": 1.6292248532685691, "grad_norm": 0.2697297930717468, "learning_rate": 1.6537418268201034e-05, "loss": 0.1686, "step": 8050 }, { "epoch": 1.6292248532685691, "eval_loss": 0.25894036889076233, "eval_runtime": 0.7385, "eval_samples_per_second": 6.77, "eval_steps_per_second": 
1.354, "step": 8050 }, { "epoch": 1.6294272414490993, "grad_norm": 0.2741582691669464, "learning_rate": 1.651990255808845e-05, "loss": 0.1972, "step": 8051 }, { "epoch": 1.6296296296296298, "grad_norm": 0.26430466771125793, "learning_rate": 1.650239529361619e-05, "loss": 0.1881, "step": 8052 }, { "epoch": 1.62983201781016, "grad_norm": 0.26232969760894775, "learning_rate": 1.6484896476555445e-05, "loss": 0.2028, "step": 8053 }, { "epoch": 1.6300344059906902, "grad_norm": 0.3205881118774414, "learning_rate": 1.6467406108676554e-05, "loss": 0.219, "step": 8054 }, { "epoch": 1.6302367941712204, "grad_norm": 0.26163017749786377, "learning_rate": 1.6449924191749024e-05, "loss": 0.1615, "step": 8055 }, { "epoch": 1.6304391823517506, "grad_norm": 0.2618088722229004, "learning_rate": 1.643245072754145e-05, "loss": 0.175, "step": 8056 }, { "epoch": 1.6306415705322808, "grad_norm": 0.28531819581985474, "learning_rate": 1.6414985717821673e-05, "loss": 0.2005, "step": 8057 }, { "epoch": 1.6308439587128112, "grad_norm": 0.27115190029144287, "learning_rate": 1.6397529164356606e-05, "loss": 0.1824, "step": 8058 }, { "epoch": 1.6310463468933414, "grad_norm": 0.28367385268211365, "learning_rate": 1.638008106891229e-05, "loss": 0.1639, "step": 8059 }, { "epoch": 1.6312487350738718, "grad_norm": 0.2776460647583008, "learning_rate": 1.6362641433253968e-05, "loss": 0.1709, "step": 8060 }, { "epoch": 1.631451123254402, "grad_norm": 0.2931157052516937, "learning_rate": 1.634521025914598e-05, "loss": 0.2073, "step": 8061 }, { "epoch": 1.6316535114349322, "grad_norm": 0.32506170868873596, "learning_rate": 1.6327787548351848e-05, "loss": 0.1834, "step": 8062 }, { "epoch": 1.6318558996154624, "grad_norm": 0.31267428398132324, "learning_rate": 1.6310373302634208e-05, "loss": 0.1932, "step": 8063 }, { "epoch": 1.6320582877959926, "grad_norm": 0.30341413617134094, "learning_rate": 1.6292967523754855e-05, "loss": 0.2089, "step": 8064 }, { "epoch": 1.632260675976523, "grad_norm": 
0.2488972693681717, "learning_rate": 1.627557021347471e-05, "loss": 0.1712, "step": 8065 }, { "epoch": 1.6324630641570532, "grad_norm": 0.25763022899627686, "learning_rate": 1.625818137355386e-05, "loss": 0.1739, "step": 8066 }, { "epoch": 1.6326654523375836, "grad_norm": 0.27001476287841797, "learning_rate": 1.6240801005751538e-05, "loss": 0.1675, "step": 8067 }, { "epoch": 1.6328678405181138, "grad_norm": 0.2840779423713684, "learning_rate": 1.6223429111826083e-05, "loss": 0.1873, "step": 8068 }, { "epoch": 1.633070228698644, "grad_norm": 0.3620816469192505, "learning_rate": 1.620606569353502e-05, "loss": 0.2138, "step": 8069 }, { "epoch": 1.6332726168791742, "grad_norm": 0.3016754984855652, "learning_rate": 1.6188710752634985e-05, "loss": 0.1899, "step": 8070 }, { "epoch": 1.6334750050597044, "grad_norm": 0.3155854642391205, "learning_rate": 1.6171364290881808e-05, "loss": 0.2162, "step": 8071 }, { "epoch": 1.6336773932402346, "grad_norm": 0.3163856565952301, "learning_rate": 1.6154026310030358e-05, "loss": 0.2142, "step": 8072 }, { "epoch": 1.633879781420765, "grad_norm": 0.3073074221611023, "learning_rate": 1.6136696811834727e-05, "loss": 0.1627, "step": 8073 }, { "epoch": 1.6340821696012953, "grad_norm": 0.3004322648048401, "learning_rate": 1.6119375798048163e-05, "loss": 0.1736, "step": 8074 }, { "epoch": 1.6342845577818257, "grad_norm": 0.24818210303783417, "learning_rate": 1.6102063270422995e-05, "loss": 0.1756, "step": 8075 }, { "epoch": 1.6344869459623559, "grad_norm": 0.33458349108695984, "learning_rate": 1.6084759230710745e-05, "loss": 0.1882, "step": 8076 }, { "epoch": 1.634689334142886, "grad_norm": 0.24042929708957672, "learning_rate": 1.6067463680662043e-05, "loss": 0.1762, "step": 8077 }, { "epoch": 1.6348917223234163, "grad_norm": 0.275277704000473, "learning_rate": 1.605017662202666e-05, "loss": 0.176, "step": 8078 }, { "epoch": 1.6350941105039465, "grad_norm": 0.2901867628097534, "learning_rate": 1.6032898056553535e-05, "loss": 0.1828, "step": 
8079 }, { "epoch": 1.6352964986844767, "grad_norm": 0.2616790235042572, "learning_rate": 1.6015627985990732e-05, "loss": 0.1824, "step": 8080 }, { "epoch": 1.635498886865007, "grad_norm": 0.2518100440502167, "learning_rate": 1.5998366412085452e-05, "loss": 0.1616, "step": 8081 }, { "epoch": 1.6357012750455373, "grad_norm": 0.30384114384651184, "learning_rate": 1.5981113336584043e-05, "loss": 0.2056, "step": 8082 }, { "epoch": 1.6359036632260677, "grad_norm": 0.2821972072124481, "learning_rate": 1.5963868761231983e-05, "loss": 0.1708, "step": 8083 }, { "epoch": 1.636106051406598, "grad_norm": 0.2594367265701294, "learning_rate": 1.594663268777389e-05, "loss": 0.1987, "step": 8084 }, { "epoch": 1.6363084395871281, "grad_norm": 0.2959327697753906, "learning_rate": 1.5929405117953557e-05, "loss": 0.2098, "step": 8085 }, { "epoch": 1.6365108277676583, "grad_norm": 0.2778414785861969, "learning_rate": 1.5912186053513856e-05, "loss": 0.1993, "step": 8086 }, { "epoch": 1.6367132159481885, "grad_norm": 0.2850133776664734, "learning_rate": 1.589497549619685e-05, "loss": 0.1948, "step": 8087 }, { "epoch": 1.6369156041287187, "grad_norm": 0.26495492458343506, "learning_rate": 1.5877773447743725e-05, "loss": 0.1821, "step": 8088 }, { "epoch": 1.6371179923092491, "grad_norm": 0.2508051097393036, "learning_rate": 1.58605799098948e-05, "loss": 0.1768, "step": 8089 }, { "epoch": 1.6373203804897793, "grad_norm": 0.27083274722099304, "learning_rate": 1.5843394884389528e-05, "loss": 0.1993, "step": 8090 }, { "epoch": 1.6375227686703098, "grad_norm": 0.2604304850101471, "learning_rate": 1.5826218372966517e-05, "loss": 0.1809, "step": 8091 }, { "epoch": 1.63772515685084, "grad_norm": 0.3725582957267761, "learning_rate": 1.580905037736351e-05, "loss": 0.1671, "step": 8092 }, { "epoch": 1.6379275450313702, "grad_norm": 0.2975594401359558, "learning_rate": 1.5791890899317374e-05, "loss": 0.1744, "step": 8093 }, { "epoch": 1.6381299332119004, "grad_norm": 0.2582997679710388, 
"learning_rate": 1.5774739940564165e-05, "loss": 0.1551, "step": 8094 }, { "epoch": 1.6383323213924306, "grad_norm": 0.3007787764072418, "learning_rate": 1.5757597502838973e-05, "loss": 0.2484, "step": 8095 }, { "epoch": 1.638534709572961, "grad_norm": 0.2605661153793335, "learning_rate": 1.574046358787612e-05, "loss": 0.171, "step": 8096 }, { "epoch": 1.6387370977534912, "grad_norm": 0.3007476329803467, "learning_rate": 1.572333819740903e-05, "loss": 0.1868, "step": 8097 }, { "epoch": 1.6389394859340216, "grad_norm": 0.2613348364830017, "learning_rate": 1.570622133317028e-05, "loss": 0.1622, "step": 8098 }, { "epoch": 1.6391418741145518, "grad_norm": 0.277700275182724, "learning_rate": 1.5689112996891576e-05, "loss": 0.184, "step": 8099 }, { "epoch": 1.639344262295082, "grad_norm": 0.25784534215927124, "learning_rate": 1.5672013190303757e-05, "loss": 0.1875, "step": 8100 }, { "epoch": 1.639344262295082, "eval_loss": 0.25882890820503235, "eval_runtime": 0.7369, "eval_samples_per_second": 6.785, "eval_steps_per_second": 1.357, "step": 8100 }, { "epoch": 1.6395466504756122, "grad_norm": 0.2594480514526367, "learning_rate": 1.5654921915136787e-05, "loss": 0.2041, "step": 8101 }, { "epoch": 1.6397490386561424, "grad_norm": 0.2852243185043335, "learning_rate": 1.5637839173119807e-05, "loss": 0.2062, "step": 8102 }, { "epoch": 1.6399514268366726, "grad_norm": 0.25183483958244324, "learning_rate": 1.5620764965981048e-05, "loss": 0.1562, "step": 8103 }, { "epoch": 1.640153815017203, "grad_norm": 0.30132558941841125, "learning_rate": 1.5603699295447916e-05, "loss": 0.2095, "step": 8104 }, { "epoch": 1.6403562031977332, "grad_norm": 0.25491568446159363, "learning_rate": 1.5586642163246934e-05, "loss": 0.1582, "step": 8105 }, { "epoch": 1.6405585913782637, "grad_norm": 0.26761573553085327, "learning_rate": 1.5569593571103747e-05, "loss": 0.1787, "step": 8106 }, { "epoch": 1.6407609795587939, "grad_norm": 0.29488828778266907, "learning_rate": 1.5552553520743163e-05, "loss": 
0.1755, "step": 8107 }, { "epoch": 1.640963367739324, "grad_norm": 0.2793586254119873, "learning_rate": 1.5535522013889125e-05, "loss": 0.18, "step": 8108 }, { "epoch": 1.6411657559198543, "grad_norm": 0.3017469644546509, "learning_rate": 1.551849905226469e-05, "loss": 0.1824, "step": 8109 }, { "epoch": 1.6413681441003845, "grad_norm": 0.30350354313850403, "learning_rate": 1.5501484637592067e-05, "loss": 0.2096, "step": 8110 }, { "epoch": 1.6415705322809147, "grad_norm": 0.3905794024467468, "learning_rate": 1.5484478771592602e-05, "loss": 0.2198, "step": 8111 }, { "epoch": 1.641772920461445, "grad_norm": 0.2384035289287567, "learning_rate": 1.5467481455986755e-05, "loss": 0.1817, "step": 8112 }, { "epoch": 1.6419753086419753, "grad_norm": 0.3253072500228882, "learning_rate": 1.5450492692494146e-05, "loss": 0.1822, "step": 8113 }, { "epoch": 1.6421776968225057, "grad_norm": 0.23735348880290985, "learning_rate": 1.5433512482833523e-05, "loss": 0.1631, "step": 8114 }, { "epoch": 1.642380085003036, "grad_norm": 0.3243256211280823, "learning_rate": 1.5416540828722738e-05, "loss": 0.2103, "step": 8115 }, { "epoch": 1.642582473183566, "grad_norm": 0.3111126720905304, "learning_rate": 1.5399577731878867e-05, "loss": 0.2073, "step": 8116 }, { "epoch": 1.6427848613640963, "grad_norm": 0.24906755983829498, "learning_rate": 1.5382623194017996e-05, "loss": 0.1634, "step": 8117 }, { "epoch": 1.6429872495446265, "grad_norm": 0.24573220312595367, "learning_rate": 1.5365677216855435e-05, "loss": 0.1508, "step": 8118 }, { "epoch": 1.6431896377251567, "grad_norm": 0.34453287720680237, "learning_rate": 1.5348739802105592e-05, "loss": 0.2242, "step": 8119 }, { "epoch": 1.6433920259056871, "grad_norm": 0.2754497528076172, "learning_rate": 1.533181095148203e-05, "loss": 0.2004, "step": 8120 }, { "epoch": 1.6435944140862173, "grad_norm": 0.3048870265483856, "learning_rate": 1.531489066669741e-05, "loss": 0.2032, "step": 8121 }, { "epoch": 1.6437968022667477, "grad_norm": 
0.2695624828338623, "learning_rate": 1.5297978949463566e-05, "loss": 0.1528, "step": 8122 }, { "epoch": 1.643999190447278, "grad_norm": 0.2954610288143158, "learning_rate": 1.5281075801491452e-05, "loss": 0.1739, "step": 8123 }, { "epoch": 1.6442015786278081, "grad_norm": 0.26412299275398254, "learning_rate": 1.5264181224491138e-05, "loss": 0.168, "step": 8124 }, { "epoch": 1.6444039668083383, "grad_norm": 0.27403730154037476, "learning_rate": 1.524729522017183e-05, "loss": 0.1737, "step": 8125 }, { "epoch": 1.6446063549888685, "grad_norm": 0.24554625153541565, "learning_rate": 1.5230417790241913e-05, "loss": 0.1825, "step": 8126 }, { "epoch": 1.644808743169399, "grad_norm": 0.30419591069221497, "learning_rate": 1.5213548936408829e-05, "loss": 0.1677, "step": 8127 }, { "epoch": 1.6450111313499292, "grad_norm": 0.23226524889469147, "learning_rate": 1.519668866037922e-05, "loss": 0.1396, "step": 8128 }, { "epoch": 1.6452135195304596, "grad_norm": 0.27901968359947205, "learning_rate": 1.5179836963858818e-05, "loss": 0.2219, "step": 8129 }, { "epoch": 1.6454159077109898, "grad_norm": 0.2586843967437744, "learning_rate": 1.5162993848552509e-05, "loss": 0.1851, "step": 8130 }, { "epoch": 1.64561829589152, "grad_norm": 0.3563934862613678, "learning_rate": 1.5146159316164299e-05, "loss": 0.1968, "step": 8131 }, { "epoch": 1.6458206840720502, "grad_norm": 0.2968688905239105, "learning_rate": 1.5129333368397314e-05, "loss": 0.1996, "step": 8132 }, { "epoch": 1.6460230722525804, "grad_norm": 0.27744680643081665, "learning_rate": 1.5112516006953858e-05, "loss": 0.1761, "step": 8133 }, { "epoch": 1.6462254604331106, "grad_norm": 0.274127334356308, "learning_rate": 1.5095707233535306e-05, "loss": 0.1802, "step": 8134 }, { "epoch": 1.646427848613641, "grad_norm": 0.295303612947464, "learning_rate": 1.5078907049842217e-05, "loss": 0.2064, "step": 8135 }, { "epoch": 1.6466302367941712, "grad_norm": 0.21443642675876617, "learning_rate": 1.5062115457574232e-05, "loss": 0.165, "step": 
8136 }, { "epoch": 1.6468326249747016, "grad_norm": 0.3120724558830261, "learning_rate": 1.5045332458430173e-05, "loss": 0.1912, "step": 8137 }, { "epoch": 1.6470350131552318, "grad_norm": 0.25102463364601135, "learning_rate": 1.5028558054107977e-05, "loss": 0.1789, "step": 8138 }, { "epoch": 1.647237401335762, "grad_norm": 0.3505784273147583, "learning_rate": 1.5011792246304657e-05, "loss": 0.1845, "step": 8139 }, { "epoch": 1.6474397895162922, "grad_norm": 0.2952693998813629, "learning_rate": 1.499503503671642e-05, "loss": 0.2515, "step": 8140 }, { "epoch": 1.6476421776968224, "grad_norm": 0.2916133999824524, "learning_rate": 1.4978286427038601e-05, "loss": 0.1895, "step": 8141 }, { "epoch": 1.6478445658773526, "grad_norm": 0.30217990279197693, "learning_rate": 1.4961546418965633e-05, "loss": 0.1896, "step": 8142 }, { "epoch": 1.648046954057883, "grad_norm": 0.2839777171611786, "learning_rate": 1.4944815014191104e-05, "loss": 0.1674, "step": 8143 }, { "epoch": 1.6482493422384132, "grad_norm": 0.2538793087005615, "learning_rate": 1.4928092214407719e-05, "loss": 0.1737, "step": 8144 }, { "epoch": 1.6484517304189437, "grad_norm": 0.2852862775325775, "learning_rate": 1.491137802130731e-05, "loss": 0.183, "step": 8145 }, { "epoch": 1.6486541185994739, "grad_norm": 0.28725042939186096, "learning_rate": 1.4894672436580847e-05, "loss": 0.1602, "step": 8146 }, { "epoch": 1.648856506780004, "grad_norm": 0.29489025473594666, "learning_rate": 1.4877975461918436e-05, "loss": 0.1929, "step": 8147 }, { "epoch": 1.6490588949605343, "grad_norm": 0.25739914178848267, "learning_rate": 1.486128709900928e-05, "loss": 0.1816, "step": 8148 }, { "epoch": 1.6492612831410645, "grad_norm": 0.2747301757335663, "learning_rate": 1.484460734954175e-05, "loss": 0.1734, "step": 8149 }, { "epoch": 1.6494636713215947, "grad_norm": 0.3500189483165741, "learning_rate": 1.482793621520333e-05, "loss": 0.1936, "step": 8150 }, { "epoch": 1.6494636713215947, "eval_loss": 0.26037317514419556, 
"eval_runtime": 0.7394, "eval_samples_per_second": 6.762, "eval_steps_per_second": 1.352, "step": 8150 }, { "epoch": 1.649666059502125, "grad_norm": 0.2614065110683441, "learning_rate": 1.4811273697680616e-05, "loss": 0.1686, "step": 8151 }, { "epoch": 1.6498684476826553, "grad_norm": 0.31688347458839417, "learning_rate": 1.4794619798659359e-05, "loss": 0.1951, "step": 8152 }, { "epoch": 1.6500708358631857, "grad_norm": 0.29736804962158203, "learning_rate": 1.4777974519824411e-05, "loss": 0.1883, "step": 8153 }, { "epoch": 1.650273224043716, "grad_norm": 0.2695557177066803, "learning_rate": 1.4761337862859782e-05, "loss": 0.1575, "step": 8154 }, { "epoch": 1.650475612224246, "grad_norm": 0.27493923902511597, "learning_rate": 1.4744709829448588e-05, "loss": 0.1606, "step": 8155 }, { "epoch": 1.6506780004047763, "grad_norm": 0.2712497115135193, "learning_rate": 1.4728090421273088e-05, "loss": 0.1817, "step": 8156 }, { "epoch": 1.6508803885853065, "grad_norm": 0.3294946551322937, "learning_rate": 1.4711479640014646e-05, "loss": 0.1953, "step": 8157 }, { "epoch": 1.651082776765837, "grad_norm": 0.32231223583221436, "learning_rate": 1.4694877487353765e-05, "loss": 0.2271, "step": 8158 }, { "epoch": 1.6512851649463671, "grad_norm": 0.24417562782764435, "learning_rate": 1.4678283964970096e-05, "loss": 0.1673, "step": 8159 }, { "epoch": 1.6514875531268975, "grad_norm": 0.3897631764411926, "learning_rate": 1.4661699074542378e-05, "loss": 0.2331, "step": 8160 }, { "epoch": 1.6516899413074277, "grad_norm": 0.25601083040237427, "learning_rate": 1.4645122817748503e-05, "loss": 0.1779, "step": 8161 }, { "epoch": 1.651892329487958, "grad_norm": 0.2515465021133423, "learning_rate": 1.4628555196265482e-05, "loss": 0.1894, "step": 8162 }, { "epoch": 1.6520947176684881, "grad_norm": 0.2911391258239746, "learning_rate": 1.4611996211769452e-05, "loss": 0.187, "step": 8163 }, { "epoch": 1.6522971058490183, "grad_norm": 0.2927611768245697, "learning_rate": 1.4595445865935676e-05, "loss": 
0.213, "step": 8164 }, { "epoch": 1.6524994940295485, "grad_norm": 0.24805690348148346, "learning_rate": 1.4578904160438555e-05, "loss": 0.1776, "step": 8165 }, { "epoch": 1.652701882210079, "grad_norm": 0.32265952229499817, "learning_rate": 1.45623710969516e-05, "loss": 0.2215, "step": 8166 }, { "epoch": 1.6529042703906092, "grad_norm": 0.30108514428138733, "learning_rate": 1.4545846677147445e-05, "loss": 0.1853, "step": 8167 }, { "epoch": 1.6531066585711396, "grad_norm": 0.26640450954437256, "learning_rate": 1.4529330902697857e-05, "loss": 0.1889, "step": 8168 }, { "epoch": 1.6533090467516698, "grad_norm": 0.2683650553226471, "learning_rate": 1.4512823775273743e-05, "loss": 0.1558, "step": 8169 }, { "epoch": 1.6535114349322, "grad_norm": 0.2825409173965454, "learning_rate": 1.449632529654512e-05, "loss": 0.1778, "step": 8170 }, { "epoch": 1.6537138231127302, "grad_norm": 0.26279154419898987, "learning_rate": 1.4479835468181113e-05, "loss": 0.1961, "step": 8171 }, { "epoch": 1.6539162112932604, "grad_norm": 0.3498830199241638, "learning_rate": 1.4463354291850007e-05, "loss": 0.1924, "step": 8172 }, { "epoch": 1.6541185994737906, "grad_norm": 0.3345295190811157, "learning_rate": 1.4446881769219178e-05, "loss": 0.193, "step": 8173 }, { "epoch": 1.654320987654321, "grad_norm": 0.27633896470069885, "learning_rate": 1.4430417901955163e-05, "loss": 0.1685, "step": 8174 }, { "epoch": 1.6545233758348512, "grad_norm": 0.2868563234806061, "learning_rate": 1.4413962691723582e-05, "loss": 0.173, "step": 8175 }, { "epoch": 1.6547257640153816, "grad_norm": 0.2706867754459381, "learning_rate": 1.4397516140189216e-05, "loss": 0.2111, "step": 8176 }, { "epoch": 1.6549281521959118, "grad_norm": 0.25419893860816956, "learning_rate": 1.4381078249015955e-05, "loss": 0.1958, "step": 8177 }, { "epoch": 1.655130540376442, "grad_norm": 0.3127739429473877, "learning_rate": 1.4364649019866804e-05, "loss": 0.2044, "step": 8178 }, { "epoch": 1.6553329285569722, "grad_norm": 
0.25201788544654846, "learning_rate": 1.4348228454403912e-05, "loss": 0.1591, "step": 8179 }, { "epoch": 1.6555353167375024, "grad_norm": 0.277378648519516, "learning_rate": 1.4331816554288524e-05, "loss": 0.2106, "step": 8180 }, { "epoch": 1.6557377049180326, "grad_norm": 0.26922157406806946, "learning_rate": 1.4315413321181027e-05, "loss": 0.1854, "step": 8181 }, { "epoch": 1.655940093098563, "grad_norm": 0.2691102623939514, "learning_rate": 1.4299018756740933e-05, "loss": 0.1687, "step": 8182 }, { "epoch": 1.6561424812790932, "grad_norm": 0.28088414669036865, "learning_rate": 1.4282632862626899e-05, "loss": 0.1704, "step": 8183 }, { "epoch": 1.6563448694596237, "grad_norm": 0.2544093430042267, "learning_rate": 1.4266255640496629e-05, "loss": 0.1811, "step": 8184 }, { "epoch": 1.6565472576401539, "grad_norm": 0.2784738838672638, "learning_rate": 1.424988709200702e-05, "loss": 0.1995, "step": 8185 }, { "epoch": 1.656749645820684, "grad_norm": 0.31763532757759094, "learning_rate": 1.4233527218814058e-05, "loss": 0.199, "step": 8186 }, { "epoch": 1.6569520340012143, "grad_norm": 0.29642024636268616, "learning_rate": 1.4217176022572887e-05, "loss": 0.153, "step": 8187 }, { "epoch": 1.6571544221817445, "grad_norm": 0.3127974569797516, "learning_rate": 1.4200833504937727e-05, "loss": 0.2044, "step": 8188 }, { "epoch": 1.657356810362275, "grad_norm": 0.2773086428642273, "learning_rate": 1.4184499667561956e-05, "loss": 0.1911, "step": 8189 }, { "epoch": 1.657559198542805, "grad_norm": 0.3058187663555145, "learning_rate": 1.4168174512098064e-05, "loss": 0.216, "step": 8190 }, { "epoch": 1.6577615867233355, "grad_norm": 0.26577994227409363, "learning_rate": 1.415185804019764e-05, "loss": 0.1779, "step": 8191 }, { "epoch": 1.6579639749038657, "grad_norm": 0.32752305269241333, "learning_rate": 1.4135550253511432e-05, "loss": 0.2328, "step": 8192 }, { "epoch": 1.658166363084396, "grad_norm": 0.3030085265636444, "learning_rate": 1.4119251153689283e-05, "loss": 0.1938, "step": 
8193 }, { "epoch": 1.658368751264926, "grad_norm": 0.3024362027645111, "learning_rate": 1.4102960742380167e-05, "loss": 0.2202, "step": 8194 }, { "epoch": 1.6585711394454563, "grad_norm": 0.29923704266548157, "learning_rate": 1.4086679021232174e-05, "loss": 0.2046, "step": 8195 }, { "epoch": 1.6587735276259865, "grad_norm": 0.3013087213039398, "learning_rate": 1.407040599189252e-05, "loss": 0.194, "step": 8196 }, { "epoch": 1.658975915806517, "grad_norm": 0.30144721269607544, "learning_rate": 1.4054141656007536e-05, "loss": 0.2071, "step": 8197 }, { "epoch": 1.6591783039870471, "grad_norm": 0.2825508713722229, "learning_rate": 1.403788601522268e-05, "loss": 0.1909, "step": 8198 }, { "epoch": 1.6593806921675776, "grad_norm": 0.2649502456188202, "learning_rate": 1.4021639071182534e-05, "loss": 0.1847, "step": 8199 }, { "epoch": 1.6595830803481078, "grad_norm": 0.26707082986831665, "learning_rate": 1.4005400825530778e-05, "loss": 0.1752, "step": 8200 }, { "epoch": 1.6595830803481078, "eval_loss": 0.26061928272247314, "eval_runtime": 0.7392, "eval_samples_per_second": 6.764, "eval_steps_per_second": 1.353, "step": 8200 }, { "epoch": 1.659785468528638, "grad_norm": 0.30190932750701904, "learning_rate": 1.398917127991024e-05, "loss": 0.2433, "step": 8201 }, { "epoch": 1.6599878567091682, "grad_norm": 0.2780781090259552, "learning_rate": 1.397295043596285e-05, "loss": 0.1669, "step": 8202 }, { "epoch": 1.6601902448896984, "grad_norm": 0.2770734131336212, "learning_rate": 1.3956738295329664e-05, "loss": 0.2001, "step": 8203 }, { "epoch": 1.6603926330702286, "grad_norm": 0.2806360125541687, "learning_rate": 1.3940534859650844e-05, "loss": 0.1948, "step": 8204 }, { "epoch": 1.660595021250759, "grad_norm": 0.25580450892448425, "learning_rate": 1.3924340130565727e-05, "loss": 0.1718, "step": 8205 }, { "epoch": 1.6607974094312892, "grad_norm": 0.3279639482498169, "learning_rate": 1.3908154109712679e-05, "loss": 0.1913, "step": 8206 }, { "epoch": 1.6609997976118196, "grad_norm": 
0.3011409640312195, "learning_rate": 1.3891976798729234e-05, "loss": 0.1917, "step": 8207 }, { "epoch": 1.6612021857923498, "grad_norm": 0.24959488213062286, "learning_rate": 1.387580819925206e-05, "loss": 0.1595, "step": 8208 }, { "epoch": 1.66140457397288, "grad_norm": 0.2853519916534424, "learning_rate": 1.3859648312916907e-05, "loss": 0.1986, "step": 8209 }, { "epoch": 1.6616069621534102, "grad_norm": 0.2913047969341278, "learning_rate": 1.3843497141358685e-05, "loss": 0.2041, "step": 8210 }, { "epoch": 1.6618093503339404, "grad_norm": 0.25920918583869934, "learning_rate": 1.3827354686211403e-05, "loss": 0.1602, "step": 8211 }, { "epoch": 1.6620117385144706, "grad_norm": 0.28853464126586914, "learning_rate": 1.3811220949108172e-05, "loss": 0.183, "step": 8212 }, { "epoch": 1.662214126695001, "grad_norm": 0.2855565845966339, "learning_rate": 1.3795095931681235e-05, "loss": 0.1666, "step": 8213 }, { "epoch": 1.6624165148755312, "grad_norm": 0.29030704498291016, "learning_rate": 1.3778979635561962e-05, "loss": 0.211, "step": 8214 }, { "epoch": 1.6626189030560616, "grad_norm": 0.23719587922096252, "learning_rate": 1.3762872062380805e-05, "loss": 0.1485, "step": 8215 }, { "epoch": 1.6628212912365918, "grad_norm": 0.26473090052604675, "learning_rate": 1.3746773213767394e-05, "loss": 0.2039, "step": 8216 }, { "epoch": 1.663023679417122, "grad_norm": 0.26345399022102356, "learning_rate": 1.3730683091350415e-05, "loss": 0.1863, "step": 8217 }, { "epoch": 1.6632260675976522, "grad_norm": 0.28619056940078735, "learning_rate": 1.3714601696757712e-05, "loss": 0.1959, "step": 8218 }, { "epoch": 1.6634284557781824, "grad_norm": 0.2565390467643738, "learning_rate": 1.3698529031616225e-05, "loss": 0.1823, "step": 8219 }, { "epoch": 1.6636308439587129, "grad_norm": 0.2777693569660187, "learning_rate": 1.3682465097552021e-05, "loss": 0.2056, "step": 8220 }, { "epoch": 1.663833232139243, "grad_norm": 0.25533127784729004, "learning_rate": 1.366640989619028e-05, "loss": 0.1662, 
"step": 8221 }, { "epoch": 1.6640356203197735, "grad_norm": 0.32117244601249695, "learning_rate": 1.3650363429155288e-05, "loss": 0.2295, "step": 8222 }, { "epoch": 1.6642380085003037, "grad_norm": 0.2592966556549072, "learning_rate": 1.3634325698070471e-05, "loss": 0.1592, "step": 8223 }, { "epoch": 1.6644403966808339, "grad_norm": 0.3049880862236023, "learning_rate": 1.3618296704558364e-05, "loss": 0.2058, "step": 8224 }, { "epoch": 1.664642784861364, "grad_norm": 0.28951483964920044, "learning_rate": 1.3602276450240603e-05, "loss": 0.1971, "step": 8225 }, { "epoch": 1.6648451730418943, "grad_norm": 0.31505659222602844, "learning_rate": 1.3586264936737936e-05, "loss": 0.2161, "step": 8226 }, { "epoch": 1.6650475612224245, "grad_norm": 0.28962957859039307, "learning_rate": 1.3570262165670289e-05, "loss": 0.2002, "step": 8227 }, { "epoch": 1.665249949402955, "grad_norm": 0.314058780670166, "learning_rate": 1.3554268138656589e-05, "loss": 0.2151, "step": 8228 }, { "epoch": 1.665452337583485, "grad_norm": 0.28717002272605896, "learning_rate": 1.3538282857314988e-05, "loss": 0.1955, "step": 8229 }, { "epoch": 1.6656547257640155, "grad_norm": 0.24509413540363312, "learning_rate": 1.352230632326268e-05, "loss": 0.192, "step": 8230 }, { "epoch": 1.6658571139445457, "grad_norm": 0.2782445251941681, "learning_rate": 1.3506338538116025e-05, "loss": 0.1708, "step": 8231 }, { "epoch": 1.666059502125076, "grad_norm": 0.30397579073905945, "learning_rate": 1.349037950349047e-05, "loss": 0.1961, "step": 8232 }, { "epoch": 1.6662618903056061, "grad_norm": 0.2743746340274811, "learning_rate": 1.3474429221000573e-05, "loss": 0.1948, "step": 8233 }, { "epoch": 1.6664642784861363, "grad_norm": 0.3238055109977722, "learning_rate": 1.3458487692260036e-05, "loss": 0.1831, "step": 8234 }, { "epoch": 1.6666666666666665, "grad_norm": 0.268256276845932, "learning_rate": 1.3442554918881634e-05, "loss": 0.1774, "step": 8235 }, { "epoch": 1.666869054847197, "grad_norm": 0.3051510155200958, 
"learning_rate": 1.34266309024773e-05, "loss": 0.2038, "step": 8236 }, { "epoch": 1.6670714430277271, "grad_norm": 0.2659623920917511, "learning_rate": 1.3410715644658034e-05, "loss": 0.1772, "step": 8237 }, { "epoch": 1.6672738312082576, "grad_norm": 0.2695719003677368, "learning_rate": 1.3394809147033993e-05, "loss": 0.1511, "step": 8238 }, { "epoch": 1.6674762193887878, "grad_norm": 0.2614977955818176, "learning_rate": 1.3378911411214435e-05, "loss": 0.1703, "step": 8239 }, { "epoch": 1.667678607569318, "grad_norm": 0.2625097334384918, "learning_rate": 1.3363022438807704e-05, "loss": 0.1536, "step": 8240 }, { "epoch": 1.6678809957498482, "grad_norm": 0.29641667008399963, "learning_rate": 1.3347142231421295e-05, "loss": 0.2007, "step": 8241 }, { "epoch": 1.6680833839303784, "grad_norm": 0.2981378436088562, "learning_rate": 1.3331270790661799e-05, "loss": 0.1894, "step": 8242 }, { "epoch": 1.6682857721109086, "grad_norm": 0.30381447076797485, "learning_rate": 1.3315408118134909e-05, "loss": 0.1968, "step": 8243 }, { "epoch": 1.668488160291439, "grad_norm": 0.2666470408439636, "learning_rate": 1.3299554215445464e-05, "loss": 0.1746, "step": 8244 }, { "epoch": 1.6686905484719692, "grad_norm": 0.32338204979896545, "learning_rate": 1.3283709084197381e-05, "loss": 0.2023, "step": 8245 }, { "epoch": 1.6688929366524996, "grad_norm": 0.31472939252853394, "learning_rate": 1.3267872725993713e-05, "loss": 0.2048, "step": 8246 }, { "epoch": 1.6690953248330298, "grad_norm": 0.3297794461250305, "learning_rate": 1.325204514243662e-05, "loss": 0.2346, "step": 8247 }, { "epoch": 1.66929771301356, "grad_norm": 0.3132227957248688, "learning_rate": 1.3236226335127356e-05, "loss": 0.2204, "step": 8248 }, { "epoch": 1.6695001011940902, "grad_norm": 0.40292102098464966, "learning_rate": 1.3220416305666328e-05, "loss": 0.1882, "step": 8249 }, { "epoch": 1.6697024893746204, "grad_norm": 0.2750104069709778, "learning_rate": 1.3204615055652992e-05, "loss": 0.1921, "step": 8250 }, { "epoch": 
1.6697024893746204, "eval_loss": 0.2600804567337036, "eval_runtime": 0.7413, "eval_samples_per_second": 6.745, "eval_steps_per_second": 1.349, "step": 8250 }, { "epoch": 1.6699048775551508, "grad_norm": 0.31695792078971863, "learning_rate": 1.3188822586685966e-05, "loss": 0.1959, "step": 8251 }, { "epoch": 1.670107265735681, "grad_norm": 0.24991224706172943, "learning_rate": 1.3173038900362976e-05, "loss": 0.1433, "step": 8252 }, { "epoch": 1.6703096539162114, "grad_norm": 0.29578897356987, "learning_rate": 1.3157263998280845e-05, "loss": 0.207, "step": 8253 }, { "epoch": 1.6705120420967416, "grad_norm": 0.2668931782245636, "learning_rate": 1.3141497882035514e-05, "loss": 0.1855, "step": 8254 }, { "epoch": 1.6707144302772718, "grad_norm": 0.2784649729728699, "learning_rate": 1.3125740553222032e-05, "loss": 0.1727, "step": 8255 }, { "epoch": 1.670916818457802, "grad_norm": 0.5614887475967407, "learning_rate": 1.3109992013434557e-05, "loss": 0.2148, "step": 8256 }, { "epoch": 1.6711192066383322, "grad_norm": 0.2848651111125946, "learning_rate": 1.309425226426636e-05, "loss": 0.1892, "step": 8257 }, { "epoch": 1.6713215948188624, "grad_norm": 0.3137986958026886, "learning_rate": 1.3078521307309832e-05, "loss": 0.1973, "step": 8258 }, { "epoch": 1.6715239829993929, "grad_norm": 0.25669851899147034, "learning_rate": 1.3062799144156468e-05, "loss": 0.2005, "step": 8259 }, { "epoch": 1.671726371179923, "grad_norm": 0.2816522717475891, "learning_rate": 1.304708577639685e-05, "loss": 0.1853, "step": 8260 }, { "epoch": 1.6719287593604535, "grad_norm": 0.270802766084671, "learning_rate": 1.3031381205620719e-05, "loss": 0.1676, "step": 8261 }, { "epoch": 1.6721311475409837, "grad_norm": 0.2849435806274414, "learning_rate": 1.301568543341688e-05, "loss": 0.1769, "step": 8262 }, { "epoch": 1.6723335357215139, "grad_norm": 0.2575063705444336, "learning_rate": 1.2999998461373275e-05, "loss": 0.1738, "step": 8263 }, { "epoch": 1.672535923902044, "grad_norm": 0.2596307694911957, 
"learning_rate": 1.2984320291076947e-05, "loss": 0.1735, "step": 8264 }, { "epoch": 1.6727383120825743, "grad_norm": 0.29953983426094055, "learning_rate": 1.2968650924114045e-05, "loss": 0.1901, "step": 8265 }, { "epoch": 1.6729407002631045, "grad_norm": 0.29430925846099854, "learning_rate": 1.2952990362069828e-05, "loss": 0.1973, "step": 8266 }, { "epoch": 1.673143088443635, "grad_norm": 0.24846704304218292, "learning_rate": 1.2937338606528648e-05, "loss": 0.1797, "step": 8267 }, { "epoch": 1.673345476624165, "grad_norm": 0.30034515261650085, "learning_rate": 1.292169565907404e-05, "loss": 0.1986, "step": 8268 }, { "epoch": 1.6735478648046955, "grad_norm": 0.29777172207832336, "learning_rate": 1.290606152128856e-05, "loss": 0.209, "step": 8269 }, { "epoch": 1.6737502529852257, "grad_norm": 0.3124600052833557, "learning_rate": 1.289043619475392e-05, "loss": 0.1897, "step": 8270 }, { "epoch": 1.673952641165756, "grad_norm": 0.3199320435523987, "learning_rate": 1.2874819681050898e-05, "loss": 0.1963, "step": 8271 }, { "epoch": 1.6741550293462861, "grad_norm": 0.24260728061199188, "learning_rate": 1.2859211981759455e-05, "loss": 0.1804, "step": 8272 }, { "epoch": 1.6743574175268163, "grad_norm": 0.23964375257492065, "learning_rate": 1.2843613098458562e-05, "loss": 0.1645, "step": 8273 }, { "epoch": 1.6745598057073465, "grad_norm": 0.27568867802619934, "learning_rate": 1.2828023032726378e-05, "loss": 0.1916, "step": 8274 }, { "epoch": 1.674762193887877, "grad_norm": 0.2523253858089447, "learning_rate": 1.2812441786140138e-05, "loss": 0.1683, "step": 8275 }, { "epoch": 1.6749645820684074, "grad_norm": 0.27498844265937805, "learning_rate": 1.2796869360276187e-05, "loss": 0.1946, "step": 8276 }, { "epoch": 1.6751669702489376, "grad_norm": 0.30603161454200745, "learning_rate": 1.2781305756709993e-05, "loss": 0.2068, "step": 8277 }, { "epoch": 1.6753693584294678, "grad_norm": 0.2598779797554016, "learning_rate": 1.276575097701609e-05, "loss": 0.1516, "step": 8278 }, { 
"epoch": 1.675571746609998, "grad_norm": 0.33063915371894836, "learning_rate": 1.275020502276818e-05, "loss": 0.2212, "step": 8279 }, { "epoch": 1.6757741347905282, "grad_norm": 0.2730655372142792, "learning_rate": 1.2734667895539009e-05, "loss": 0.2018, "step": 8280 }, { "epoch": 1.6759765229710584, "grad_norm": 0.26023030281066895, "learning_rate": 1.2719139596900487e-05, "loss": 0.1936, "step": 8281 }, { "epoch": 1.6761789111515888, "grad_norm": 0.2647170424461365, "learning_rate": 1.2703620128423588e-05, "loss": 0.1918, "step": 8282 }, { "epoch": 1.676381299332119, "grad_norm": 0.2505474090576172, "learning_rate": 1.2688109491678412e-05, "loss": 0.178, "step": 8283 }, { "epoch": 1.6765836875126494, "grad_norm": 0.2744555175304413, "learning_rate": 1.2672607688234172e-05, "loss": 0.1996, "step": 8284 }, { "epoch": 1.6767860756931796, "grad_norm": 0.28227829933166504, "learning_rate": 1.265711471965917e-05, "loss": 0.1828, "step": 8285 }, { "epoch": 1.6769884638737098, "grad_norm": 0.36905500292778015, "learning_rate": 1.2641630587520814e-05, "loss": 0.2228, "step": 8286 }, { "epoch": 1.67719085205424, "grad_norm": 0.259802907705307, "learning_rate": 1.2626155293385633e-05, "loss": 0.1658, "step": 8287 }, { "epoch": 1.6773932402347702, "grad_norm": 0.2774128317832947, "learning_rate": 1.2610688838819262e-05, "loss": 0.1956, "step": 8288 }, { "epoch": 1.6775956284153004, "grad_norm": 0.2562331557273865, "learning_rate": 1.2595231225386429e-05, "loss": 0.1705, "step": 8289 }, { "epoch": 1.6777980165958308, "grad_norm": 0.29600250720977783, "learning_rate": 1.2579782454650967e-05, "loss": 0.1926, "step": 8290 }, { "epoch": 1.678000404776361, "grad_norm": 0.25907132029533386, "learning_rate": 1.2564342528175832e-05, "loss": 0.1906, "step": 8291 }, { "epoch": 1.6782027929568915, "grad_norm": 0.27086886763572693, "learning_rate": 1.254891144752307e-05, "loss": 0.1804, "step": 8292 }, { "epoch": 1.6784051811374217, "grad_norm": 0.28162238001823425, "learning_rate": 
1.2533489214253836e-05, "loss": 0.1815, "step": 8293 }, { "epoch": 1.6786075693179519, "grad_norm": 0.3076886236667633, "learning_rate": 1.2518075829928399e-05, "loss": 0.2031, "step": 8294 }, { "epoch": 1.678809957498482, "grad_norm": 0.2625339925289154, "learning_rate": 1.2502671296106095e-05, "loss": 0.1572, "step": 8295 }, { "epoch": 1.6790123456790123, "grad_norm": 0.31220224499702454, "learning_rate": 1.2487275614345405e-05, "loss": 0.1773, "step": 8296 }, { "epoch": 1.6792147338595425, "grad_norm": 0.2704111337661743, "learning_rate": 1.2471888786203922e-05, "loss": 0.1797, "step": 8297 }, { "epoch": 1.6794171220400729, "grad_norm": 0.23155251145362854, "learning_rate": 1.2456510813238299e-05, "loss": 0.1614, "step": 8298 }, { "epoch": 1.679619510220603, "grad_norm": 0.2626052796840668, "learning_rate": 1.2441141697004333e-05, "loss": 0.1935, "step": 8299 }, { "epoch": 1.6798218984011335, "grad_norm": 0.2648791968822479, "learning_rate": 1.2425781439056894e-05, "loss": 0.1787, "step": 8300 }, { "epoch": 1.6798218984011335, "eval_loss": 0.2585418224334717, "eval_runtime": 0.7402, "eval_samples_per_second": 6.755, "eval_steps_per_second": 1.351, "step": 8300 }, { "epoch": 1.6800242865816637, "grad_norm": 0.2991698384284973, "learning_rate": 1.2410430040949994e-05, "loss": 0.1971, "step": 8301 }, { "epoch": 1.680226674762194, "grad_norm": 0.2974299490451813, "learning_rate": 1.2395087504236713e-05, "loss": 0.2, "step": 8302 }, { "epoch": 1.680429062942724, "grad_norm": 0.2599434554576874, "learning_rate": 1.2379753830469255e-05, "loss": 0.19, "step": 8303 }, { "epoch": 1.6806314511232543, "grad_norm": 0.25205034017562866, "learning_rate": 1.236442902119891e-05, "loss": 0.1584, "step": 8304 }, { "epoch": 1.6808338393037845, "grad_norm": 0.28435245156288147, "learning_rate": 1.2349113077976094e-05, "loss": 0.166, "step": 8305 }, { "epoch": 1.681036227484315, "grad_norm": 0.30040034651756287, "learning_rate": 1.2333806002350301e-05, "loss": 0.2013, "step": 8306 }, 
{ "epoch": 1.6812386156648453, "grad_norm": 0.2764068841934204, "learning_rate": 1.2318507795870138e-05, "loss": 0.2164, "step": 8307 }, { "epoch": 1.6814410038453755, "grad_norm": 0.2434784173965454, "learning_rate": 1.2303218460083332e-05, "loss": 0.1585, "step": 8308 }, { "epoch": 1.6816433920259057, "grad_norm": 0.3237009048461914, "learning_rate": 1.2287937996536691e-05, "loss": 0.2349, "step": 8309 }, { "epoch": 1.681845780206436, "grad_norm": 0.2595462203025818, "learning_rate": 1.2272666406776135e-05, "loss": 0.1746, "step": 8310 }, { "epoch": 1.6820481683869661, "grad_norm": 0.3123394846916199, "learning_rate": 1.2257403692346681e-05, "loss": 0.1873, "step": 8311 }, { "epoch": 1.6822505565674963, "grad_norm": 0.26531022787094116, "learning_rate": 1.224214985479244e-05, "loss": 0.1724, "step": 8312 }, { "epoch": 1.6824529447480268, "grad_norm": 0.2745008170604706, "learning_rate": 1.2226904895656644e-05, "loss": 0.1652, "step": 8313 }, { "epoch": 1.682655332928557, "grad_norm": 0.29717713594436646, "learning_rate": 1.2211668816481625e-05, "loss": 0.2099, "step": 8314 }, { "epoch": 1.6828577211090874, "grad_norm": 0.3275054097175598, "learning_rate": 1.2196441618808796e-05, "loss": 0.2069, "step": 8315 }, { "epoch": 1.6830601092896176, "grad_norm": 0.3260860741138458, "learning_rate": 1.2181223304178702e-05, "loss": 0.1786, "step": 8316 }, { "epoch": 1.6832624974701478, "grad_norm": 0.615013599395752, "learning_rate": 1.2166013874130955e-05, "loss": 0.1554, "step": 8317 }, { "epoch": 1.683464885650678, "grad_norm": 0.25483933091163635, "learning_rate": 1.2150813330204291e-05, "loss": 0.1671, "step": 8318 }, { "epoch": 1.6836672738312082, "grad_norm": 0.25844165682792664, "learning_rate": 1.2135621673936549e-05, "loss": 0.171, "step": 8319 }, { "epoch": 1.6838696620117384, "grad_norm": 0.2545812726020813, "learning_rate": 1.212043890686465e-05, "loss": 0.1661, "step": 8320 }, { "epoch": 1.6840720501922688, "grad_norm": 0.3012816309928894, "learning_rate": 
1.2105265030524626e-05, "loss": 0.2202, "step": 8321 }, { "epoch": 1.684274438372799, "grad_norm": 0.274930864572525, "learning_rate": 1.2090100046451635e-05, "loss": 0.1861, "step": 8322 }, { "epoch": 1.6844768265533294, "grad_norm": 0.27070969343185425, "learning_rate": 1.2074943956179884e-05, "loss": 0.1767, "step": 8323 }, { "epoch": 1.6846792147338596, "grad_norm": 0.35230904817581177, "learning_rate": 1.2059796761242714e-05, "loss": 0.2147, "step": 8324 }, { "epoch": 1.6848816029143898, "grad_norm": 0.2666962742805481, "learning_rate": 1.2044658463172564e-05, "loss": 0.1985, "step": 8325 }, { "epoch": 1.68508399109492, "grad_norm": 0.262483149766922, "learning_rate": 1.2029529063500966e-05, "loss": 0.1689, "step": 8326 }, { "epoch": 1.6852863792754502, "grad_norm": 0.31115856766700745, "learning_rate": 1.201440856375855e-05, "loss": 0.199, "step": 8327 }, { "epoch": 1.6854887674559804, "grad_norm": 0.2889840304851532, "learning_rate": 1.199929696547506e-05, "loss": 0.2209, "step": 8328 }, { "epoch": 1.6856911556365108, "grad_norm": 0.2528282403945923, "learning_rate": 1.1984194270179317e-05, "loss": 0.1417, "step": 8329 }, { "epoch": 1.685893543817041, "grad_norm": 0.28934213519096375, "learning_rate": 1.1969100479399254e-05, "loss": 0.1745, "step": 8330 }, { "epoch": 1.6860959319975715, "grad_norm": 0.2569589614868164, "learning_rate": 1.1954015594661915e-05, "loss": 0.1518, "step": 8331 }, { "epoch": 1.6862983201781017, "grad_norm": 0.2754018008708954, "learning_rate": 1.1938939617493427e-05, "loss": 0.1938, "step": 8332 }, { "epoch": 1.6865007083586319, "grad_norm": 0.23252278566360474, "learning_rate": 1.1923872549419002e-05, "loss": 0.1469, "step": 8333 }, { "epoch": 1.686703096539162, "grad_norm": 0.3326863944530487, "learning_rate": 1.1908814391962985e-05, "loss": 0.23, "step": 8334 }, { "epoch": 1.6869054847196923, "grad_norm": 0.26767897605895996, "learning_rate": 1.1893765146648795e-05, "loss": 0.1816, "step": 8335 }, { "epoch": 1.6871078729002225, 
"grad_norm": 0.2568624019622803, "learning_rate": 1.1878724814998965e-05, "loss": 0.1793, "step": 8336 }, { "epoch": 1.6873102610807529, "grad_norm": 0.2331872135400772, "learning_rate": 1.1863693398535114e-05, "loss": 0.1522, "step": 8337 }, { "epoch": 1.6875126492612833, "grad_norm": 0.26635870337486267, "learning_rate": 1.184867089877797e-05, "loss": 0.157, "step": 8338 }, { "epoch": 1.6877150374418135, "grad_norm": 0.2556881606578827, "learning_rate": 1.1833657317247338e-05, "loss": 0.1765, "step": 8339 }, { "epoch": 1.6879174256223437, "grad_norm": 0.2685004770755768, "learning_rate": 1.1818652655462126e-05, "loss": 0.1696, "step": 8340 }, { "epoch": 1.688119813802874, "grad_norm": 0.2972007989883423, "learning_rate": 1.1803656914940364e-05, "loss": 0.1859, "step": 8341 }, { "epoch": 1.688322201983404, "grad_norm": 0.25820642709732056, "learning_rate": 1.1788670097199173e-05, "loss": 0.1738, "step": 8342 }, { "epoch": 1.6885245901639343, "grad_norm": 0.28410157561302185, "learning_rate": 1.1773692203754738e-05, "loss": 0.204, "step": 8343 }, { "epoch": 1.6887269783444647, "grad_norm": 0.29454171657562256, "learning_rate": 1.1758723236122382e-05, "loss": 0.1929, "step": 8344 }, { "epoch": 1.688929366524995, "grad_norm": 0.2966192066669464, "learning_rate": 1.1743763195816504e-05, "loss": 0.195, "step": 8345 }, { "epoch": 1.6891317547055253, "grad_norm": 0.28926682472229004, "learning_rate": 1.1728812084350605e-05, "loss": 0.1836, "step": 8346 }, { "epoch": 1.6893341428860555, "grad_norm": 0.28553107380867004, "learning_rate": 1.1713869903237273e-05, "loss": 0.18, "step": 8347 }, { "epoch": 1.6895365310665857, "grad_norm": 0.3032236397266388, "learning_rate": 1.1698936653988214e-05, "loss": 0.2032, "step": 8348 }, { "epoch": 1.689738919247116, "grad_norm": 0.3131108582019806, "learning_rate": 1.1684012338114214e-05, "loss": 0.1725, "step": 8349 }, { "epoch": 1.6899413074276461, "grad_norm": 0.30871301889419556, "learning_rate": 1.1669096957125159e-05, "loss": 
0.2258, "step": 8350 }, { "epoch": 1.6899413074276461, "eval_loss": 0.25871673226356506, "eval_runtime": 0.739, "eval_samples_per_second": 6.766, "eval_steps_per_second": 1.353, "step": 8350 }, { "epoch": 1.6901436956081763, "grad_norm": 0.2908981442451477, "learning_rate": 1.1654190512530016e-05, "loss": 0.1896, "step": 8351 }, { "epoch": 1.6903460837887068, "grad_norm": 0.3076784014701843, "learning_rate": 1.1639293005836894e-05, "loss": 0.2111, "step": 8352 }, { "epoch": 1.690548471969237, "grad_norm": 0.2984131872653961, "learning_rate": 1.1624404438552927e-05, "loss": 0.2285, "step": 8353 }, { "epoch": 1.6907508601497674, "grad_norm": 0.28389042615890503, "learning_rate": 1.1609524812184413e-05, "loss": 0.1759, "step": 8354 }, { "epoch": 1.6909532483302976, "grad_norm": 0.29734140634536743, "learning_rate": 1.1594654128236714e-05, "loss": 0.2161, "step": 8355 }, { "epoch": 1.6911556365108278, "grad_norm": 0.2875990569591522, "learning_rate": 1.1579792388214272e-05, "loss": 0.1972, "step": 8356 }, { "epoch": 1.691358024691358, "grad_norm": 0.27701500058174133, "learning_rate": 1.156493959362066e-05, "loss": 0.1898, "step": 8357 }, { "epoch": 1.6915604128718882, "grad_norm": 0.27099698781967163, "learning_rate": 1.1550095745958523e-05, "loss": 0.2021, "step": 8358 }, { "epoch": 1.6917628010524184, "grad_norm": 0.2977381944656372, "learning_rate": 1.1535260846729601e-05, "loss": 0.1998, "step": 8359 }, { "epoch": 1.6919651892329488, "grad_norm": 0.2615094482898712, "learning_rate": 1.152043489743474e-05, "loss": 0.1895, "step": 8360 }, { "epoch": 1.692167577413479, "grad_norm": 0.27318063378334045, "learning_rate": 1.1505617899573885e-05, "loss": 0.1829, "step": 8361 }, { "epoch": 1.6923699655940094, "grad_norm": 0.25262323021888733, "learning_rate": 1.1490809854646011e-05, "loss": 0.161, "step": 8362 }, { "epoch": 1.6925723537745396, "grad_norm": 0.3081194758415222, "learning_rate": 1.1476010764149304e-05, "loss": 0.1924, "step": 8363 }, { "epoch": 
1.6927747419550698, "grad_norm": 0.29487472772598267, "learning_rate": 1.146122062958095e-05, "loss": 0.2032, "step": 8364 }, { "epoch": 1.6929771301356, "grad_norm": 0.3395232558250427, "learning_rate": 1.1446439452437275e-05, "loss": 0.2292, "step": 8365 }, { "epoch": 1.6931795183161302, "grad_norm": 0.3127206563949585, "learning_rate": 1.143166723421366e-05, "loss": 0.1867, "step": 8366 }, { "epoch": 1.6933819064966606, "grad_norm": 0.30758431553840637, "learning_rate": 1.1416903976404625e-05, "loss": 0.2042, "step": 8367 }, { "epoch": 1.6935842946771908, "grad_norm": 0.3078838884830475, "learning_rate": 1.140214968050376e-05, "loss": 0.1806, "step": 8368 }, { "epoch": 1.6937866828577213, "grad_norm": 0.27097728848457336, "learning_rate": 1.1387404348003739e-05, "loss": 0.1924, "step": 8369 }, { "epoch": 1.6939890710382515, "grad_norm": 0.2731929123401642, "learning_rate": 1.1372667980396345e-05, "loss": 0.1853, "step": 8370 }, { "epoch": 1.6941914592187817, "grad_norm": 0.31753164529800415, "learning_rate": 1.1357940579172443e-05, "loss": 0.1928, "step": 8371 }, { "epoch": 1.6943938473993119, "grad_norm": 0.26757803559303284, "learning_rate": 1.1343222145822008e-05, "loss": 0.1857, "step": 8372 }, { "epoch": 1.694596235579842, "grad_norm": 0.2824253439903259, "learning_rate": 1.1328512681834092e-05, "loss": 0.2152, "step": 8373 }, { "epoch": 1.6947986237603723, "grad_norm": 0.27644097805023193, "learning_rate": 1.1313812188696838e-05, "loss": 0.2132, "step": 8374 }, { "epoch": 1.6950010119409027, "grad_norm": 0.30260854959487915, "learning_rate": 1.12991206678975e-05, "loss": 0.207, "step": 8375 }, { "epoch": 1.695203400121433, "grad_norm": 0.2823565900325775, "learning_rate": 1.1284438120922402e-05, "loss": 0.1841, "step": 8376 }, { "epoch": 1.6954057883019633, "grad_norm": 0.2845193147659302, "learning_rate": 1.1269764549256978e-05, "loss": 0.1704, "step": 8377 }, { "epoch": 1.6956081764824935, "grad_norm": 0.24924218654632568, "learning_rate": 
1.1255099954385727e-05, "loss": 0.1994, "step": 8378 }, { "epoch": 1.6958105646630237, "grad_norm": 0.25757744908332825, "learning_rate": 1.1240444337792288e-05, "loss": 0.1841, "step": 8379 }, { "epoch": 1.696012952843554, "grad_norm": 0.3529128432273865, "learning_rate": 1.122579770095934e-05, "loss": 0.2325, "step": 8380 }, { "epoch": 1.6962153410240841, "grad_norm": 0.26317161321640015, "learning_rate": 1.1211160045368685e-05, "loss": 0.1558, "step": 8381 }, { "epoch": 1.6964177292046143, "grad_norm": 0.28297486901283264, "learning_rate": 1.1196531372501207e-05, "loss": 0.1951, "step": 8382 }, { "epoch": 1.6966201173851447, "grad_norm": 0.32655608654022217, "learning_rate": 1.1181911683836899e-05, "loss": 0.1911, "step": 8383 }, { "epoch": 1.696822505565675, "grad_norm": 0.27955129742622375, "learning_rate": 1.1167300980854789e-05, "loss": 0.2055, "step": 8384 }, { "epoch": 1.6970248937462054, "grad_norm": 0.3174886405467987, "learning_rate": 1.1152699265033062e-05, "loss": 0.1973, "step": 8385 }, { "epoch": 1.6972272819267356, "grad_norm": 0.3020794093608856, "learning_rate": 1.1138106537848948e-05, "loss": 0.2151, "step": 8386 }, { "epoch": 1.6974296701072658, "grad_norm": 0.27277594804763794, "learning_rate": 1.1123522800778807e-05, "loss": 0.1897, "step": 8387 }, { "epoch": 1.697632058287796, "grad_norm": 0.3051762282848358, "learning_rate": 1.1108948055298052e-05, "loss": 0.1935, "step": 8388 }, { "epoch": 1.6978344464683262, "grad_norm": 0.2831827998161316, "learning_rate": 1.1094382302881212e-05, "loss": 0.1899, "step": 8389 }, { "epoch": 1.6980368346488564, "grad_norm": 0.3473316729068756, "learning_rate": 1.1079825545001888e-05, "loss": 0.2047, "step": 8390 }, { "epoch": 1.6982392228293868, "grad_norm": 0.2632426917552948, "learning_rate": 1.10652777831328e-05, "loss": 0.1827, "step": 8391 }, { "epoch": 1.698441611009917, "grad_norm": 0.3207499384880066, "learning_rate": 1.1050739018745716e-05, "loss": 0.2424, "step": 8392 }, { "epoch": 
1.6986439991904474, "grad_norm": 0.2827729880809784, "learning_rate": 1.1036209253311524e-05, "loss": 0.2034, "step": 8393 }, { "epoch": 1.6988463873709776, "grad_norm": 0.3075752854347229, "learning_rate": 1.1021688488300197e-05, "loss": 0.1949, "step": 8394 }, { "epoch": 1.6990487755515078, "grad_norm": 0.2706870138645172, "learning_rate": 1.100717672518078e-05, "loss": 0.1794, "step": 8395 }, { "epoch": 1.699251163732038, "grad_norm": 0.2784644663333893, "learning_rate": 1.0992673965421441e-05, "loss": 0.1848, "step": 8396 }, { "epoch": 1.6994535519125682, "grad_norm": 0.27086934447288513, "learning_rate": 1.0978180210489408e-05, "loss": 0.1771, "step": 8397 }, { "epoch": 1.6996559400930986, "grad_norm": 0.2761807441711426, "learning_rate": 1.0963695461850997e-05, "loss": 0.1799, "step": 8398 }, { "epoch": 1.6998583282736288, "grad_norm": 0.2793600559234619, "learning_rate": 1.0949219720971638e-05, "loss": 0.1586, "step": 8399 }, { "epoch": 1.7000607164541592, "grad_norm": 0.29326415061950684, "learning_rate": 1.0934752989315834e-05, "loss": 0.1972, "step": 8400 }, { "epoch": 1.7000607164541592, "eval_loss": 0.25876384973526, "eval_runtime": 0.7391, "eval_samples_per_second": 6.765, "eval_steps_per_second": 1.353, "step": 8400 }, { "epoch": 1.7002631046346894, "grad_norm": 0.2465113401412964, "learning_rate": 1.0920295268347159e-05, "loss": 0.1682, "step": 8401 }, { "epoch": 1.7004654928152196, "grad_norm": 0.24070094525814056, "learning_rate": 1.090584655952831e-05, "loss": 0.1522, "step": 8402 }, { "epoch": 1.7006678809957498, "grad_norm": 0.2528705298900604, "learning_rate": 1.0891406864321053e-05, "loss": 0.1996, "step": 8403 }, { "epoch": 1.70087026917628, "grad_norm": 0.31882789731025696, "learning_rate": 1.087697618418625e-05, "loss": 0.2033, "step": 8404 }, { "epoch": 1.7010726573568102, "grad_norm": 0.2811528444290161, "learning_rate": 1.0862554520583857e-05, "loss": 0.199, "step": 8405 }, { "epoch": 1.7012750455373407, "grad_norm": 0.27912142872810364, 
"learning_rate": 1.0848141874972862e-05, "loss": 0.1795, "step": 8406 }, { "epoch": 1.7014774337178709, "grad_norm": 0.2798633873462677, "learning_rate": 1.083373824881142e-05, "loss": 0.2137, "step": 8407 }, { "epoch": 1.7016798218984013, "grad_norm": 0.24729759991168976, "learning_rate": 1.0819343643556723e-05, "loss": 0.1795, "step": 8408 }, { "epoch": 1.7018822100789315, "grad_norm": 0.2917996942996979, "learning_rate": 1.0804958060665082e-05, "loss": 0.199, "step": 8409 }, { "epoch": 1.7020845982594617, "grad_norm": 0.291407972574234, "learning_rate": 1.0790581501591867e-05, "loss": 0.1901, "step": 8410 }, { "epoch": 1.7022869864399919, "grad_norm": 0.26372459530830383, "learning_rate": 1.0776213967791549e-05, "loss": 0.1769, "step": 8411 }, { "epoch": 1.702489374620522, "grad_norm": 0.25174322724342346, "learning_rate": 1.0761855460717696e-05, "loss": 0.1688, "step": 8412 }, { "epoch": 1.7026917628010523, "grad_norm": 0.27877235412597656, "learning_rate": 1.0747505981822937e-05, "loss": 0.1756, "step": 8413 }, { "epoch": 1.7028941509815827, "grad_norm": 0.2647832930088043, "learning_rate": 1.0733165532558998e-05, "loss": 0.184, "step": 8414 }, { "epoch": 1.703096539162113, "grad_norm": 0.26928800344467163, "learning_rate": 1.0718834114376718e-05, "loss": 0.1955, "step": 8415 }, { "epoch": 1.7032989273426433, "grad_norm": 0.2908056378364563, "learning_rate": 1.0704511728725986e-05, "loss": 0.2094, "step": 8416 }, { "epoch": 1.7035013155231735, "grad_norm": 0.2552296817302704, "learning_rate": 1.0690198377055782e-05, "loss": 0.1833, "step": 8417 }, { "epoch": 1.7037037037037037, "grad_norm": 0.27076640725135803, "learning_rate": 1.0675894060814185e-05, "loss": 0.1714, "step": 8418 }, { "epoch": 1.703906091884234, "grad_norm": 0.3016811013221741, "learning_rate": 1.0661598781448368e-05, "loss": 0.1685, "step": 8419 }, { "epoch": 1.7041084800647641, "grad_norm": 0.33398357033729553, "learning_rate": 1.0647312540404552e-05, "loss": 0.2415, "step": 8420 }, { 
"epoch": 1.7043108682452943, "grad_norm": 0.4259779453277588, "learning_rate": 1.0633035339128106e-05, "loss": 0.213, "step": 8421 }, { "epoch": 1.7045132564258247, "grad_norm": 0.272946298122406, "learning_rate": 1.0618767179063416e-05, "loss": 0.201, "step": 8422 }, { "epoch": 1.704715644606355, "grad_norm": 0.2644243538379669, "learning_rate": 1.0604508061654005e-05, "loss": 0.1957, "step": 8423 }, { "epoch": 1.7049180327868854, "grad_norm": 0.2648584842681885, "learning_rate": 1.0590257988342456e-05, "loss": 0.1774, "step": 8424 }, { "epoch": 1.7051204209674156, "grad_norm": 0.30881166458129883, "learning_rate": 1.0576016960570433e-05, "loss": 0.1931, "step": 8425 }, { "epoch": 1.7053228091479458, "grad_norm": 0.22526279091835022, "learning_rate": 1.05617849797787e-05, "loss": 0.1567, "step": 8426 }, { "epoch": 1.705525197328476, "grad_norm": 0.31192898750305176, "learning_rate": 1.054756204740711e-05, "loss": 0.2126, "step": 8427 }, { "epoch": 1.7057275855090062, "grad_norm": 0.2907617688179016, "learning_rate": 1.0533348164894575e-05, "loss": 0.2089, "step": 8428 }, { "epoch": 1.7059299736895366, "grad_norm": 0.31975242495536804, "learning_rate": 1.0519143333679094e-05, "loss": 0.2075, "step": 8429 }, { "epoch": 1.7061323618700668, "grad_norm": 0.2925901412963867, "learning_rate": 1.0504947555197786e-05, "loss": 0.1919, "step": 8430 }, { "epoch": 1.7063347500505972, "grad_norm": 0.2986370623111725, "learning_rate": 1.0490760830886826e-05, "loss": 0.1932, "step": 8431 }, { "epoch": 1.7065371382311274, "grad_norm": 0.31750813126564026, "learning_rate": 1.0476583162181464e-05, "loss": 0.1895, "step": 8432 }, { "epoch": 1.7067395264116576, "grad_norm": 0.3126259446144104, "learning_rate": 1.0462414550516064e-05, "loss": 0.2198, "step": 8433 }, { "epoch": 1.7069419145921878, "grad_norm": 0.3027331829071045, "learning_rate": 1.0448254997324058e-05, "loss": 0.2191, "step": 8434 }, { "epoch": 1.707144302772718, "grad_norm": 0.256282240152359, "learning_rate": 
1.0434104504037956e-05, "loss": 0.1846, "step": 8435 }, { "epoch": 1.7073466909532482, "grad_norm": 0.2230484038591385, "learning_rate": 1.0419963072089355e-05, "loss": 0.167, "step": 8436 }, { "epoch": 1.7075490791337786, "grad_norm": 0.2401764690876007, "learning_rate": 1.0405830702908936e-05, "loss": 0.1602, "step": 8437 }, { "epoch": 1.7077514673143088, "grad_norm": 0.2753002941608429, "learning_rate": 1.0391707397926465e-05, "loss": 0.1878, "step": 8438 }, { "epoch": 1.7079538554948392, "grad_norm": 0.31785663962364197, "learning_rate": 1.0377593158570786e-05, "loss": 0.2253, "step": 8439 }, { "epoch": 1.7081562436753694, "grad_norm": 0.26879486441612244, "learning_rate": 1.0363487986269837e-05, "loss": 0.1646, "step": 8440 }, { "epoch": 1.7083586318558996, "grad_norm": 0.3231867253780365, "learning_rate": 1.0349391882450632e-05, "loss": 0.1762, "step": 8441 }, { "epoch": 1.7085610200364298, "grad_norm": 0.306441068649292, "learning_rate": 1.0335304848539262e-05, "loss": 0.199, "step": 8442 }, { "epoch": 1.70876340821696, "grad_norm": 0.325333833694458, "learning_rate": 1.0321226885960899e-05, "loss": 0.2153, "step": 8443 }, { "epoch": 1.7089657963974902, "grad_norm": 0.2622414529323578, "learning_rate": 1.0307157996139815e-05, "loss": 0.1683, "step": 8444 }, { "epoch": 1.7091681845780207, "grad_norm": 0.2952292263507843, "learning_rate": 1.0293098180499361e-05, "loss": 0.183, "step": 8445 }, { "epoch": 1.7093705727585509, "grad_norm": 0.2727248966693878, "learning_rate": 1.0279047440461931e-05, "loss": 0.2091, "step": 8446 }, { "epoch": 1.7095729609390813, "grad_norm": 0.302756667137146, "learning_rate": 1.0265005777449066e-05, "loss": 0.2054, "step": 8447 }, { "epoch": 1.7097753491196115, "grad_norm": 0.35711240768432617, "learning_rate": 1.025097319288133e-05, "loss": 0.2223, "step": 8448 }, { "epoch": 1.7099777373001417, "grad_norm": 0.2822362780570984, "learning_rate": 1.0236949688178399e-05, "loss": 0.1611, "step": 8449 }, { "epoch": 1.7101801254806719, 
"grad_norm": 0.2875375747680664, "learning_rate": 1.0222935264759037e-05, "loss": 0.2048, "step": 8450 }, { "epoch": 1.7101801254806719, "eval_loss": 0.2587939500808716, "eval_runtime": 0.7393, "eval_samples_per_second": 6.764, "eval_steps_per_second": 1.353, "step": 8450 }, { "epoch": 1.710382513661202, "grad_norm": 0.29503369331359863, "learning_rate": 1.0208929924041055e-05, "loss": 0.2009, "step": 8451 }, { "epoch": 1.7105849018417323, "grad_norm": 0.2740275263786316, "learning_rate": 1.0194933667441386e-05, "loss": 0.1941, "step": 8452 }, { "epoch": 1.7107872900222627, "grad_norm": 0.2855680584907532, "learning_rate": 1.0180946496375998e-05, "loss": 0.2248, "step": 8453 }, { "epoch": 1.710989678202793, "grad_norm": 0.28390875458717346, "learning_rate": 1.0166968412259992e-05, "loss": 0.1819, "step": 8454 }, { "epoch": 1.7111920663833233, "grad_norm": 0.2848486006259918, "learning_rate": 1.0152999416507513e-05, "loss": 0.1901, "step": 8455 }, { "epoch": 1.7113944545638535, "grad_norm": 0.2505626082420349, "learning_rate": 1.01390395105318e-05, "loss": 0.1658, "step": 8456 }, { "epoch": 1.7115968427443837, "grad_norm": 0.2542025148868561, "learning_rate": 1.0125088695745166e-05, "loss": 0.1931, "step": 8457 }, { "epoch": 1.711799230924914, "grad_norm": 0.2455524504184723, "learning_rate": 1.0111146973559015e-05, "loss": 0.198, "step": 8458 }, { "epoch": 1.7120016191054441, "grad_norm": 0.2576819360256195, "learning_rate": 1.009721434538381e-05, "loss": 0.1706, "step": 8459 }, { "epoch": 1.7122040072859745, "grad_norm": 0.26860541105270386, "learning_rate": 1.0083290812629132e-05, "loss": 0.2004, "step": 8460 }, { "epoch": 1.7124063954665047, "grad_norm": 0.25479787588119507, "learning_rate": 1.0069376376703598e-05, "loss": 0.1626, "step": 8461 }, { "epoch": 1.7126087836470352, "grad_norm": 0.3089302182197571, "learning_rate": 1.005547103901493e-05, "loss": 0.2, "step": 8462 }, { "epoch": 1.7128111718275654, "grad_norm": 0.3235393166542053, "learning_rate": 
1.0041574800969921e-05, "loss": 0.1868, "step": 8463 }, { "epoch": 1.7130135600080956, "grad_norm": 0.2871195673942566, "learning_rate": 1.0027687663974462e-05, "loss": 0.1798, "step": 8464 }, { "epoch": 1.7132159481886258, "grad_norm": 0.33283573389053345, "learning_rate": 1.001380962943349e-05, "loss": 0.223, "step": 8465 }, { "epoch": 1.713418336369156, "grad_norm": 0.27454549074172974, "learning_rate": 9.999940698751043e-06, "loss": 0.1968, "step": 8466 }, { "epoch": 1.7136207245496862, "grad_norm": 0.2364848405122757, "learning_rate": 9.98608087333024e-06, "loss": 0.1666, "step": 8467 }, { "epoch": 1.7138231127302166, "grad_norm": 0.3056480586528778, "learning_rate": 9.97223015457327e-06, "loss": 0.2041, "step": 8468 }, { "epoch": 1.7140255009107468, "grad_norm": 0.3281700909137726, "learning_rate": 9.958388543881392e-06, "loss": 0.1717, "step": 8469 }, { "epoch": 1.7142278890912772, "grad_norm": 0.28150680661201477, "learning_rate": 9.944556042654973e-06, "loss": 0.199, "step": 8470 }, { "epoch": 1.7144302772718074, "grad_norm": 0.2600691020488739, "learning_rate": 9.93073265229344e-06, "loss": 0.186, "step": 8471 }, { "epoch": 1.7146326654523376, "grad_norm": 0.2786303758621216, "learning_rate": 9.916918374195282e-06, "loss": 0.1903, "step": 8472 }, { "epoch": 1.7148350536328678, "grad_norm": 0.24803856015205383, "learning_rate": 9.903113209758096e-06, "loss": 0.1542, "step": 8473 }, { "epoch": 1.715037441813398, "grad_norm": 0.3093421459197998, "learning_rate": 9.889317160378531e-06, "loss": 0.2187, "step": 8474 }, { "epoch": 1.7152398299939282, "grad_norm": 0.27181512117385864, "learning_rate": 9.875530227452345e-06, "loss": 0.2113, "step": 8475 }, { "epoch": 1.7154422181744586, "grad_norm": 0.3223206102848053, "learning_rate": 9.861752412374336e-06, "loss": 0.2015, "step": 8476 }, { "epoch": 1.7156446063549888, "grad_norm": 0.24381880462169647, "learning_rate": 9.847983716538423e-06, "loss": 0.1781, "step": 8477 }, { "epoch": 1.7158469945355193, 
"grad_norm": 0.2983056902885437, "learning_rate": 9.834224141337544e-06, "loss": 0.178, "step": 8478 }, { "epoch": 1.7160493827160495, "grad_norm": 0.3364439308643341, "learning_rate": 9.820473688163778e-06, "loss": 0.1767, "step": 8479 }, { "epoch": 1.7162517708965797, "grad_norm": 0.26381823420524597, "learning_rate": 9.806732358408244e-06, "loss": 0.1845, "step": 8480 }, { "epoch": 1.7164541590771099, "grad_norm": 0.3094126880168915, "learning_rate": 9.793000153461141e-06, "loss": 0.2032, "step": 8481 }, { "epoch": 1.71665654725764, "grad_norm": 0.30394935607910156, "learning_rate": 9.779277074711745e-06, "loss": 0.1906, "step": 8482 }, { "epoch": 1.7168589354381703, "grad_norm": 0.289957731962204, "learning_rate": 9.765563123548426e-06, "loss": 0.1838, "step": 8483 }, { "epoch": 1.7170613236187007, "grad_norm": 0.2532503008842468, "learning_rate": 9.751858301358607e-06, "loss": 0.1635, "step": 8484 }, { "epoch": 1.7172637117992309, "grad_norm": 0.26653948426246643, "learning_rate": 9.73816260952881e-06, "loss": 0.1798, "step": 8485 }, { "epoch": 1.7174660999797613, "grad_norm": 0.30379506945610046, "learning_rate": 9.724476049444609e-06, "loss": 0.1908, "step": 8486 }, { "epoch": 1.7176684881602915, "grad_norm": 0.4283871054649353, "learning_rate": 9.710798622490669e-06, "loss": 0.1931, "step": 8487 }, { "epoch": 1.7178708763408217, "grad_norm": 0.24685965478420258, "learning_rate": 9.69713033005073e-06, "loss": 0.1501, "step": 8488 }, { "epoch": 1.718073264521352, "grad_norm": 0.31098735332489014, "learning_rate": 9.683471173507608e-06, "loss": 0.1989, "step": 8489 }, { "epoch": 1.718275652701882, "grad_norm": 0.2871131896972656, "learning_rate": 9.669821154243186e-06, "loss": 0.1812, "step": 8490 }, { "epoch": 1.7184780408824125, "grad_norm": 0.3463563323020935, "learning_rate": 9.656180273638448e-06, "loss": 0.2067, "step": 8491 }, { "epoch": 1.7186804290629427, "grad_norm": 0.34777534008026123, "learning_rate": 9.642548533073415e-06, "loss": 0.2103, "step": 
8492 }, { "epoch": 1.7188828172434731, "grad_norm": 0.30363330245018005, "learning_rate": 9.628925933927213e-06, "loss": 0.1695, "step": 8493 }, { "epoch": 1.7190852054240033, "grad_norm": 0.281427264213562, "learning_rate": 9.615312477578054e-06, "loss": 0.173, "step": 8494 }, { "epoch": 1.7192875936045335, "grad_norm": 0.32801055908203125, "learning_rate": 9.601708165403156e-06, "loss": 0.2195, "step": 8495 }, { "epoch": 1.7194899817850637, "grad_norm": 0.25589799880981445, "learning_rate": 9.588112998778898e-06, "loss": 0.1961, "step": 8496 }, { "epoch": 1.719692369965594, "grad_norm": 0.26079970598220825, "learning_rate": 9.574526979080678e-06, "loss": 0.1715, "step": 8497 }, { "epoch": 1.7198947581461241, "grad_norm": 0.2752712070941925, "learning_rate": 9.560950107682997e-06, "loss": 0.1846, "step": 8498 }, { "epoch": 1.7200971463266546, "grad_norm": 0.29999780654907227, "learning_rate": 9.547382385959414e-06, "loss": 0.215, "step": 8499 }, { "epoch": 1.7202995345071848, "grad_norm": 0.2767726480960846, "learning_rate": 9.533823815282583e-06, "loss": 0.1907, "step": 8500 }, { "epoch": 1.7202995345071848, "eval_loss": 0.25887531042099, "eval_runtime": 0.7403, "eval_samples_per_second": 6.754, "eval_steps_per_second": 1.351, "step": 8500 }, { "epoch": 1.7205019226877152, "grad_norm": 0.26701119542121887, "learning_rate": 9.520274397024198e-06, "loss": 0.1854, "step": 8501 }, { "epoch": 1.7207043108682454, "grad_norm": 0.334595650434494, "learning_rate": 9.506734132555062e-06, "loss": 0.2115, "step": 8502 }, { "epoch": 1.7209066990487756, "grad_norm": 0.29382118582725525, "learning_rate": 9.493203023245023e-06, "loss": 0.1966, "step": 8503 }, { "epoch": 1.7211090872293058, "grad_norm": 0.30095425248146057, "learning_rate": 9.47968107046303e-06, "loss": 0.1959, "step": 8504 }, { "epoch": 1.721311475409836, "grad_norm": 0.24783647060394287, "learning_rate": 9.466168275577092e-06, "loss": 0.195, "step": 8505 }, { "epoch": 1.7215138635903662, "grad_norm": 
0.26918861269950867, "learning_rate": 9.452664639954278e-06, "loss": 0.1892, "step": 8506 }, { "epoch": 1.7217162517708966, "grad_norm": 0.30738553404808044, "learning_rate": 9.439170164960765e-06, "loss": 0.1901, "step": 8507 }, { "epoch": 1.7219186399514268, "grad_norm": 0.2554987967014313, "learning_rate": 9.425684851961757e-06, "loss": 0.1827, "step": 8508 }, { "epoch": 1.7221210281319572, "grad_norm": 0.288388192653656, "learning_rate": 9.41220870232158e-06, "loss": 0.1896, "step": 8509 }, { "epoch": 1.7223234163124874, "grad_norm": 0.28056037425994873, "learning_rate": 9.398741717403604e-06, "loss": 0.2062, "step": 8510 }, { "epoch": 1.7225258044930176, "grad_norm": 0.35663676261901855, "learning_rate": 9.385283898570264e-06, "loss": 0.2208, "step": 8511 }, { "epoch": 1.7227281926735478, "grad_norm": 0.3021637499332428, "learning_rate": 9.371835247183092e-06, "loss": 0.1946, "step": 8512 }, { "epoch": 1.722930580854078, "grad_norm": 0.3164548873901367, "learning_rate": 9.358395764602679e-06, "loss": 0.2, "step": 8513 }, { "epoch": 1.7231329690346082, "grad_norm": 0.2893165051937103, "learning_rate": 9.344965452188692e-06, "loss": 0.1828, "step": 8514 }, { "epoch": 1.7233353572151386, "grad_norm": 0.25707072019577026, "learning_rate": 9.331544311299867e-06, "loss": 0.1798, "step": 8515 }, { "epoch": 1.7235377453956688, "grad_norm": 0.2197255641222, "learning_rate": 9.318132343294018e-06, "loss": 0.1516, "step": 8516 }, { "epoch": 1.7237401335761993, "grad_norm": 0.2677488625049591, "learning_rate": 9.304729549528014e-06, "loss": 0.1883, "step": 8517 }, { "epoch": 1.7239425217567295, "grad_norm": 0.2830187976360321, "learning_rate": 9.291335931357826e-06, "loss": 0.1925, "step": 8518 }, { "epoch": 1.7241449099372597, "grad_norm": 0.2824142873287201, "learning_rate": 9.27795149013848e-06, "loss": 0.1801, "step": 8519 }, { "epoch": 1.7243472981177899, "grad_norm": 0.2455061674118042, "learning_rate": 9.264576227224064e-06, "loss": 0.1358, "step": 8520 }, { 
"epoch": 1.72454968629832, "grad_norm": 0.2879447937011719, "learning_rate": 9.251210143967737e-06, "loss": 0.1981, "step": 8521 }, { "epoch": 1.7247520744788505, "grad_norm": 0.33884865045547485, "learning_rate": 9.237853241721761e-06, "loss": 0.1773, "step": 8522 }, { "epoch": 1.7249544626593807, "grad_norm": 0.26604849100112915, "learning_rate": 9.224505521837446e-06, "loss": 0.1719, "step": 8523 }, { "epoch": 1.725156850839911, "grad_norm": 0.3080281615257263, "learning_rate": 9.211166985665154e-06, "loss": 0.1899, "step": 8524 }, { "epoch": 1.7253592390204413, "grad_norm": 0.25215914845466614, "learning_rate": 9.197837634554351e-06, "loss": 0.1578, "step": 8525 }, { "epoch": 1.7255616272009715, "grad_norm": 0.2496306151151657, "learning_rate": 9.18451746985356e-06, "loss": 0.1613, "step": 8526 }, { "epoch": 1.7257640153815017, "grad_norm": 0.3078269958496094, "learning_rate": 9.171206492910377e-06, "loss": 0.2016, "step": 8527 }, { "epoch": 1.725966403562032, "grad_norm": 0.2878892719745636, "learning_rate": 9.157904705071474e-06, "loss": 0.1953, "step": 8528 }, { "epoch": 1.726168791742562, "grad_norm": 0.3165808916091919, "learning_rate": 9.14461210768257e-06, "loss": 0.1478, "step": 8529 }, { "epoch": 1.7263711799230925, "grad_norm": 0.2814152240753174, "learning_rate": 9.131328702088471e-06, "loss": 0.2018, "step": 8530 }, { "epoch": 1.7265735681036227, "grad_norm": 0.34386932849884033, "learning_rate": 9.118054489633066e-06, "loss": 0.2185, "step": 8531 }, { "epoch": 1.7267759562841531, "grad_norm": 0.26644062995910645, "learning_rate": 9.104789471659303e-06, "loss": 0.1746, "step": 8532 }, { "epoch": 1.7269783444646833, "grad_norm": 0.2888523042201996, "learning_rate": 9.091533649509177e-06, "loss": 0.1801, "step": 8533 }, { "epoch": 1.7271807326452135, "grad_norm": 0.29055801033973694, "learning_rate": 9.078287024523791e-06, "loss": 0.1747, "step": 8534 }, { "epoch": 1.7273831208257437, "grad_norm": 0.2960166037082672, "learning_rate": 
9.065049598043285e-06, "loss": 0.1804, "step": 8535 }, { "epoch": 1.727585509006274, "grad_norm": 0.3112858235836029, "learning_rate": 9.0518213714069e-06, "loss": 0.1795, "step": 8536 }, { "epoch": 1.7277878971868041, "grad_norm": 0.24035745859146118, "learning_rate": 9.038602345952919e-06, "loss": 0.1646, "step": 8537 }, { "epoch": 1.7279902853673346, "grad_norm": 0.25147929787635803, "learning_rate": 9.025392523018706e-06, "loss": 0.1391, "step": 8538 }, { "epoch": 1.7281926735478648, "grad_norm": 0.3143143653869629, "learning_rate": 9.012191903940704e-06, "loss": 0.1979, "step": 8539 }, { "epoch": 1.7283950617283952, "grad_norm": 0.2703983783721924, "learning_rate": 8.999000490054388e-06, "loss": 0.1675, "step": 8540 }, { "epoch": 1.7285974499089254, "grad_norm": 0.3867993652820587, "learning_rate": 8.985818282694336e-06, "loss": 0.1804, "step": 8541 }, { "epoch": 1.7287998380894556, "grad_norm": 0.2795158922672272, "learning_rate": 8.972645283194193e-06, "loss": 0.1768, "step": 8542 }, { "epoch": 1.7290022262699858, "grad_norm": 0.2464820295572281, "learning_rate": 8.959481492886657e-06, "loss": 0.1665, "step": 8543 }, { "epoch": 1.729204614450516, "grad_norm": 0.24622751772403717, "learning_rate": 8.946326913103508e-06, "loss": 0.1711, "step": 8544 }, { "epoch": 1.7294070026310462, "grad_norm": 0.3368050754070282, "learning_rate": 8.933181545175585e-06, "loss": 0.2069, "step": 8545 }, { "epoch": 1.7296093908115766, "grad_norm": 0.270673930644989, "learning_rate": 8.920045390432796e-06, "loss": 0.2033, "step": 8546 }, { "epoch": 1.7298117789921068, "grad_norm": 0.26693806052207947, "learning_rate": 8.906918450204138e-06, "loss": 0.1797, "step": 8547 }, { "epoch": 1.7300141671726372, "grad_norm": 0.3291761875152588, "learning_rate": 8.893800725817624e-06, "loss": 0.1935, "step": 8548 }, { "epoch": 1.7302165553531674, "grad_norm": 0.35645949840545654, "learning_rate": 8.880692218600406e-06, "loss": 0.1857, "step": 8549 }, { "epoch": 1.7304189435336976, 
"grad_norm": 0.2925495505332947, "learning_rate": 8.867592929878632e-06, "loss": 0.1572, "step": 8550 }, { "epoch": 1.7304189435336976, "eval_loss": 0.2594282627105713, "eval_runtime": 0.7383, "eval_samples_per_second": 6.773, "eval_steps_per_second": 1.355, "step": 8550 }, { "epoch": 1.7306213317142278, "grad_norm": 0.2810535430908203, "learning_rate": 8.854502860977564e-06, "loss": 0.2246, "step": 8551 }, { "epoch": 1.730823719894758, "grad_norm": 0.3040057122707367, "learning_rate": 8.841422013221524e-06, "loss": 0.1888, "step": 8552 }, { "epoch": 1.7310261080752884, "grad_norm": 0.2596018314361572, "learning_rate": 8.828350387933882e-06, "loss": 0.1542, "step": 8553 }, { "epoch": 1.7312284962558186, "grad_norm": 0.2669846713542938, "learning_rate": 8.815287986437092e-06, "loss": 0.1916, "step": 8554 }, { "epoch": 1.731430884436349, "grad_norm": 0.2962510883808136, "learning_rate": 8.80223481005269e-06, "loss": 0.211, "step": 8555 }, { "epoch": 1.7316332726168793, "grad_norm": 0.2679612934589386, "learning_rate": 8.789190860101225e-06, "loss": 0.1475, "step": 8556 }, { "epoch": 1.7318356607974095, "grad_norm": 0.2798098027706146, "learning_rate": 8.77615613790237e-06, "loss": 0.2033, "step": 8557 }, { "epoch": 1.7320380489779397, "grad_norm": 0.311907559633255, "learning_rate": 8.763130644774842e-06, "loss": 0.2081, "step": 8558 }, { "epoch": 1.7322404371584699, "grad_norm": 0.2620576024055481, "learning_rate": 8.750114382036412e-06, "loss": 0.1779, "step": 8559 }, { "epoch": 1.732442825339, "grad_norm": 0.30197739601135254, "learning_rate": 8.737107351003937e-06, "loss": 0.2017, "step": 8560 }, { "epoch": 1.7326452135195305, "grad_norm": 0.26769980788230896, "learning_rate": 8.724109552993342e-06, "loss": 0.1938, "step": 8561 }, { "epoch": 1.7328476017000607, "grad_norm": 0.27940577268600464, "learning_rate": 8.711120989319588e-06, "loss": 0.1882, "step": 8562 }, { "epoch": 1.7330499898805911, "grad_norm": 0.31509724259376526, "learning_rate": 
8.69814166129672e-06, "loss": 0.1933, "step": 8563 }, { "epoch": 1.7332523780611213, "grad_norm": 0.3496498763561249, "learning_rate": 8.68517157023786e-06, "loss": 0.205, "step": 8564 }, { "epoch": 1.7334547662416515, "grad_norm": 0.25049978494644165, "learning_rate": 8.672210717455187e-06, "loss": 0.1646, "step": 8565 }, { "epoch": 1.7336571544221817, "grad_norm": 0.29622167348861694, "learning_rate": 8.659259104259942e-06, "loss": 0.1671, "step": 8566 }, { "epoch": 1.733859542602712, "grad_norm": 0.27434036135673523, "learning_rate": 8.646316731962433e-06, "loss": 0.19, "step": 8567 }, { "epoch": 1.7340619307832421, "grad_norm": 0.2887204885482788, "learning_rate": 8.633383601872035e-06, "loss": 0.1957, "step": 8568 }, { "epoch": 1.7342643189637725, "grad_norm": 0.3375066816806793, "learning_rate": 8.62045971529718e-06, "loss": 0.2209, "step": 8569 }, { "epoch": 1.7344667071443027, "grad_norm": 0.32264482975006104, "learning_rate": 8.607545073545375e-06, "loss": 0.1748, "step": 8570 }, { "epoch": 1.7346690953248332, "grad_norm": 0.33347657322883606, "learning_rate": 8.5946396779232e-06, "loss": 0.1992, "step": 8571 }, { "epoch": 1.7348714835053634, "grad_norm": 0.2814638614654541, "learning_rate": 8.581743529736274e-06, "loss": 0.2136, "step": 8572 }, { "epoch": 1.7350738716858936, "grad_norm": 0.3195917010307312, "learning_rate": 8.568856630289268e-06, "loss": 0.2252, "step": 8573 }, { "epoch": 1.7352762598664238, "grad_norm": 0.2509499192237854, "learning_rate": 8.555978980886004e-06, "loss": 0.1712, "step": 8574 }, { "epoch": 1.735478648046954, "grad_norm": 0.32808899879455566, "learning_rate": 8.543110582829272e-06, "loss": 0.2164, "step": 8575 }, { "epoch": 1.7356810362274842, "grad_norm": 0.244186133146286, "learning_rate": 8.530251437420954e-06, "loss": 0.1678, "step": 8576 }, { "epoch": 1.7358834244080146, "grad_norm": 0.2884461581707001, "learning_rate": 8.517401545962034e-06, "loss": 0.1972, "step": 8577 }, { "epoch": 1.7360858125885448, "grad_norm": 
0.3048520088195801, "learning_rate": 8.50456090975249e-06, "loss": 0.218, "step": 8578 }, { "epoch": 1.7362882007690752, "grad_norm": 0.34312018752098083, "learning_rate": 8.49172953009143e-06, "loss": 0.2004, "step": 8579 }, { "epoch": 1.7364905889496054, "grad_norm": 0.28323104977607727, "learning_rate": 8.478907408276993e-06, "loss": 0.1645, "step": 8580 }, { "epoch": 1.7366929771301356, "grad_norm": 0.26150673627853394, "learning_rate": 8.466094545606385e-06, "loss": 0.1472, "step": 8581 }, { "epoch": 1.7368953653106658, "grad_norm": 0.30239489674568176, "learning_rate": 8.45329094337588e-06, "loss": 0.1921, "step": 8582 }, { "epoch": 1.737097753491196, "grad_norm": 0.2655140459537506, "learning_rate": 8.44049660288082e-06, "loss": 0.2039, "step": 8583 }, { "epoch": 1.7373001416717264, "grad_norm": 0.28127583861351013, "learning_rate": 8.427711525415571e-06, "loss": 0.2167, "step": 8584 }, { "epoch": 1.7375025298522566, "grad_norm": 0.2773891091346741, "learning_rate": 8.41493571227362e-06, "loss": 0.1755, "step": 8585 }, { "epoch": 1.737704918032787, "grad_norm": 0.2599252760410309, "learning_rate": 8.402169164747475e-06, "loss": 0.1728, "step": 8586 }, { "epoch": 1.7379073062133172, "grad_norm": 0.2744457721710205, "learning_rate": 8.389411884128728e-06, "loss": 0.1987, "step": 8587 }, { "epoch": 1.7381096943938474, "grad_norm": 0.26484882831573486, "learning_rate": 8.376663871708035e-06, "loss": 0.1871, "step": 8588 }, { "epoch": 1.7383120825743776, "grad_norm": 0.2672707438468933, "learning_rate": 8.363925128775096e-06, "loss": 0.2017, "step": 8589 }, { "epoch": 1.7385144707549078, "grad_norm": 0.32294702529907227, "learning_rate": 8.351195656618682e-06, "loss": 0.2062, "step": 8590 }, { "epoch": 1.738716858935438, "grad_norm": 0.27408814430236816, "learning_rate": 8.33847545652664e-06, "loss": 0.1722, "step": 8591 }, { "epoch": 1.7389192471159685, "grad_norm": 0.2947283089160919, "learning_rate": 8.325764529785851e-06, "loss": 0.2286, "step": 8592 }, { 
"epoch": 1.7391216352964987, "grad_norm": 0.2431199848651886, "learning_rate": 8.313062877682287e-06, "loss": 0.1926, "step": 8593 }, { "epoch": 1.739324023477029, "grad_norm": 0.30300426483154297, "learning_rate": 8.300370501500953e-06, "loss": 0.2268, "step": 8594 }, { "epoch": 1.7395264116575593, "grad_norm": 0.2920367121696472, "learning_rate": 8.287687402525945e-06, "loss": 0.1849, "step": 8595 }, { "epoch": 1.7397287998380895, "grad_norm": 0.2584496736526489, "learning_rate": 8.275013582040392e-06, "loss": 0.18, "step": 8596 }, { "epoch": 1.7399311880186197, "grad_norm": 0.26713255047798157, "learning_rate": 8.262349041326512e-06, "loss": 0.1572, "step": 8597 }, { "epoch": 1.7401335761991499, "grad_norm": 0.2660084664821625, "learning_rate": 8.249693781665557e-06, "loss": 0.2091, "step": 8598 }, { "epoch": 1.74033596437968, "grad_norm": 0.32999715209007263, "learning_rate": 8.237047804337861e-06, "loss": 0.1907, "step": 8599 }, { "epoch": 1.7405383525602105, "grad_norm": 0.2914731502532959, "learning_rate": 8.22441111062282e-06, "loss": 0.2153, "step": 8600 }, { "epoch": 1.7405383525602105, "eval_loss": 0.2593904137611389, "eval_runtime": 0.7381, "eval_samples_per_second": 6.774, "eval_steps_per_second": 1.355, "step": 8600 }, { "epoch": 1.7407407407407407, "grad_norm": 0.27393868565559387, "learning_rate": 8.211783701798859e-06, "loss": 0.19, "step": 8601 }, { "epoch": 1.7409431289212711, "grad_norm": 0.24766801297664642, "learning_rate": 8.199165579143508e-06, "loss": 0.1699, "step": 8602 }, { "epoch": 1.7411455171018013, "grad_norm": 0.2624143064022064, "learning_rate": 8.186556743933327e-06, "loss": 0.1918, "step": 8603 }, { "epoch": 1.7413479052823315, "grad_norm": 0.2595481872558594, "learning_rate": 8.173957197443948e-06, "loss": 0.1906, "step": 8604 }, { "epoch": 1.7415502934628617, "grad_norm": 0.2655414938926697, "learning_rate": 8.161366940950076e-06, "loss": 0.17, "step": 8605 }, { "epoch": 1.741752681643392, "grad_norm": 0.29584211111068726, 
"learning_rate": 8.148785975725437e-06, "loss": 0.1749, "step": 8606 }, { "epoch": 1.7419550698239221, "grad_norm": 0.28779736161231995, "learning_rate": 8.136214303042834e-06, "loss": 0.175, "step": 8607 }, { "epoch": 1.7421574580044525, "grad_norm": 0.27830713987350464, "learning_rate": 8.123651924174158e-06, "loss": 0.1795, "step": 8608 }, { "epoch": 1.742359846184983, "grad_norm": 0.302734375, "learning_rate": 8.111098840390341e-06, "loss": 0.2105, "step": 8609 }, { "epoch": 1.7425622343655132, "grad_norm": 0.28851422667503357, "learning_rate": 8.098555052961354e-06, "loss": 0.1874, "step": 8610 }, { "epoch": 1.7427646225460434, "grad_norm": 0.2846405804157257, "learning_rate": 8.08602056315626e-06, "loss": 0.1796, "step": 8611 }, { "epoch": 1.7429670107265736, "grad_norm": 0.2728175222873688, "learning_rate": 8.073495372243156e-06, "loss": 0.1574, "step": 8612 }, { "epoch": 1.7431693989071038, "grad_norm": 0.30502745509147644, "learning_rate": 8.060979481489228e-06, "loss": 0.1851, "step": 8613 }, { "epoch": 1.743371787087634, "grad_norm": 0.2595095932483673, "learning_rate": 8.048472892160685e-06, "loss": 0.1707, "step": 8614 }, { "epoch": 1.7435741752681644, "grad_norm": 0.3026995062828064, "learning_rate": 8.035975605522816e-06, "loss": 0.1626, "step": 8615 }, { "epoch": 1.7437765634486946, "grad_norm": 0.3065376579761505, "learning_rate": 8.023487622839975e-06, "loss": 0.2179, "step": 8616 }, { "epoch": 1.743978951629225, "grad_norm": 0.34945711493492126, "learning_rate": 8.011008945375553e-06, "loss": 0.1956, "step": 8617 }, { "epoch": 1.7441813398097552, "grad_norm": 0.3083902597427368, "learning_rate": 7.998539574392017e-06, "loss": 0.2114, "step": 8618 }, { "epoch": 1.7443837279902854, "grad_norm": 0.31957361102104187, "learning_rate": 7.986079511150879e-06, "loss": 0.2174, "step": 8619 }, { "epoch": 1.7445861161708156, "grad_norm": 0.2879070043563843, "learning_rate": 7.973628756912732e-06, "loss": 0.2101, "step": 8620 }, { "epoch": 
1.7447885043513458, "grad_norm": 0.2841523289680481, "learning_rate": 7.961187312937202e-06, "loss": 0.1844, "step": 8621 }, { "epoch": 1.744990892531876, "grad_norm": 0.2686194181442261, "learning_rate": 7.948755180482991e-06, "loss": 0.1757, "step": 8622 }, { "epoch": 1.7451932807124064, "grad_norm": 0.4323246479034424, "learning_rate": 7.936332360807853e-06, "loss": 0.1904, "step": 8623 }, { "epoch": 1.7453956688929366, "grad_norm": 0.29777491092681885, "learning_rate": 7.923918855168588e-06, "loss": 0.1798, "step": 8624 }, { "epoch": 1.745598057073467, "grad_norm": 0.2588173747062683, "learning_rate": 7.911514664821073e-06, "loss": 0.1872, "step": 8625 }, { "epoch": 1.7458004452539972, "grad_norm": 0.24862785637378693, "learning_rate": 7.899119791020226e-06, "loss": 0.1694, "step": 8626 }, { "epoch": 1.7460028334345274, "grad_norm": 0.2731352746486664, "learning_rate": 7.886734235020033e-06, "loss": 0.163, "step": 8627 }, { "epoch": 1.7462052216150576, "grad_norm": 0.3019119203090668, "learning_rate": 7.874357998073544e-06, "loss": 0.1828, "step": 8628 }, { "epoch": 1.7464076097955878, "grad_norm": 0.29366788268089294, "learning_rate": 7.86199108143284e-06, "loss": 0.1737, "step": 8629 }, { "epoch": 1.746609997976118, "grad_norm": 0.27854087948799133, "learning_rate": 7.849633486349095e-06, "loss": 0.1978, "step": 8630 }, { "epoch": 1.7468123861566485, "grad_norm": 0.3060019016265869, "learning_rate": 7.83728521407251e-06, "loss": 0.1997, "step": 8631 }, { "epoch": 1.7470147743371787, "grad_norm": 0.29798224568367004, "learning_rate": 7.82494626585235e-06, "loss": 0.213, "step": 8632 }, { "epoch": 1.747217162517709, "grad_norm": 0.2958841621875763, "learning_rate": 7.812616642936943e-06, "loss": 0.216, "step": 8633 }, { "epoch": 1.7474195506982393, "grad_norm": 0.3002054989337921, "learning_rate": 7.800296346573677e-06, "loss": 0.1936, "step": 8634 }, { "epoch": 1.7476219388787695, "grad_norm": 0.3035990297794342, "learning_rate": 7.787985378008988e-06, "loss": 
0.188, "step": 8635 }, { "epoch": 1.7478243270592997, "grad_norm": 0.29013994336128235, "learning_rate": 7.775683738488371e-06, "loss": 0.1609, "step": 8636 }, { "epoch": 1.7480267152398299, "grad_norm": 0.3334468901157379, "learning_rate": 7.763391429256373e-06, "loss": 0.1858, "step": 8637 }, { "epoch": 1.74822910342036, "grad_norm": 0.2830390930175781, "learning_rate": 7.7511084515566e-06, "loss": 0.1803, "step": 8638 }, { "epoch": 1.7484314916008905, "grad_norm": 0.2534598410129547, "learning_rate": 7.738834806631711e-06, "loss": 0.1873, "step": 8639 }, { "epoch": 1.748633879781421, "grad_norm": 0.25773561000823975, "learning_rate": 7.72657049572344e-06, "loss": 0.1664, "step": 8640 }, { "epoch": 1.7488362679619511, "grad_norm": 0.2659550905227661, "learning_rate": 7.714315520072545e-06, "loss": 0.1795, "step": 8641 }, { "epoch": 1.7490386561424813, "grad_norm": 0.2660394012928009, "learning_rate": 7.702069880918872e-06, "loss": 0.1714, "step": 8642 }, { "epoch": 1.7492410443230115, "grad_norm": 0.22891388833522797, "learning_rate": 7.689833579501293e-06, "loss": 0.171, "step": 8643 }, { "epoch": 1.7494434325035417, "grad_norm": 0.2488303780555725, "learning_rate": 7.677606617057743e-06, "loss": 0.1725, "step": 8644 }, { "epoch": 1.749645820684072, "grad_norm": 0.29489654302597046, "learning_rate": 7.66538899482524e-06, "loss": 0.2015, "step": 8645 }, { "epoch": 1.7498482088646024, "grad_norm": 0.2691362500190735, "learning_rate": 7.65318071403982e-06, "loss": 0.1774, "step": 8646 }, { "epoch": 1.7500505970451325, "grad_norm": 0.33754125237464905, "learning_rate": 7.640981775936595e-06, "loss": 0.2412, "step": 8647 }, { "epoch": 1.750252985225663, "grad_norm": 0.27895069122314453, "learning_rate": 7.628792181749711e-06, "loss": 0.222, "step": 8648 }, { "epoch": 1.7504553734061932, "grad_norm": 0.31739845871925354, "learning_rate": 7.616611932712403e-06, "loss": 0.1908, "step": 8649 }, { "epoch": 1.7506577615867234, "grad_norm": 0.28091490268707275, 
"learning_rate": 7.604441030056941e-06, "loss": 0.1811, "step": 8650 }, { "epoch": 1.7506577615867234, "eval_loss": 0.2593732178211212, "eval_runtime": 0.7369, "eval_samples_per_second": 6.785, "eval_steps_per_second": 1.357, "step": 8650 }, { "epoch": 1.7508601497672536, "grad_norm": 0.2957996428012848, "learning_rate": 7.5922794750146294e-06, "loss": 0.2139, "step": 8651 }, { "epoch": 1.7510625379477838, "grad_norm": 0.349030464887619, "learning_rate": 7.580127268815862e-06, "loss": 0.2125, "step": 8652 }, { "epoch": 1.751264926128314, "grad_norm": 0.24564795196056366, "learning_rate": 7.567984412690055e-06, "loss": 0.1514, "step": 8653 }, { "epoch": 1.7514673143088444, "grad_norm": 0.28310883045196533, "learning_rate": 7.555850907865713e-06, "loss": 0.2011, "step": 8654 }, { "epoch": 1.7516697024893746, "grad_norm": 0.3396984338760376, "learning_rate": 7.543726755570368e-06, "loss": 0.2358, "step": 8655 }, { "epoch": 1.751872090669905, "grad_norm": 0.32286304235458374, "learning_rate": 7.531611957030626e-06, "loss": 0.2065, "step": 8656 }, { "epoch": 1.7520744788504352, "grad_norm": 0.29195284843444824, "learning_rate": 7.519506513472118e-06, "loss": 0.2033, "step": 8657 }, { "epoch": 1.7522768670309654, "grad_norm": 0.2819693386554718, "learning_rate": 7.507410426119554e-06, "loss": 0.1663, "step": 8658 }, { "epoch": 1.7524792552114956, "grad_norm": 0.28465232253074646, "learning_rate": 7.4953236961966874e-06, "loss": 0.192, "step": 8659 }, { "epoch": 1.7526816433920258, "grad_norm": 0.2673487365245819, "learning_rate": 7.48324632492634e-06, "loss": 0.1682, "step": 8660 }, { "epoch": 1.752884031572556, "grad_norm": 0.30271047353744507, "learning_rate": 7.471178313530347e-06, "loss": 0.1794, "step": 8661 }, { "epoch": 1.7530864197530864, "grad_norm": 0.2480824738740921, "learning_rate": 7.459119663229652e-06, "loss": 0.1852, "step": 8662 }, { "epoch": 1.7532888079336166, "grad_norm": 0.2820073962211609, "learning_rate": 7.447070375244203e-06, "loss": 0.1962, 
"step": 8663 }, { "epoch": 1.753491196114147, "grad_norm": 0.2992934584617615, "learning_rate": 7.435030450793024e-06, "loss": 0.1752, "step": 8664 }, { "epoch": 1.7536935842946773, "grad_norm": 0.299672931432724, "learning_rate": 7.422999891094196e-06, "loss": 0.2217, "step": 8665 }, { "epoch": 1.7538959724752075, "grad_norm": 0.26297396421432495, "learning_rate": 7.410978697364834e-06, "loss": 0.1846, "step": 8666 }, { "epoch": 1.7540983606557377, "grad_norm": 0.42981722950935364, "learning_rate": 7.398966870821122e-06, "loss": 0.1864, "step": 8667 }, { "epoch": 1.7543007488362679, "grad_norm": 0.33034002780914307, "learning_rate": 7.386964412678299e-06, "loss": 0.2026, "step": 8668 }, { "epoch": 1.754503137016798, "grad_norm": 0.33484843373298645, "learning_rate": 7.374971324150637e-06, "loss": 0.1514, "step": 8669 }, { "epoch": 1.7547055251973285, "grad_norm": 0.31231656670570374, "learning_rate": 7.362987606451466e-06, "loss": 0.1844, "step": 8670 }, { "epoch": 1.754907913377859, "grad_norm": 0.28507134318351746, "learning_rate": 7.351013260793183e-06, "loss": 0.1737, "step": 8671 }, { "epoch": 1.755110301558389, "grad_norm": 0.31598010659217834, "learning_rate": 7.339048288387229e-06, "loss": 0.2043, "step": 8672 }, { "epoch": 1.7553126897389193, "grad_norm": 0.2558382749557495, "learning_rate": 7.327092690444082e-06, "loss": 0.166, "step": 8673 }, { "epoch": 1.7555150779194495, "grad_norm": 0.33906546235084534, "learning_rate": 7.315146468173295e-06, "loss": 0.1969, "step": 8674 }, { "epoch": 1.7557174660999797, "grad_norm": 0.2782249450683594, "learning_rate": 7.303209622783446e-06, "loss": 0.1898, "step": 8675 }, { "epoch": 1.75591985428051, "grad_norm": 0.26307907700538635, "learning_rate": 7.2912821554822046e-06, "loss": 0.182, "step": 8676 }, { "epoch": 1.7561222424610403, "grad_norm": 0.2771715223789215, "learning_rate": 7.279364067476246e-06, "loss": 0.1882, "step": 8677 }, { "epoch": 1.7563246306415705, "grad_norm": 0.2941603362560272, 
"learning_rate": 7.2674553599713315e-06, "loss": 0.1756, "step": 8678 }, { "epoch": 1.756527018822101, "grad_norm": 0.23980875313282013, "learning_rate": 7.25555603417224e-06, "loss": 0.1701, "step": 8679 }, { "epoch": 1.7567294070026311, "grad_norm": 0.30672362446784973, "learning_rate": 7.243666091282841e-06, "loss": 0.2077, "step": 8680 }, { "epoch": 1.7569317951831613, "grad_norm": 0.29822811484336853, "learning_rate": 7.231785532506031e-06, "loss": 0.1857, "step": 8681 }, { "epoch": 1.7571341833636915, "grad_norm": 0.2909514904022217, "learning_rate": 7.219914359043744e-06, "loss": 0.1987, "step": 8682 }, { "epoch": 1.7573365715442217, "grad_norm": 0.2535093128681183, "learning_rate": 7.208052572097001e-06, "loss": 0.1722, "step": 8683 }, { "epoch": 1.757538959724752, "grad_norm": 0.28953081369400024, "learning_rate": 7.196200172865841e-06, "loss": 0.1771, "step": 8684 }, { "epoch": 1.7577413479052824, "grad_norm": 0.2795270085334778, "learning_rate": 7.18435716254936e-06, "loss": 0.1846, "step": 8685 }, { "epoch": 1.7579437360858126, "grad_norm": 0.3224479854106903, "learning_rate": 7.172523542345733e-06, "loss": 0.219, "step": 8686 }, { "epoch": 1.758146124266343, "grad_norm": 0.28203216195106506, "learning_rate": 7.160699313452135e-06, "loss": 0.1738, "step": 8687 }, { "epoch": 1.7583485124468732, "grad_norm": 0.2619762718677521, "learning_rate": 7.1488844770648325e-06, "loss": 0.1747, "step": 8688 }, { "epoch": 1.7585509006274034, "grad_norm": 0.2592688202857971, "learning_rate": 7.137079034379124e-06, "loss": 0.1529, "step": 8689 }, { "epoch": 1.7587532888079336, "grad_norm": 0.31665951013565063, "learning_rate": 7.125282986589355e-06, "loss": 0.1954, "step": 8690 }, { "epoch": 1.7589556769884638, "grad_norm": 0.26507389545440674, "learning_rate": 7.113496334888936e-06, "loss": 0.1584, "step": 8691 }, { "epoch": 1.759158065168994, "grad_norm": 0.3036040961742401, "learning_rate": 7.101719080470304e-06, "loss": 0.1909, "step": 8692 }, { "epoch": 
1.7593604533495244, "grad_norm": 0.2305152714252472, "learning_rate": 7.089951224524971e-06, "loss": 0.1737, "step": 8693 }, { "epoch": 1.7595628415300546, "grad_norm": 0.2953859567642212, "learning_rate": 7.078192768243486e-06, "loss": 0.1938, "step": 8694 }, { "epoch": 1.759765229710585, "grad_norm": 0.28796306252479553, "learning_rate": 7.066443712815429e-06, "loss": 0.1981, "step": 8695 }, { "epoch": 1.7599676178911152, "grad_norm": 0.25642630457878113, "learning_rate": 7.054704059429463e-06, "loss": 0.181, "step": 8696 }, { "epoch": 1.7601700060716454, "grad_norm": 0.28918367624282837, "learning_rate": 7.0429738092732676e-06, "loss": 0.1828, "step": 8697 }, { "epoch": 1.7603723942521756, "grad_norm": 0.3338787853717804, "learning_rate": 7.0312529635335965e-06, "loss": 0.1814, "step": 8698 }, { "epoch": 1.7605747824327058, "grad_norm": 0.24299395084381104, "learning_rate": 7.019541523396245e-06, "loss": 0.139, "step": 8699 }, { "epoch": 1.7607771706132362, "grad_norm": 0.27827152609825134, "learning_rate": 7.007839490046042e-06, "loss": 0.1963, "step": 8700 }, { "epoch": 1.7607771706132362, "eval_loss": 0.2589094042778015, "eval_runtime": 0.7364, "eval_samples_per_second": 6.789, "eval_steps_per_second": 1.358, "step": 8700 }, { "epoch": 1.7609795587937664, "grad_norm": 0.2616543769836426, "learning_rate": 6.9961468646668855e-06, "loss": 0.1759, "step": 8701 }, { "epoch": 1.7611819469742969, "grad_norm": 0.27737903594970703, "learning_rate": 6.984463648441719e-06, "loss": 0.2049, "step": 8702 }, { "epoch": 1.761384335154827, "grad_norm": 0.2969575524330139, "learning_rate": 6.9727898425525185e-06, "loss": 0.2156, "step": 8703 }, { "epoch": 1.7615867233353573, "grad_norm": 0.31687629222869873, "learning_rate": 6.9611254481803055e-06, "loss": 0.2173, "step": 8704 }, { "epoch": 1.7617891115158875, "grad_norm": 0.22732824087142944, "learning_rate": 6.949470466505181e-06, "loss": 0.1607, "step": 8705 }, { "epoch": 1.7619914996964177, "grad_norm": 
0.28792738914489746, "learning_rate": 6.937824898706269e-06, "loss": 0.1731, "step": 8706 }, { "epoch": 1.7621938878769479, "grad_norm": 0.28398075699806213, "learning_rate": 6.926188745961748e-06, "loss": 0.1807, "step": 8707 }, { "epoch": 1.7623962760574783, "grad_norm": 0.24871675670146942, "learning_rate": 6.914562009448833e-06, "loss": 0.1634, "step": 8708 }, { "epoch": 1.7625986642380085, "grad_norm": 0.2863733470439911, "learning_rate": 6.9029446903437934e-06, "loss": 0.1672, "step": 8709 }, { "epoch": 1.762801052418539, "grad_norm": 0.28869694471359253, "learning_rate": 6.8913367898219565e-06, "loss": 0.1901, "step": 8710 }, { "epoch": 1.763003440599069, "grad_norm": 0.28518444299697876, "learning_rate": 6.879738309057693e-06, "loss": 0.1773, "step": 8711 }, { "epoch": 1.7632058287795993, "grad_norm": 0.27658623456954956, "learning_rate": 6.868149249224398e-06, "loss": 0.1697, "step": 8712 }, { "epoch": 1.7634082169601295, "grad_norm": 0.2947976887226105, "learning_rate": 6.856569611494546e-06, "loss": 0.2189, "step": 8713 }, { "epoch": 1.7636106051406597, "grad_norm": 0.3152853846549988, "learning_rate": 6.844999397039631e-06, "loss": 0.2387, "step": 8714 }, { "epoch": 1.76381299332119, "grad_norm": 0.27520832419395447, "learning_rate": 6.833438607030218e-06, "loss": 0.1728, "step": 8715 }, { "epoch": 1.7640153815017203, "grad_norm": 0.3301842510700226, "learning_rate": 6.821887242635905e-06, "loss": 0.1953, "step": 8716 }, { "epoch": 1.7642177696822505, "grad_norm": 0.2489853948354721, "learning_rate": 6.810345305025345e-06, "loss": 0.1564, "step": 8717 }, { "epoch": 1.764420157862781, "grad_norm": 0.2562078833580017, "learning_rate": 6.798812795366205e-06, "loss": 0.1522, "step": 8718 }, { "epoch": 1.7646225460433111, "grad_norm": 0.2957938015460968, "learning_rate": 6.7872897148252294e-06, "loss": 0.1752, "step": 8719 }, { "epoch": 1.7648249342238413, "grad_norm": 0.2993956208229065, "learning_rate": 6.775776064568218e-06, "loss": 0.1718, "step": 8720 
}, { "epoch": 1.7650273224043715, "grad_norm": 0.45438745617866516, "learning_rate": 6.764271845759995e-06, "loss": 0.1941, "step": 8721 }, { "epoch": 1.7652297105849017, "grad_norm": 0.2617250978946686, "learning_rate": 6.75277705956443e-06, "loss": 0.1881, "step": 8722 }, { "epoch": 1.765432098765432, "grad_norm": 0.31186604499816895, "learning_rate": 6.74129170714446e-06, "loss": 0.1943, "step": 8723 }, { "epoch": 1.7656344869459624, "grad_norm": 0.27825167775154114, "learning_rate": 6.729815789662031e-06, "loss": 0.162, "step": 8724 }, { "epoch": 1.7658368751264926, "grad_norm": 0.3025761842727661, "learning_rate": 6.718349308278171e-06, "loss": 0.2153, "step": 8725 }, { "epoch": 1.766039263307023, "grad_norm": 0.3043777048587799, "learning_rate": 6.70689226415292e-06, "loss": 0.183, "step": 8726 }, { "epoch": 1.7662416514875532, "grad_norm": 0.25967058539390564, "learning_rate": 6.695444658445416e-06, "loss": 0.1866, "step": 8727 }, { "epoch": 1.7664440396680834, "grad_norm": 0.34056445956230164, "learning_rate": 6.684006492313788e-06, "loss": 0.2396, "step": 8728 }, { "epoch": 1.7666464278486136, "grad_norm": 0.2622250020503998, "learning_rate": 6.672577766915222e-06, "loss": 0.1957, "step": 8729 }, { "epoch": 1.7668488160291438, "grad_norm": 0.3250180184841156, "learning_rate": 6.661158483405971e-06, "loss": 0.2404, "step": 8730 }, { "epoch": 1.7670512042096742, "grad_norm": 0.3211231529712677, "learning_rate": 6.6497486429413e-06, "loss": 0.2132, "step": 8731 }, { "epoch": 1.7672535923902044, "grad_norm": 0.29395386576652527, "learning_rate": 6.638348246675563e-06, "loss": 0.1855, "step": 8732 }, { "epoch": 1.7674559805707348, "grad_norm": 0.26442763209342957, "learning_rate": 6.626957295762115e-06, "loss": 0.1919, "step": 8733 }, { "epoch": 1.767658368751265, "grad_norm": 0.32076603174209595, "learning_rate": 6.6155757913533675e-06, "loss": 0.1912, "step": 8734 }, { "epoch": 1.7678607569317952, "grad_norm": 0.25481972098350525, "learning_rate": 
6.604203734600789e-06, "loss": 0.1663, "step": 8735 }, { "epoch": 1.7680631451123254, "grad_norm": 0.260008305311203, "learning_rate": 6.592841126654892e-06, "loss": 0.164, "step": 8736 }, { "epoch": 1.7682655332928556, "grad_norm": 0.2986699342727661, "learning_rate": 6.581487968665223e-06, "loss": 0.2153, "step": 8737 }, { "epoch": 1.7684679214733858, "grad_norm": 0.2634279429912567, "learning_rate": 6.570144261780364e-06, "loss": 0.1995, "step": 8738 }, { "epoch": 1.7686703096539163, "grad_norm": 0.334778368473053, "learning_rate": 6.558810007147986e-06, "loss": 0.2261, "step": 8739 }, { "epoch": 1.7688726978344465, "grad_norm": 0.2399263083934784, "learning_rate": 6.547485205914716e-06, "loss": 0.1512, "step": 8740 }, { "epoch": 1.7690750860149769, "grad_norm": 0.28150245547294617, "learning_rate": 6.536169859226316e-06, "loss": 0.2154, "step": 8741 }, { "epoch": 1.769277474195507, "grad_norm": 0.3357497453689575, "learning_rate": 6.524863968227535e-06, "loss": 0.2347, "step": 8742 }, { "epoch": 1.7694798623760373, "grad_norm": 0.3222461938858032, "learning_rate": 6.5135675340622035e-06, "loss": 0.1789, "step": 8743 }, { "epoch": 1.7696822505565675, "grad_norm": 0.27976444363594055, "learning_rate": 6.502280557873164e-06, "loss": 0.1678, "step": 8744 }, { "epoch": 1.7698846387370977, "grad_norm": 0.30489134788513184, "learning_rate": 6.491003040802323e-06, "loss": 0.1747, "step": 8745 }, { "epoch": 1.7700870269176279, "grad_norm": 0.28980520367622375, "learning_rate": 6.479734983990604e-06, "loss": 0.1881, "step": 8746 }, { "epoch": 1.7702894150981583, "grad_norm": 0.2970348596572876, "learning_rate": 6.468476388578016e-06, "loss": 0.2024, "step": 8747 }, { "epoch": 1.7704918032786885, "grad_norm": 0.2925608158111572, "learning_rate": 6.4572272557035575e-06, "loss": 0.1726, "step": 8748 }, { "epoch": 1.770694191459219, "grad_norm": 0.3097609281539917, "learning_rate": 6.44598758650532e-06, "loss": 0.2162, "step": 8749 }, { "epoch": 1.7708965796397491, 
"grad_norm": 0.30766546726226807, "learning_rate": 6.4347573821204044e-06, "loss": 0.2104, "step": 8750 }, { "epoch": 1.7708965796397491, "eval_loss": 0.2586788237094879, "eval_runtime": 0.7386, "eval_samples_per_second": 6.77, "eval_steps_per_second": 1.354, "step": 8750 }, { "epoch": 1.7710989678202793, "grad_norm": 0.28390100598335266, "learning_rate": 6.423536643684969e-06, "loss": 0.1976, "step": 8751 }, { "epoch": 1.7713013560008095, "grad_norm": 0.2914179265499115, "learning_rate": 6.412325372334216e-06, "loss": 0.2039, "step": 8752 }, { "epoch": 1.7715037441813397, "grad_norm": 0.30227139592170715, "learning_rate": 6.401123569202372e-06, "loss": 0.2108, "step": 8753 }, { "epoch": 1.77170613236187, "grad_norm": 0.29538655281066895, "learning_rate": 6.389931235422719e-06, "loss": 0.218, "step": 8754 }, { "epoch": 1.7719085205424003, "grad_norm": 0.3247475326061249, "learning_rate": 6.378748372127585e-06, "loss": 0.2053, "step": 8755 }, { "epoch": 1.7721109087229305, "grad_norm": 0.2719709575176239, "learning_rate": 6.367574980448343e-06, "loss": 0.1944, "step": 8756 }, { "epoch": 1.772313296903461, "grad_norm": 0.24502572417259216, "learning_rate": 6.356411061515377e-06, "loss": 0.1592, "step": 8757 }, { "epoch": 1.7725156850839912, "grad_norm": 0.26764917373657227, "learning_rate": 6.345256616458162e-06, "loss": 0.1708, "step": 8758 }, { "epoch": 1.7727180732645214, "grad_norm": 0.2626522183418274, "learning_rate": 6.334111646405172e-06, "loss": 0.1885, "step": 8759 }, { "epoch": 1.7729204614450516, "grad_norm": 0.3055856227874756, "learning_rate": 6.322976152483928e-06, "loss": 0.1749, "step": 8760 }, { "epoch": 1.7731228496255818, "grad_norm": 0.25192734599113464, "learning_rate": 6.311850135821052e-06, "loss": 0.1478, "step": 8761 }, { "epoch": 1.7733252378061122, "grad_norm": 0.2746264934539795, "learning_rate": 6.300733597542085e-06, "loss": 0.1972, "step": 8762 }, { "epoch": 1.7735276259866424, "grad_norm": 0.24976009130477905, "learning_rate": 
6.289626538771731e-06, "loss": 0.1871, "step": 8763 }, { "epoch": 1.7737300141671728, "grad_norm": 0.29063618183135986, "learning_rate": 6.278528960633667e-06, "loss": 0.1936, "step": 8764 }, { "epoch": 1.773932402347703, "grad_norm": 0.27964475750923157, "learning_rate": 6.267440864250629e-06, "loss": 0.1969, "step": 8765 }, { "epoch": 1.7741347905282332, "grad_norm": 0.3473265767097473, "learning_rate": 6.256362250744407e-06, "loss": 0.2297, "step": 8766 }, { "epoch": 1.7743371787087634, "grad_norm": 0.28343695402145386, "learning_rate": 6.2452931212358064e-06, "loss": 0.1899, "step": 8767 }, { "epoch": 1.7745395668892936, "grad_norm": 0.2353745698928833, "learning_rate": 6.234233476844697e-06, "loss": 0.1692, "step": 8768 }, { "epoch": 1.7747419550698238, "grad_norm": 0.2326369434595108, "learning_rate": 6.223183318689973e-06, "loss": 0.1342, "step": 8769 }, { "epoch": 1.7749443432503542, "grad_norm": 0.29546186327934265, "learning_rate": 6.212142647889563e-06, "loss": 0.2019, "step": 8770 }, { "epoch": 1.7751467314308844, "grad_norm": 0.32325753569602966, "learning_rate": 6.201111465560461e-06, "loss": 0.2141, "step": 8771 }, { "epoch": 1.7753491196114148, "grad_norm": 0.275481253862381, "learning_rate": 6.190089772818674e-06, "loss": 0.1821, "step": 8772 }, { "epoch": 1.775551507791945, "grad_norm": 0.29665717482566833, "learning_rate": 6.179077570779279e-06, "loss": 0.1749, "step": 8773 }, { "epoch": 1.7757538959724752, "grad_norm": 0.3191758990287781, "learning_rate": 6.168074860556361e-06, "loss": 0.1951, "step": 8774 }, { "epoch": 1.7759562841530054, "grad_norm": 0.24474631249904633, "learning_rate": 6.1570816432630515e-06, "loss": 0.1552, "step": 8775 }, { "epoch": 1.7761586723335356, "grad_norm": 0.25271105766296387, "learning_rate": 6.1460979200115505e-06, "loss": 0.1534, "step": 8776 }, { "epoch": 1.7763610605140658, "grad_norm": 0.31084853410720825, "learning_rate": 6.135123691913059e-06, "loss": 0.2258, "step": 8777 }, { "epoch": 1.7765634486945963, 
"grad_norm": 0.2757551968097687, "learning_rate": 6.12415896007783e-06, "loss": 0.1842, "step": 8778 }, { "epoch": 1.7767658368751265, "grad_norm": 0.24306604266166687, "learning_rate": 6.11320372561518e-06, "loss": 0.1484, "step": 8779 }, { "epoch": 1.7769682250556569, "grad_norm": 0.27842000126838684, "learning_rate": 6.102257989633431e-06, "loss": 0.1912, "step": 8780 }, { "epoch": 1.777170613236187, "grad_norm": 0.28890183568000793, "learning_rate": 6.0913217532399645e-06, "loss": 0.2146, "step": 8781 }, { "epoch": 1.7773730014167173, "grad_norm": 0.28030017018318176, "learning_rate": 6.080395017541185e-06, "loss": 0.1858, "step": 8782 }, { "epoch": 1.7775753895972475, "grad_norm": 0.271921843290329, "learning_rate": 6.069477783642563e-06, "loss": 0.1861, "step": 8783 }, { "epoch": 1.7777777777777777, "grad_norm": 0.2917061150074005, "learning_rate": 6.05857005264856e-06, "loss": 0.204, "step": 8784 }, { "epoch": 1.7779801659583079, "grad_norm": 0.24218259751796722, "learning_rate": 6.0476718256627375e-06, "loss": 0.1711, "step": 8785 }, { "epoch": 1.7781825541388383, "grad_norm": 0.2720330059528351, "learning_rate": 6.036783103787635e-06, "loss": 0.2027, "step": 8786 }, { "epoch": 1.7783849423193685, "grad_norm": 0.2660661041736603, "learning_rate": 6.025903888124884e-06, "loss": 0.1892, "step": 8787 }, { "epoch": 1.778587330499899, "grad_norm": 0.4043765962123871, "learning_rate": 6.015034179775114e-06, "loss": 0.1995, "step": 8788 }, { "epoch": 1.7787897186804291, "grad_norm": 0.28639769554138184, "learning_rate": 6.004173979838013e-06, "loss": 0.1876, "step": 8789 }, { "epoch": 1.7789921068609593, "grad_norm": 0.23483648896217346, "learning_rate": 5.9933232894123e-06, "loss": 0.1881, "step": 8790 }, { "epoch": 1.7791944950414895, "grad_norm": 0.2661982476711273, "learning_rate": 5.982482109595744e-06, "loss": 0.1786, "step": 8791 }, { "epoch": 1.7793968832220197, "grad_norm": 0.27296239137649536, "learning_rate": 5.971650441485121e-06, "loss": 0.1766, 
"step": 8792 }, { "epoch": 1.7795992714025501, "grad_norm": 0.30432015657424927, "learning_rate": 5.96082828617629e-06, "loss": 0.2276, "step": 8793 }, { "epoch": 1.7798016595830803, "grad_norm": 0.3130051791667938, "learning_rate": 5.950015644764106e-06, "loss": 0.2133, "step": 8794 }, { "epoch": 1.7800040477636108, "grad_norm": 0.27052175998687744, "learning_rate": 5.939212518342485e-06, "loss": 0.1942, "step": 8795 }, { "epoch": 1.780206435944141, "grad_norm": 0.255877822637558, "learning_rate": 5.9284189080043625e-06, "loss": 0.175, "step": 8796 }, { "epoch": 1.7804088241246712, "grad_norm": 0.2689751088619232, "learning_rate": 5.917634814841743e-06, "loss": 0.1921, "step": 8797 }, { "epoch": 1.7806112123052014, "grad_norm": 0.24834966659545898, "learning_rate": 5.906860239945644e-06, "loss": 0.1492, "step": 8798 }, { "epoch": 1.7808136004857316, "grad_norm": 0.28980231285095215, "learning_rate": 5.896095184406103e-06, "loss": 0.1657, "step": 8799 }, { "epoch": 1.7810159886662618, "grad_norm": 0.2986690402030945, "learning_rate": 5.885339649312238e-06, "loss": 0.2066, "step": 8800 }, { "epoch": 1.7810159886662618, "eval_loss": 0.2586442232131958, "eval_runtime": 0.7377, "eval_samples_per_second": 6.778, "eval_steps_per_second": 1.356, "step": 8800 }, { "epoch": 1.7812183768467922, "grad_norm": 0.23212267458438873, "learning_rate": 5.874593635752179e-06, "loss": 0.1668, "step": 8801 }, { "epoch": 1.7814207650273224, "grad_norm": 0.31673598289489746, "learning_rate": 5.863857144813078e-06, "loss": 0.219, "step": 8802 }, { "epoch": 1.7816231532078528, "grad_norm": 0.3033224642276764, "learning_rate": 5.8531301775811565e-06, "loss": 0.1996, "step": 8803 }, { "epoch": 1.781825541388383, "grad_norm": 0.24953462183475494, "learning_rate": 5.8424127351416556e-06, "loss": 0.1833, "step": 8804 }, { "epoch": 1.7820279295689132, "grad_norm": 0.25415509939193726, "learning_rate": 5.831704818578843e-06, "loss": 0.2136, "step": 8805 }, { "epoch": 1.7822303177494434, 
"grad_norm": 0.282206267118454, "learning_rate": 5.821006428976061e-06, "loss": 0.2218, "step": 8806 }, { "epoch": 1.7824327059299736, "grad_norm": 0.25228193402290344, "learning_rate": 5.810317567415624e-06, "loss": 0.1884, "step": 8807 }, { "epoch": 1.7826350941105038, "grad_norm": 0.2990221083164215, "learning_rate": 5.799638234978933e-06, "loss": 0.1995, "step": 8808 }, { "epoch": 1.7828374822910342, "grad_norm": 0.2889970541000366, "learning_rate": 5.788968432746411e-06, "loss": 0.2128, "step": 8809 }, { "epoch": 1.7830398704715644, "grad_norm": 0.2746729254722595, "learning_rate": 5.7783081617975184e-06, "loss": 0.1643, "step": 8810 }, { "epoch": 1.7832422586520948, "grad_norm": 0.3056463897228241, "learning_rate": 5.767657423210749e-06, "loss": 0.2146, "step": 8811 }, { "epoch": 1.783444646832625, "grad_norm": 0.26303210854530334, "learning_rate": 5.757016218063638e-06, "loss": 0.1719, "step": 8812 }, { "epoch": 1.7836470350131552, "grad_norm": 0.23666363954544067, "learning_rate": 5.746384547432737e-06, "loss": 0.1649, "step": 8813 }, { "epoch": 1.7838494231936854, "grad_norm": 0.27246472239494324, "learning_rate": 5.7357624123936635e-06, "loss": 0.172, "step": 8814 }, { "epoch": 1.7840518113742156, "grad_norm": 0.30931583046913147, "learning_rate": 5.725149814021036e-06, "loss": 0.2071, "step": 8815 }, { "epoch": 1.7842541995547458, "grad_norm": 0.2522447109222412, "learning_rate": 5.714546753388539e-06, "loss": 0.1752, "step": 8816 }, { "epoch": 1.7844565877352763, "grad_norm": 0.31273213028907776, "learning_rate": 5.703953231568881e-06, "loss": 0.203, "step": 8817 }, { "epoch": 1.7846589759158065, "grad_norm": 0.30619895458221436, "learning_rate": 5.693369249633795e-06, "loss": 0.1848, "step": 8818 }, { "epoch": 1.7848613640963369, "grad_norm": 0.27686363458633423, "learning_rate": 5.6827948086540575e-06, "loss": 0.2046, "step": 8819 }, { "epoch": 1.785063752276867, "grad_norm": 0.2953948974609375, "learning_rate": 5.672229909699489e-06, "loss": 0.1875, 
"step": 8820 }, { "epoch": 1.7852661404573973, "grad_norm": 0.3067280948162079, "learning_rate": 5.661674553838925e-06, "loss": 0.1811, "step": 8821 }, { "epoch": 1.7854685286379275, "grad_norm": 0.2993681728839874, "learning_rate": 5.6511287421402435e-06, "loss": 0.1772, "step": 8822 }, { "epoch": 1.7856709168184577, "grad_norm": 0.28528183698654175, "learning_rate": 5.6405924756703696e-06, "loss": 0.1876, "step": 8823 }, { "epoch": 1.785873304998988, "grad_norm": 0.28923770785331726, "learning_rate": 5.63006575549524e-06, "loss": 0.1872, "step": 8824 }, { "epoch": 1.7860756931795183, "grad_norm": 0.32765883207321167, "learning_rate": 5.619548582679857e-06, "loss": 0.1737, "step": 8825 }, { "epoch": 1.7862780813600487, "grad_norm": 0.24244408309459686, "learning_rate": 5.6090409582882145e-06, "loss": 0.1515, "step": 8826 }, { "epoch": 1.786480469540579, "grad_norm": 0.26799750328063965, "learning_rate": 5.5985428833833846e-06, "loss": 0.1791, "step": 8827 }, { "epoch": 1.7866828577211091, "grad_norm": 0.22886481881141663, "learning_rate": 5.588054359027439e-06, "loss": 0.1426, "step": 8828 }, { "epoch": 1.7868852459016393, "grad_norm": 0.27654311060905457, "learning_rate": 5.577575386281497e-06, "loss": 0.1953, "step": 8829 }, { "epoch": 1.7870876340821695, "grad_norm": 0.2745510935783386, "learning_rate": 5.56710596620571e-06, "loss": 0.1708, "step": 8830 }, { "epoch": 1.7872900222626997, "grad_norm": 0.25635775923728943, "learning_rate": 5.556646099859275e-06, "loss": 0.184, "step": 8831 }, { "epoch": 1.7874924104432302, "grad_norm": 0.26283639669418335, "learning_rate": 5.546195788300401e-06, "loss": 0.1622, "step": 8832 }, { "epoch": 1.7876947986237604, "grad_norm": 0.2936389744281769, "learning_rate": 5.535755032586354e-06, "loss": 0.2106, "step": 8833 }, { "epoch": 1.7878971868042908, "grad_norm": 0.2639520764350891, "learning_rate": 5.525323833773399e-06, "loss": 0.1553, "step": 8834 }, { "epoch": 1.788099574984821, "grad_norm": 0.2605137526988983, 
"learning_rate": 5.514902192916871e-06, "loss": 0.1998, "step": 8835 }, { "epoch": 1.7883019631653512, "grad_norm": 0.25581395626068115, "learning_rate": 5.504490111071114e-06, "loss": 0.184, "step": 8836 }, { "epoch": 1.7885043513458814, "grad_norm": 0.2903321087360382, "learning_rate": 5.494087589289531e-06, "loss": 0.2201, "step": 8837 }, { "epoch": 1.7887067395264116, "grad_norm": 0.31646454334259033, "learning_rate": 5.483694628624514e-06, "loss": 0.2071, "step": 8838 }, { "epoch": 1.7889091277069418, "grad_norm": 0.28405052423477173, "learning_rate": 5.473311230127531e-06, "loss": 0.1903, "step": 8839 }, { "epoch": 1.7891115158874722, "grad_norm": 0.31384506821632385, "learning_rate": 5.4629373948490545e-06, "loss": 0.1911, "step": 8840 }, { "epoch": 1.7893139040680024, "grad_norm": 0.23068785667419434, "learning_rate": 5.452573123838611e-06, "loss": 0.1725, "step": 8841 }, { "epoch": 1.7895162922485328, "grad_norm": 0.24589887261390686, "learning_rate": 5.442218418144751e-06, "loss": 0.1438, "step": 8842 }, { "epoch": 1.789718680429063, "grad_norm": 0.25478124618530273, "learning_rate": 5.4318732788150366e-06, "loss": 0.1512, "step": 8843 }, { "epoch": 1.7899210686095932, "grad_norm": 0.2645474970340729, "learning_rate": 5.421537706896096e-06, "loss": 0.1695, "step": 8844 }, { "epoch": 1.7901234567901234, "grad_norm": 0.26867932081222534, "learning_rate": 5.411211703433572e-06, "loss": 0.1791, "step": 8845 }, { "epoch": 1.7903258449706536, "grad_norm": 0.2700962722301483, "learning_rate": 5.4008952694721395e-06, "loss": 0.1858, "step": 8846 }, { "epoch": 1.7905282331511838, "grad_norm": 0.2806064188480377, "learning_rate": 5.390588406055497e-06, "loss": 0.2028, "step": 8847 }, { "epoch": 1.7907306213317142, "grad_norm": 0.3218914568424225, "learning_rate": 5.38029111422641e-06, "loss": 0.2434, "step": 8848 }, { "epoch": 1.7909330095122444, "grad_norm": 0.2712363600730896, "learning_rate": 5.370003395026624e-06, "loss": 0.1776, "step": 8849 }, { "epoch": 
1.7911353976927749, "grad_norm": 0.2838458716869354, "learning_rate": 5.359725249496972e-06, "loss": 0.1613, "step": 8850 }, { "epoch": 1.7911353976927749, "eval_loss": 0.2580936849117279, "eval_runtime": 0.737, "eval_samples_per_second": 6.785, "eval_steps_per_second": 1.357, "step": 8850 }, { "epoch": 1.791337785873305, "grad_norm": 0.2904452979564667, "learning_rate": 5.349456678677245e-06, "loss": 0.1794, "step": 8851 }, { "epoch": 1.7915401740538353, "grad_norm": 0.3108134865760803, "learning_rate": 5.339197683606345e-06, "loss": 0.1996, "step": 8852 }, { "epoch": 1.7917425622343655, "grad_norm": 0.24708275496959686, "learning_rate": 5.328948265322154e-06, "loss": 0.1853, "step": 8853 }, { "epoch": 1.7919449504148957, "grad_norm": 0.30340859293937683, "learning_rate": 5.318708424861607e-06, "loss": 0.2239, "step": 8854 }, { "epoch": 1.792147338595426, "grad_norm": 0.26493820548057556, "learning_rate": 5.3084781632606665e-06, "loss": 0.1647, "step": 8855 }, { "epoch": 1.7923497267759563, "grad_norm": 0.29768866300582886, "learning_rate": 5.298257481554314e-06, "loss": 0.1698, "step": 8856 }, { "epoch": 1.7925521149564867, "grad_norm": 0.25119927525520325, "learning_rate": 5.2880463807765786e-06, "loss": 0.1792, "step": 8857 }, { "epoch": 1.792754503137017, "grad_norm": 0.3058163523674011, "learning_rate": 5.277844861960512e-06, "loss": 0.1858, "step": 8858 }, { "epoch": 1.792956891317547, "grad_norm": 0.3083089292049408, "learning_rate": 5.267652926138189e-06, "loss": 0.1938, "step": 8859 }, { "epoch": 1.7931592794980773, "grad_norm": 0.22727486491203308, "learning_rate": 5.257470574340729e-06, "loss": 0.1461, "step": 8860 }, { "epoch": 1.7933616676786075, "grad_norm": 0.2988595962524414, "learning_rate": 5.247297807598273e-06, "loss": 0.1964, "step": 8861 }, { "epoch": 1.7935640558591377, "grad_norm": 0.2653481662273407, "learning_rate": 5.237134626939988e-06, "loss": 0.1779, "step": 8862 }, { "epoch": 1.7937664440396681, "grad_norm": 0.32473576068878174, 
"learning_rate": 5.226981033394096e-06, "loss": 0.2151, "step": 8863 }, { "epoch": 1.7939688322201983, "grad_norm": 0.25815069675445557, "learning_rate": 5.2168370279878195e-06, "loss": 0.1758, "step": 8864 }, { "epoch": 1.7941712204007287, "grad_norm": 0.25564804673194885, "learning_rate": 5.206702611747427e-06, "loss": 0.1649, "step": 8865 }, { "epoch": 1.794373608581259, "grad_norm": 0.26895710825920105, "learning_rate": 5.196577785698198e-06, "loss": 0.1957, "step": 8866 }, { "epoch": 1.7945759967617891, "grad_norm": 0.24266557395458221, "learning_rate": 5.186462550864479e-06, "loss": 0.1339, "step": 8867 }, { "epoch": 1.7947783849423193, "grad_norm": 0.30665162205696106, "learning_rate": 5.176356908269608e-06, "loss": 0.1948, "step": 8868 }, { "epoch": 1.7949807731228495, "grad_norm": 0.257072389125824, "learning_rate": 5.166260858935978e-06, "loss": 0.1676, "step": 8869 }, { "epoch": 1.7951831613033797, "grad_norm": 0.269353449344635, "learning_rate": 5.156174403884984e-06, "loss": 0.1795, "step": 8870 }, { "epoch": 1.7953855494839102, "grad_norm": 0.269663542509079, "learning_rate": 5.146097544137085e-06, "loss": 0.1738, "step": 8871 }, { "epoch": 1.7955879376644404, "grad_norm": 0.2640727162361145, "learning_rate": 5.136030280711757e-06, "loss": 0.198, "step": 8872 }, { "epoch": 1.7957903258449708, "grad_norm": 0.30548980832099915, "learning_rate": 5.125972614627483e-06, "loss": 0.2174, "step": 8873 }, { "epoch": 1.795992714025501, "grad_norm": 0.27031007409095764, "learning_rate": 5.115924546901796e-06, "loss": 0.1814, "step": 8874 }, { "epoch": 1.7961951022060312, "grad_norm": 0.27772194147109985, "learning_rate": 5.1058860785512476e-06, "loss": 0.2004, "step": 8875 }, { "epoch": 1.7963974903865614, "grad_norm": 0.2998545169830322, "learning_rate": 5.095857210591437e-06, "loss": 0.1777, "step": 8876 }, { "epoch": 1.7965998785670916, "grad_norm": 0.29473671317100525, "learning_rate": 5.085837944036976e-06, "loss": 0.1935, "step": 8877 }, { "epoch": 
1.7968022667476218, "grad_norm": 0.30179837346076965, "learning_rate": 5.075828279901507e-06, "loss": 0.2038, "step": 8878 }, { "epoch": 1.7970046549281522, "grad_norm": 0.32339680194854736, "learning_rate": 5.065828219197699e-06, "loss": 0.2456, "step": 8879 }, { "epoch": 1.7972070431086824, "grad_norm": 0.26312246918678284, "learning_rate": 5.055837762937265e-06, "loss": 0.1624, "step": 8880 }, { "epoch": 1.7974094312892128, "grad_norm": 0.29252588748931885, "learning_rate": 5.045856912130931e-06, "loss": 0.1632, "step": 8881 }, { "epoch": 1.797611819469743, "grad_norm": 0.2949320375919342, "learning_rate": 5.035885667788454e-06, "loss": 0.1967, "step": 8882 }, { "epoch": 1.7978142076502732, "grad_norm": 0.2704543471336365, "learning_rate": 5.025924030918616e-06, "loss": 0.2029, "step": 8883 }, { "epoch": 1.7980165958308034, "grad_norm": 0.2713168263435364, "learning_rate": 5.0159720025292344e-06, "loss": 0.18, "step": 8884 }, { "epoch": 1.7982189840113336, "grad_norm": 0.30050283670425415, "learning_rate": 5.006029583627148e-06, "loss": 0.1939, "step": 8885 }, { "epoch": 1.798421372191864, "grad_norm": 0.25041332840919495, "learning_rate": 4.996096775218218e-06, "loss": 0.1809, "step": 8886 }, { "epoch": 1.7986237603723942, "grad_norm": 0.327347069978714, "learning_rate": 4.986173578307362e-06, "loss": 0.2197, "step": 8887 }, { "epoch": 1.7988261485529247, "grad_norm": 0.2811526358127594, "learning_rate": 4.976259993898502e-06, "loss": 0.2059, "step": 8888 }, { "epoch": 1.7990285367334549, "grad_norm": 0.28768783807754517, "learning_rate": 4.966356022994567e-06, "loss": 0.183, "step": 8889 }, { "epoch": 1.799230924913985, "grad_norm": 0.27493974566459656, "learning_rate": 4.956461666597567e-06, "loss": 0.1911, "step": 8890 }, { "epoch": 1.7994333130945153, "grad_norm": 0.2958086431026459, "learning_rate": 4.946576925708491e-06, "loss": 0.2226, "step": 8891 }, { "epoch": 1.7996357012750455, "grad_norm": 0.30736634135246277, "learning_rate": 4.93670180132737e-06, 
"loss": 0.192, "step": 8892 }, { "epoch": 1.7998380894555757, "grad_norm": 0.31193798780441284, "learning_rate": 4.926836294453274e-06, "loss": 0.1944, "step": 8893 }, { "epoch": 1.800040477636106, "grad_norm": 0.29948121309280396, "learning_rate": 4.9169804060843035e-06, "loss": 0.2295, "step": 8894 }, { "epoch": 1.8002428658166363, "grad_norm": 0.28736236691474915, "learning_rate": 4.907134137217562e-06, "loss": 0.2076, "step": 8895 }, { "epoch": 1.8004452539971667, "grad_norm": 0.2893117666244507, "learning_rate": 4.897297488849173e-06, "loss": 0.1743, "step": 8896 }, { "epoch": 1.800647642177697, "grad_norm": 0.3026507496833801, "learning_rate": 4.887470461974331e-06, "loss": 0.2176, "step": 8897 }, { "epoch": 1.800850030358227, "grad_norm": 0.2544638216495514, "learning_rate": 4.877653057587228e-06, "loss": 0.179, "step": 8898 }, { "epoch": 1.8010524185387573, "grad_norm": 0.24779057502746582, "learning_rate": 4.867845276681071e-06, "loss": 0.1852, "step": 8899 }, { "epoch": 1.8012548067192875, "grad_norm": 0.27465617656707764, "learning_rate": 4.858047120248121e-06, "loss": 0.1983, "step": 8900 }, { "epoch": 1.8012548067192875, "eval_loss": 0.25783097743988037, "eval_runtime": 0.7366, "eval_samples_per_second": 6.788, "eval_steps_per_second": 1.358, "step": 8900 }, { "epoch": 1.8014571948998177, "grad_norm": 0.2639307379722595, "learning_rate": 4.848258589279652e-06, "loss": 0.1657, "step": 8901 }, { "epoch": 1.8016595830803481, "grad_norm": 0.26923590898513794, "learning_rate": 4.838479684765962e-06, "loss": 0.1766, "step": 8902 }, { "epoch": 1.8018619712608783, "grad_norm": 0.2985227108001709, "learning_rate": 4.828710407696391e-06, "loss": 0.182, "step": 8903 }, { "epoch": 1.8020643594414087, "grad_norm": 0.31524157524108887, "learning_rate": 4.81895075905926e-06, "loss": 0.2524, "step": 8904 }, { "epoch": 1.802266747621939, "grad_norm": 0.3199750483036041, "learning_rate": 4.809200739841979e-06, "loss": 0.1932, "step": 8905 }, { "epoch": 
1.8024691358024691, "grad_norm": 0.2882361114025116, "learning_rate": 4.799460351030938e-06, "loss": 0.1756, "step": 8906 }, { "epoch": 1.8026715239829993, "grad_norm": 0.31260427832603455, "learning_rate": 4.789729593611569e-06, "loss": 0.2151, "step": 8907 }, { "epoch": 1.8028739121635295, "grad_norm": 0.2947652339935303, "learning_rate": 4.780008468568342e-06, "loss": 0.1879, "step": 8908 }, { "epoch": 1.8030763003440597, "grad_norm": 0.32592445611953735, "learning_rate": 4.770296976884714e-06, "loss": 0.1707, "step": 8909 }, { "epoch": 1.8032786885245902, "grad_norm": 0.28303763270378113, "learning_rate": 4.760595119543209e-06, "loss": 0.18, "step": 8910 }, { "epoch": 1.8034810767051204, "grad_norm": 0.2822856605052948, "learning_rate": 4.750902897525345e-06, "loss": 0.184, "step": 8911 }, { "epoch": 1.8036834648856508, "grad_norm": 0.286300927400589, "learning_rate": 4.741220311811701e-06, "loss": 0.1824, "step": 8912 }, { "epoch": 1.803885853066181, "grad_norm": 0.29102855920791626, "learning_rate": 4.7315473633818385e-06, "loss": 0.181, "step": 8913 }, { "epoch": 1.8040882412467112, "grad_norm": 0.28965407609939575, "learning_rate": 4.721884053214376e-06, "loss": 0.1647, "step": 8914 }, { "epoch": 1.8042906294272414, "grad_norm": 0.28732210397720337, "learning_rate": 4.7122303822869416e-06, "loss": 0.2239, "step": 8915 }, { "epoch": 1.8044930176077716, "grad_norm": 0.2571806311607361, "learning_rate": 4.702586351576199e-06, "loss": 0.1667, "step": 8916 }, { "epoch": 1.804695405788302, "grad_norm": 0.25412270426750183, "learning_rate": 4.692951962057834e-06, "loss": 0.1638, "step": 8917 }, { "epoch": 1.8048977939688322, "grad_norm": 0.2577854096889496, "learning_rate": 4.683327214706534e-06, "loss": 0.1646, "step": 8918 }, { "epoch": 1.8051001821493626, "grad_norm": 0.3552990257740021, "learning_rate": 4.673712110496031e-06, "loss": 0.2013, "step": 8919 }, { "epoch": 1.8053025703298928, "grad_norm": 0.31418198347091675, "learning_rate": 
4.6641066503990916e-06, "loss": 0.2224, "step": 8920 }, { "epoch": 1.805504958510423, "grad_norm": 0.3056572675704956, "learning_rate": 4.6545108353875045e-06, "loss": 0.2114, "step": 8921 }, { "epoch": 1.8057073466909532, "grad_norm": 0.3513403534889221, "learning_rate": 4.644924666432049e-06, "loss": 0.1504, "step": 8922 }, { "epoch": 1.8059097348714834, "grad_norm": 0.34498804807662964, "learning_rate": 4.635348144502571e-06, "loss": 0.1988, "step": 8923 }, { "epoch": 1.8061121230520136, "grad_norm": 0.2675020694732666, "learning_rate": 4.625781270567919e-06, "loss": 0.1669, "step": 8924 }, { "epoch": 1.806314511232544, "grad_norm": 0.24444063007831573, "learning_rate": 4.616224045595974e-06, "loss": 0.1672, "step": 8925 }, { "epoch": 1.8065168994130743, "grad_norm": 0.2665750980377197, "learning_rate": 4.606676470553617e-06, "loss": 0.1883, "step": 8926 }, { "epoch": 1.8067192875936047, "grad_norm": 0.3534557521343231, "learning_rate": 4.597138546406798e-06, "loss": 0.1867, "step": 8927 }, { "epoch": 1.8069216757741349, "grad_norm": 0.3001806437969208, "learning_rate": 4.587610274120435e-06, "loss": 0.1955, "step": 8928 }, { "epoch": 1.807124063954665, "grad_norm": 0.3241526186466217, "learning_rate": 4.578091654658523e-06, "loss": 0.171, "step": 8929 }, { "epoch": 1.8073264521351953, "grad_norm": 0.29405054450035095, "learning_rate": 4.568582688984047e-06, "loss": 0.1745, "step": 8930 }, { "epoch": 1.8075288403157255, "grad_norm": 0.2953342795372009, "learning_rate": 4.559083378059015e-06, "loss": 0.1972, "step": 8931 }, { "epoch": 1.8077312284962557, "grad_norm": 0.27937665581703186, "learning_rate": 4.549593722844492e-06, "loss": 0.1815, "step": 8932 }, { "epoch": 1.807933616676786, "grad_norm": 0.3475225567817688, "learning_rate": 4.54011372430051e-06, "loss": 0.2308, "step": 8933 }, { "epoch": 1.8081360048573163, "grad_norm": 0.33293816447257996, "learning_rate": 4.530643383386179e-06, "loss": 0.2115, "step": 8934 }, { "epoch": 1.8083383930378467, 
"grad_norm": 0.2608540654182434, "learning_rate": 4.5211827010596005e-06, "loss": 0.1838, "step": 8935 }, { "epoch": 1.808540781218377, "grad_norm": 0.29804497957229614, "learning_rate": 4.511731678277919e-06, "loss": 0.1847, "step": 8936 }, { "epoch": 1.8087431693989071, "grad_norm": 0.2685197591781616, "learning_rate": 4.502290315997271e-06, "loss": 0.2106, "step": 8937 }, { "epoch": 1.8089455575794373, "grad_norm": 0.274152010679245, "learning_rate": 4.492858615172824e-06, "loss": 0.1865, "step": 8938 }, { "epoch": 1.8091479457599675, "grad_norm": 0.27463477849960327, "learning_rate": 4.483436576758826e-06, "loss": 0.1886, "step": 8939 }, { "epoch": 1.8093503339404977, "grad_norm": 0.27460166811943054, "learning_rate": 4.47402420170846e-06, "loss": 0.1805, "step": 8940 }, { "epoch": 1.8095527221210281, "grad_norm": 0.26897096633911133, "learning_rate": 4.464621490973986e-06, "loss": 0.1926, "step": 8941 }, { "epoch": 1.8097551103015583, "grad_norm": 0.28815120458602905, "learning_rate": 4.455228445506665e-06, "loss": 0.1845, "step": 8942 }, { "epoch": 1.8099574984820888, "grad_norm": 0.27394571900367737, "learning_rate": 4.445845066256793e-06, "loss": 0.1834, "step": 8943 }, { "epoch": 1.810159886662619, "grad_norm": 0.29721203446388245, "learning_rate": 4.436471354173677e-06, "loss": 0.1913, "step": 8944 }, { "epoch": 1.8103622748431492, "grad_norm": 0.30943432450294495, "learning_rate": 4.427107310205647e-06, "loss": 0.208, "step": 8945 }, { "epoch": 1.8105646630236794, "grad_norm": 0.29275450110435486, "learning_rate": 4.417752935300079e-06, "loss": 0.1942, "step": 8946 }, { "epoch": 1.8107670512042096, "grad_norm": 0.2976188659667969, "learning_rate": 4.408408230403327e-06, "loss": 0.1846, "step": 8947 }, { "epoch": 1.81096943938474, "grad_norm": 0.2598245143890381, "learning_rate": 4.39907319646079e-06, "loss": 0.1852, "step": 8948 }, { "epoch": 1.8111718275652702, "grad_norm": 0.25931134819984436, "learning_rate": 4.389747834416913e-06, "loss": 0.1495, 
"step": 8949 }, { "epoch": 1.8113742157458006, "grad_norm": 0.26877543330192566, "learning_rate": 4.380432145215119e-06, "loss": 0.1652, "step": 8950 }, { "epoch": 1.8113742157458006, "eval_loss": 0.25779491662979126, "eval_runtime": 0.7398, "eval_samples_per_second": 6.759, "eval_steps_per_second": 1.352, "step": 8950 }, { "epoch": 1.8115766039263308, "grad_norm": 0.30282893776893616, "learning_rate": 4.371126129797864e-06, "loss": 0.2027, "step": 8951 }, { "epoch": 1.811778992106861, "grad_norm": 0.3111761510372162, "learning_rate": 4.361829789106653e-06, "loss": 0.1984, "step": 8952 }, { "epoch": 1.8119813802873912, "grad_norm": 0.2611580193042755, "learning_rate": 4.352543124081987e-06, "loss": 0.1644, "step": 8953 }, { "epoch": 1.8121837684679214, "grad_norm": 0.2492230385541916, "learning_rate": 4.343266135663393e-06, "loss": 0.1598, "step": 8954 }, { "epoch": 1.8123861566484516, "grad_norm": 0.3022874891757965, "learning_rate": 4.33399882478941e-06, "loss": 0.1791, "step": 8955 }, { "epoch": 1.812588544828982, "grad_norm": 0.2649358808994293, "learning_rate": 4.324741192397619e-06, "loss": 0.1878, "step": 8956 }, { "epoch": 1.8127909330095122, "grad_norm": 0.23399528861045837, "learning_rate": 4.315493239424606e-06, "loss": 0.1813, "step": 8957 }, { "epoch": 1.8129933211900426, "grad_norm": 0.2612743079662323, "learning_rate": 4.306254966805967e-06, "loss": 0.2021, "step": 8958 }, { "epoch": 1.8131957093705728, "grad_norm": 0.35977280139923096, "learning_rate": 4.297026375476365e-06, "loss": 0.2231, "step": 8959 }, { "epoch": 1.813398097551103, "grad_norm": 0.28002238273620605, "learning_rate": 4.28780746636942e-06, "loss": 0.1699, "step": 8960 }, { "epoch": 1.8136004857316332, "grad_norm": 0.2709221839904785, "learning_rate": 4.278598240417842e-06, "loss": 0.1805, "step": 8961 }, { "epoch": 1.8138028739121634, "grad_norm": 0.2538873553276062, "learning_rate": 4.269398698553284e-06, "loss": 0.1682, "step": 8962 }, { "epoch": 1.8140052620926936, "grad_norm": 
0.26883426308631897, "learning_rate": 4.260208841706481e-06, "loss": 0.1853, "step": 8963 }, { "epoch": 1.814207650273224, "grad_norm": 0.24153414368629456, "learning_rate": 4.251028670807156e-06, "loss": 0.1588, "step": 8964 }, { "epoch": 1.8144100384537543, "grad_norm": 0.2933860719203949, "learning_rate": 4.241858186784064e-06, "loss": 0.2111, "step": 8965 }, { "epoch": 1.8146124266342847, "grad_norm": 0.27167898416519165, "learning_rate": 4.232697390564988e-06, "loss": 0.2109, "step": 8966 }, { "epoch": 1.8148148148148149, "grad_norm": 0.2836179733276367, "learning_rate": 4.223546283076718e-06, "loss": 0.1851, "step": 8967 }, { "epoch": 1.815017202995345, "grad_norm": 0.25831085443496704, "learning_rate": 4.2144048652450585e-06, "loss": 0.183, "step": 8968 }, { "epoch": 1.8152195911758753, "grad_norm": 0.2679770886898041, "learning_rate": 4.2052731379948475e-06, "loss": 0.1804, "step": 8969 }, { "epoch": 1.8154219793564055, "grad_norm": 0.2566496431827545, "learning_rate": 4.1961511022499345e-06, "loss": 0.1776, "step": 8970 }, { "epoch": 1.8156243675369357, "grad_norm": 0.28995993733406067, "learning_rate": 4.187038758933204e-06, "loss": 0.2231, "step": 8971 }, { "epoch": 1.815826755717466, "grad_norm": 0.2585195302963257, "learning_rate": 4.177936108966529e-06, "loss": 0.1795, "step": 8972 }, { "epoch": 1.8160291438979965, "grad_norm": 0.28282400965690613, "learning_rate": 4.1688431532708404e-06, "loss": 0.1899, "step": 8973 }, { "epoch": 1.8162315320785267, "grad_norm": 0.2926957905292511, "learning_rate": 4.159759892766047e-06, "loss": 0.1828, "step": 8974 }, { "epoch": 1.816433920259057, "grad_norm": 0.3157510459423065, "learning_rate": 4.150686328371112e-06, "loss": 0.1972, "step": 8975 }, { "epoch": 1.8166363084395871, "grad_norm": 0.2653707265853882, "learning_rate": 4.141622461003991e-06, "loss": 0.1639, "step": 8976 }, { "epoch": 1.8168386966201173, "grad_norm": 0.28877976536750793, "learning_rate": 4.132568291581684e-06, "loss": 0.1908, "step": 8977 
}, { "epoch": 1.8170410848006475, "grad_norm": 0.30027449131011963, "learning_rate": 4.123523821020192e-06, "loss": 0.1952, "step": 8978 }, { "epoch": 1.817243472981178, "grad_norm": 0.3052767515182495, "learning_rate": 4.1144890502345375e-06, "loss": 0.2201, "step": 8979 }, { "epoch": 1.8174458611617081, "grad_norm": 0.314216285943985, "learning_rate": 4.105463980138769e-06, "loss": 0.1787, "step": 8980 }, { "epoch": 1.8176482493422386, "grad_norm": 0.30276334285736084, "learning_rate": 4.096448611645942e-06, "loss": 0.1971, "step": 8981 }, { "epoch": 1.8178506375227688, "grad_norm": 0.24939769506454468, "learning_rate": 4.087442945668152e-06, "loss": 0.1796, "step": 8982 }, { "epoch": 1.818053025703299, "grad_norm": 0.28983643651008606, "learning_rate": 4.078446983116468e-06, "loss": 0.1915, "step": 8983 }, { "epoch": 1.8182554138838292, "grad_norm": 0.25924208760261536, "learning_rate": 4.0694607249010304e-06, "loss": 0.1679, "step": 8984 }, { "epoch": 1.8184578020643594, "grad_norm": 0.24220941960811615, "learning_rate": 4.060484171930978e-06, "loss": 0.155, "step": 8985 }, { "epoch": 1.8186601902448896, "grad_norm": 0.3046566843986511, "learning_rate": 4.051517325114451e-06, "loss": 0.2165, "step": 8986 }, { "epoch": 1.81886257842542, "grad_norm": 0.26422369480133057, "learning_rate": 4.0425601853586125e-06, "loss": 0.1877, "step": 8987 }, { "epoch": 1.8190649666059502, "grad_norm": 0.3298000991344452, "learning_rate": 4.033612753569682e-06, "loss": 0.2056, "step": 8988 }, { "epoch": 1.8192673547864806, "grad_norm": 0.3131459951400757, "learning_rate": 4.0246750306528354e-06, "loss": 0.2212, "step": 8989 }, { "epoch": 1.8194697429670108, "grad_norm": 0.2667044401168823, "learning_rate": 4.015747017512317e-06, "loss": 0.1741, "step": 8990 }, { "epoch": 1.819672131147541, "grad_norm": 0.3156639039516449, "learning_rate": 4.0068287150513696e-06, "loss": 0.2316, "step": 8991 }, { "epoch": 1.8198745193280712, "grad_norm": 0.2932990491390228, "learning_rate": 
3.997920124172238e-06, "loss": 0.1958, "step": 8992 }, { "epoch": 1.8200769075086014, "grad_norm": 0.2587626278400421, "learning_rate": 3.989021245776214e-06, "loss": 0.1702, "step": 8993 }, { "epoch": 1.8202792956891316, "grad_norm": 0.29920312762260437, "learning_rate": 3.980132080763588e-06, "loss": 0.2167, "step": 8994 }, { "epoch": 1.820481683869662, "grad_norm": 0.3583579957485199, "learning_rate": 3.971252630033684e-06, "loss": 0.2383, "step": 8995 }, { "epoch": 1.8206840720501922, "grad_norm": 0.336424320936203, "learning_rate": 3.9623828944848065e-06, "loss": 0.2016, "step": 8996 }, { "epoch": 1.8208864602307226, "grad_norm": 0.24210865795612335, "learning_rate": 3.953522875014326e-06, "loss": 0.1853, "step": 8997 }, { "epoch": 1.8210888484112528, "grad_norm": 0.3300243616104126, "learning_rate": 3.944672572518582e-06, "loss": 0.1933, "step": 8998 }, { "epoch": 1.821291236591783, "grad_norm": 0.2903205156326294, "learning_rate": 3.935831987892979e-06, "loss": 0.1797, "step": 8999 }, { "epoch": 1.8214936247723132, "grad_norm": 0.3007589876651764, "learning_rate": 3.927001122031915e-06, "loss": 0.2058, "step": 9000 }, { "epoch": 1.8214936247723132, "eval_loss": 0.25774043798446655, "eval_runtime": 0.7398, "eval_samples_per_second": 6.759, "eval_steps_per_second": 1.352, "step": 9000 }, { "epoch": 1.8216960129528434, "grad_norm": 0.30178171396255493, "learning_rate": 3.918179975828784e-06, "loss": 0.1917, "step": 9001 }, { "epoch": 1.8218984011333736, "grad_norm": 0.29727664589881897, "learning_rate": 3.909368550176029e-06, "loss": 0.183, "step": 9002 }, { "epoch": 1.822100789313904, "grad_norm": 0.26325443387031555, "learning_rate": 3.900566845965104e-06, "loss": 0.1653, "step": 9003 }, { "epoch": 1.8223031774944345, "grad_norm": 0.25132912397384644, "learning_rate": 3.891774864086451e-06, "loss": 0.1707, "step": 9004 }, { "epoch": 1.8225055656749647, "grad_norm": 0.27996230125427246, "learning_rate": 3.8829926054295805e-06, "loss": 0.2224, "step": 9005 }, { 
"epoch": 1.822707953855495, "grad_norm": 0.25456923246383667, "learning_rate": 3.874220070882972e-06, "loss": 0.1832, "step": 9006 }, { "epoch": 1.822910342036025, "grad_norm": 0.3332825005054474, "learning_rate": 3.865457261334138e-06, "loss": 0.1774, "step": 9007 }, { "epoch": 1.8231127302165553, "grad_norm": 0.28720322251319885, "learning_rate": 3.856704177669612e-06, "loss": 0.1957, "step": 9008 }, { "epoch": 1.8233151183970855, "grad_norm": 0.37506118416786194, "learning_rate": 3.847960820774932e-06, "loss": 0.1914, "step": 9009 }, { "epoch": 1.823517506577616, "grad_norm": 0.25758567452430725, "learning_rate": 3.839227191534666e-06, "loss": 0.1718, "step": 9010 }, { "epoch": 1.823719894758146, "grad_norm": 0.28181517124176025, "learning_rate": 3.830503290832388e-06, "loss": 0.1774, "step": 9011 }, { "epoch": 1.8239222829386765, "grad_norm": 0.3112618327140808, "learning_rate": 3.82178911955069e-06, "loss": 0.2347, "step": 9012 }, { "epoch": 1.8241246711192067, "grad_norm": 0.3159734904766083, "learning_rate": 3.8130846785711773e-06, "loss": 0.2079, "step": 9013 }, { "epoch": 1.824327059299737, "grad_norm": 0.287130206823349, "learning_rate": 3.804389968774491e-06, "loss": 0.2008, "step": 9014 }, { "epoch": 1.8245294474802671, "grad_norm": 0.27288955450057983, "learning_rate": 3.7957049910402497e-06, "loss": 0.1735, "step": 9015 }, { "epoch": 1.8247318356607973, "grad_norm": 0.2727169990539551, "learning_rate": 3.7870297462471282e-06, "loss": 0.2139, "step": 9016 }, { "epoch": 1.8249342238413275, "grad_norm": 0.29491594433784485, "learning_rate": 3.77836423527278e-06, "loss": 0.1806, "step": 9017 }, { "epoch": 1.825136612021858, "grad_norm": 0.3176809847354889, "learning_rate": 3.7697084589938924e-06, "loss": 0.2092, "step": 9018 }, { "epoch": 1.8253390002023882, "grad_norm": 0.33551716804504395, "learning_rate": 3.7610624182861655e-06, "loss": 0.186, "step": 9019 }, { "epoch": 1.8255413883829186, "grad_norm": 0.29789623618125916, "learning_rate": 
3.7524261140243322e-06, "loss": 0.1917, "step": 9020 }, { "epoch": 1.8257437765634488, "grad_norm": 0.29452013969421387, "learning_rate": 3.7437995470821052e-06, "loss": 0.1992, "step": 9021 }, { "epoch": 1.825946164743979, "grad_norm": 0.27145907282829285, "learning_rate": 3.735182718332231e-06, "loss": 0.1697, "step": 9022 }, { "epoch": 1.8261485529245092, "grad_norm": 0.2831342816352844, "learning_rate": 3.726575628646478e-06, "loss": 0.201, "step": 9023 }, { "epoch": 1.8263509411050394, "grad_norm": 0.2772304117679596, "learning_rate": 3.7179782788956175e-06, "loss": 0.1909, "step": 9024 }, { "epoch": 1.8265533292855696, "grad_norm": 0.26404857635498047, "learning_rate": 3.7093906699494417e-06, "loss": 0.1797, "step": 9025 }, { "epoch": 1.8267557174661, "grad_norm": 0.2647900879383087, "learning_rate": 3.7008128026767453e-06, "loss": 0.1886, "step": 9026 }, { "epoch": 1.8269581056466302, "grad_norm": 0.3391728103160858, "learning_rate": 3.692244677945356e-06, "loss": 0.1792, "step": 9027 }, { "epoch": 1.8271604938271606, "grad_norm": 0.29189401865005493, "learning_rate": 3.6836862966221243e-06, "loss": 0.1718, "step": 9028 }, { "epoch": 1.8273628820076908, "grad_norm": 0.31552425026893616, "learning_rate": 3.6751376595728582e-06, "loss": 0.2288, "step": 9029 }, { "epoch": 1.827565270188221, "grad_norm": 0.3222467601299286, "learning_rate": 3.6665987676624323e-06, "loss": 0.2479, "step": 9030 }, { "epoch": 1.8277676583687512, "grad_norm": 0.3233198821544647, "learning_rate": 3.658069621754734e-06, "loss": 0.214, "step": 9031 }, { "epoch": 1.8279700465492814, "grad_norm": 0.2774644196033478, "learning_rate": 3.6495502227126387e-06, "loss": 0.1885, "step": 9032 }, { "epoch": 1.8281724347298116, "grad_norm": 0.28501439094543457, "learning_rate": 3.641040571398069e-06, "loss": 0.2028, "step": 9033 }, { "epoch": 1.828374822910342, "grad_norm": 0.2721464931964874, "learning_rate": 3.6325406686719352e-06, "loss": 0.199, "step": 9034 }, { "epoch": 1.8285772110908725, 
"grad_norm": 0.28608623147010803, "learning_rate": 3.6240505153941506e-06, "loss": 0.188, "step": 9035 }, { "epoch": 1.8287795992714027, "grad_norm": 0.2906154692173004, "learning_rate": 3.615570112423683e-06, "loss": 0.1663, "step": 9036 }, { "epoch": 1.8289819874519329, "grad_norm": 0.2678082287311554, "learning_rate": 3.607099460618479e-06, "loss": 0.1792, "step": 9037 }, { "epoch": 1.829184375632463, "grad_norm": 0.28244009613990784, "learning_rate": 3.59863856083551e-06, "loss": 0.207, "step": 9038 }, { "epoch": 1.8293867638129933, "grad_norm": 0.30563104152679443, "learning_rate": 3.590187413930768e-06, "loss": 0.2205, "step": 9039 }, { "epoch": 1.8295891519935235, "grad_norm": 0.2915782928466797, "learning_rate": 3.581746020759247e-06, "loss": 0.208, "step": 9040 }, { "epoch": 1.8297915401740539, "grad_norm": 0.3015349507331848, "learning_rate": 3.5733143821749636e-06, "loss": 0.1832, "step": 9041 }, { "epoch": 1.829993928354584, "grad_norm": 0.3021952509880066, "learning_rate": 3.564892499030925e-06, "loss": 0.1899, "step": 9042 }, { "epoch": 1.8301963165351145, "grad_norm": 0.31338658928871155, "learning_rate": 3.5564803721791827e-06, "loss": 0.1869, "step": 9043 }, { "epoch": 1.8303987047156447, "grad_norm": 0.3176608085632324, "learning_rate": 3.5480780024707894e-06, "loss": 0.2072, "step": 9044 }, { "epoch": 1.830601092896175, "grad_norm": 0.28486374020576477, "learning_rate": 3.539685390755809e-06, "loss": 0.1892, "step": 9045 }, { "epoch": 1.830803481076705, "grad_norm": 0.26669222116470337, "learning_rate": 3.5313025378833077e-06, "loss": 0.1965, "step": 9046 }, { "epoch": 1.8310058692572353, "grad_norm": 0.28664642572402954, "learning_rate": 3.5229294447013838e-06, "loss": 0.1861, "step": 9047 }, { "epoch": 1.8312082574377655, "grad_norm": 0.2668553292751312, "learning_rate": 3.5145661120571384e-06, "loss": 0.1649, "step": 9048 }, { "epoch": 1.831410645618296, "grad_norm": 0.3332071304321289, "learning_rate": 3.506212540796683e-06, "loss": 0.2077, 
"step": 9049 }, { "epoch": 1.8316130337988261, "grad_norm": 0.2512947618961334, "learning_rate": 3.4978687317651526e-06, "loss": 0.1923, "step": 9050 }, { "epoch": 1.8316130337988261, "eval_loss": 0.2575944662094116, "eval_runtime": 0.7391, "eval_samples_per_second": 6.765, "eval_steps_per_second": 1.353, "step": 9050 }, { "epoch": 1.8318154219793565, "grad_norm": 0.2591105103492737, "learning_rate": 3.4895346858066724e-06, "loss": 0.1553, "step": 9051 }, { "epoch": 1.8320178101598867, "grad_norm": 0.26791900396347046, "learning_rate": 3.481210403764401e-06, "loss": 0.1818, "step": 9052 }, { "epoch": 1.832220198340417, "grad_norm": 0.32195326685905457, "learning_rate": 3.4728958864804984e-06, "loss": 0.1723, "step": 9053 }, { "epoch": 1.8324225865209471, "grad_norm": 0.2889711260795593, "learning_rate": 3.4645911347961357e-06, "loss": 0.1628, "step": 9054 }, { "epoch": 1.8326249747014773, "grad_norm": 0.3247166574001312, "learning_rate": 3.456296149551519e-06, "loss": 0.2015, "step": 9055 }, { "epoch": 1.8328273628820075, "grad_norm": 0.2901359498500824, "learning_rate": 3.4480109315858324e-06, "loss": 0.1748, "step": 9056 }, { "epoch": 1.833029751062538, "grad_norm": 0.2965649664402008, "learning_rate": 3.439735481737283e-06, "loss": 0.22, "step": 9057 }, { "epoch": 1.8332321392430682, "grad_norm": 0.303994357585907, "learning_rate": 3.4314698008431123e-06, "loss": 0.2208, "step": 9058 }, { "epoch": 1.8334345274235986, "grad_norm": 0.2678375840187073, "learning_rate": 3.4232138897395406e-06, "loss": 0.1935, "step": 9059 }, { "epoch": 1.8336369156041288, "grad_norm": 0.3268527388572693, "learning_rate": 3.4149677492618214e-06, "loss": 0.2103, "step": 9060 }, { "epoch": 1.833839303784659, "grad_norm": 0.2670746445655823, "learning_rate": 3.406731380244199e-06, "loss": 0.1804, "step": 9061 }, { "epoch": 1.8340416919651892, "grad_norm": 0.27483952045440674, "learning_rate": 3.3985047835199624e-06, "loss": 0.1703, "step": 9062 }, { "epoch": 1.8342440801457194, 
"grad_norm": 0.3195999562740326, "learning_rate": 3.3902879599213897e-06, "loss": 0.2326, "step": 9063 }, { "epoch": 1.8344464683262498, "grad_norm": 0.2567335069179535, "learning_rate": 3.382080910279761e-06, "loss": 0.1817, "step": 9064 }, { "epoch": 1.83464885650678, "grad_norm": 0.2286807745695114, "learning_rate": 3.37388363542539e-06, "loss": 0.1377, "step": 9065 }, { "epoch": 1.8348512446873104, "grad_norm": 0.2931678593158722, "learning_rate": 3.3656961361875795e-06, "loss": 0.2191, "step": 9066 }, { "epoch": 1.8350536328678406, "grad_norm": 0.25825655460357666, "learning_rate": 3.3575184133946668e-06, "loss": 0.178, "step": 9067 }, { "epoch": 1.8352560210483708, "grad_norm": 0.2457646280527115, "learning_rate": 3.3493504678739797e-06, "loss": 0.1439, "step": 9068 }, { "epoch": 1.835458409228901, "grad_norm": 0.30730950832366943, "learning_rate": 3.3411923004518674e-06, "loss": 0.2097, "step": 9069 }, { "epoch": 1.8356607974094312, "grad_norm": 0.31206050515174866, "learning_rate": 3.333043911953693e-06, "loss": 0.1768, "step": 9070 }, { "epoch": 1.8358631855899614, "grad_norm": 0.3024093210697174, "learning_rate": 3.324905303203818e-06, "loss": 0.2128, "step": 9071 }, { "epoch": 1.8360655737704918, "grad_norm": 0.2619602084159851, "learning_rate": 3.316776475025629e-06, "loss": 0.1696, "step": 9072 }, { "epoch": 1.836267961951022, "grad_norm": 0.2904742658138275, "learning_rate": 3.3086574282415127e-06, "loss": 0.1849, "step": 9073 }, { "epoch": 1.8364703501315525, "grad_norm": 0.29145240783691406, "learning_rate": 3.3005481636728676e-06, "loss": 0.1838, "step": 9074 }, { "epoch": 1.8366727383120827, "grad_norm": 0.29646483063697815, "learning_rate": 3.2924486821400923e-06, "loss": 0.2048, "step": 9075 }, { "epoch": 1.8368751264926129, "grad_norm": 0.28350889682769775, "learning_rate": 3.284358984462621e-06, "loss": 0.1887, "step": 9076 }, { "epoch": 1.837077514673143, "grad_norm": 0.3916099965572357, "learning_rate": 3.2762790714588876e-06, "loss": 
0.1887, "step": 9077 }, { "epoch": 1.8372799028536733, "grad_norm": 0.28508061170578003, "learning_rate": 3.268208943946327e-06, "loss": 0.2175, "step": 9078 }, { "epoch": 1.8374822910342035, "grad_norm": 0.26994675397872925, "learning_rate": 3.260148602741386e-06, "loss": 0.1734, "step": 9079 }, { "epoch": 1.8376846792147339, "grad_norm": 0.2726169526576996, "learning_rate": 3.2520980486595353e-06, "loss": 0.1934, "step": 9080 }, { "epoch": 1.837887067395264, "grad_norm": 0.25494077801704407, "learning_rate": 3.244057282515234e-06, "loss": 0.182, "step": 9081 }, { "epoch": 1.8380894555757945, "grad_norm": 0.21710167825222015, "learning_rate": 3.2360263051219643e-06, "loss": 0.1331, "step": 9082 }, { "epoch": 1.8382918437563247, "grad_norm": 0.2714509963989258, "learning_rate": 3.228005117292232e-06, "loss": 0.1859, "step": 9083 }, { "epoch": 1.838494231936855, "grad_norm": 0.286760151386261, "learning_rate": 3.219993719837511e-06, "loss": 0.167, "step": 9084 }, { "epoch": 1.838696620117385, "grad_norm": 0.2535327076911926, "learning_rate": 3.2119921135683405e-06, "loss": 0.1753, "step": 9085 }, { "epoch": 1.8388990082979153, "grad_norm": 0.24537889659404755, "learning_rate": 3.2040002992942077e-06, "loss": 0.1687, "step": 9086 }, { "epoch": 1.8391013964784455, "grad_norm": 0.29615381360054016, "learning_rate": 3.1960182778236647e-06, "loss": 0.2077, "step": 9087 }, { "epoch": 1.839303784658976, "grad_norm": 0.3481968641281128, "learning_rate": 3.188046049964233e-06, "loss": 0.2454, "step": 9088 }, { "epoch": 1.8395061728395061, "grad_norm": 0.32502785325050354, "learning_rate": 3.180083616522478e-06, "loss": 0.1733, "step": 9089 }, { "epoch": 1.8397085610200365, "grad_norm": 0.22581607103347778, "learning_rate": 3.1721309783039332e-06, "loss": 0.1675, "step": 9090 }, { "epoch": 1.8399109492005667, "grad_norm": 0.2769971191883087, "learning_rate": 3.1641881361131664e-06, "loss": 0.1906, "step": 9091 }, { "epoch": 1.840113337381097, "grad_norm": 0.30919790267944336, 
"learning_rate": 3.156255090753757e-06, "loss": 0.2084, "step": 9092 }, { "epoch": 1.8403157255616271, "grad_norm": 0.2568013668060303, "learning_rate": 3.148331843028296e-06, "loss": 0.1692, "step": 9093 }, { "epoch": 1.8405181137421573, "grad_norm": 0.2942313551902771, "learning_rate": 3.1404183937383647e-06, "loss": 0.1977, "step": 9094 }, { "epoch": 1.8407205019226878, "grad_norm": 0.2699166536331177, "learning_rate": 3.1325147436845783e-06, "loss": 0.1425, "step": 9095 }, { "epoch": 1.840922890103218, "grad_norm": 0.30012571811676025, "learning_rate": 3.1246208936665188e-06, "loss": 0.2031, "step": 9096 }, { "epoch": 1.8411252782837484, "grad_norm": 0.3417207896709442, "learning_rate": 3.1167368444828147e-06, "loss": 0.1909, "step": 9097 }, { "epoch": 1.8413276664642786, "grad_norm": 0.29664674401283264, "learning_rate": 3.108862596931095e-06, "loss": 0.1856, "step": 9098 }, { "epoch": 1.8415300546448088, "grad_norm": 0.37633007764816284, "learning_rate": 3.100998151807988e-06, "loss": 0.1895, "step": 9099 }, { "epoch": 1.841732442825339, "grad_norm": 0.2773500978946686, "learning_rate": 3.0931435099091466e-06, "loss": 0.1801, "step": 9100 }, { "epoch": 1.841732442825339, "eval_loss": 0.2573607563972473, "eval_runtime": 0.7401, "eval_samples_per_second": 6.756, "eval_steps_per_second": 1.351, "step": 9100 }, { "epoch": 1.8419348310058692, "grad_norm": 0.28869545459747314, "learning_rate": 3.085298672029202e-06, "loss": 0.2102, "step": 9101 }, { "epoch": 1.8421372191863994, "grad_norm": 0.28606685996055603, "learning_rate": 3.0774636389618192e-06, "loss": 0.2018, "step": 9102 }, { "epoch": 1.8423396073669298, "grad_norm": 0.3356720507144928, "learning_rate": 3.0696384114996757e-06, "loss": 0.2135, "step": 9103 }, { "epoch": 1.84254199554746, "grad_norm": 0.3090665936470032, "learning_rate": 3.0618229904344485e-06, "loss": 0.1787, "step": 9104 }, { "epoch": 1.8427443837279904, "grad_norm": 0.2992224097251892, "learning_rate": 3.054017376556795e-06, "loss": 
0.2351, "step": 9105 }, { "epoch": 1.8429467719085206, "grad_norm": 0.268134742975235, "learning_rate": 3.0462215706564267e-06, "loss": 0.1458, "step": 9106 }, { "epoch": 1.8431491600890508, "grad_norm": 0.31659549474716187, "learning_rate": 3.038435573522036e-06, "loss": 0.175, "step": 9107 }, { "epoch": 1.843351548269581, "grad_norm": 0.282866507768631, "learning_rate": 3.0306593859413255e-06, "loss": 0.1694, "step": 9108 }, { "epoch": 1.8435539364501112, "grad_norm": 0.27301478385925293, "learning_rate": 3.022893008701011e-06, "loss": 0.1973, "step": 9109 }, { "epoch": 1.8437563246306414, "grad_norm": 0.23165518045425415, "learning_rate": 3.015136442586819e-06, "loss": 0.1689, "step": 9110 }, { "epoch": 1.8439587128111719, "grad_norm": 0.31681936979293823, "learning_rate": 3.0073896883834663e-06, "loss": 0.2378, "step": 9111 }, { "epoch": 1.844161100991702, "grad_norm": 0.3050534129142761, "learning_rate": 2.9996527468746925e-06, "loss": 0.1851, "step": 9112 }, { "epoch": 1.8443634891722325, "grad_norm": 0.33265596628189087, "learning_rate": 2.9919256188432387e-06, "loss": 0.2079, "step": 9113 }, { "epoch": 1.8445658773527627, "grad_norm": 0.29722699522972107, "learning_rate": 2.984208305070857e-06, "loss": 0.2271, "step": 9114 }, { "epoch": 1.8447682655332929, "grad_norm": 0.2777019441127777, "learning_rate": 2.9765008063383117e-06, "loss": 0.1677, "step": 9115 }, { "epoch": 1.844970653713823, "grad_norm": 0.2719971239566803, "learning_rate": 2.9688031234253565e-06, "loss": 0.187, "step": 9116 }, { "epoch": 1.8451730418943533, "grad_norm": 0.279680073261261, "learning_rate": 2.961115257110769e-06, "loss": 0.1779, "step": 9117 }, { "epoch": 1.8453754300748835, "grad_norm": 0.2744757831096649, "learning_rate": 2.953437208172316e-06, "loss": 0.1692, "step": 9118 }, { "epoch": 1.845577818255414, "grad_norm": 0.2708338499069214, "learning_rate": 2.945768977386787e-06, "loss": 0.1935, "step": 9119 }, { "epoch": 1.845780206435944, "grad_norm": 0.2639225125312805, 
"learning_rate": 2.9381105655299724e-06, "loss": 0.1543, "step": 9120 }, { "epoch": 1.8459825946164745, "grad_norm": 0.24041591584682465, "learning_rate": 2.930461973376675e-06, "loss": 0.1803, "step": 9121 }, { "epoch": 1.8461849827970047, "grad_norm": 0.23384606838226318, "learning_rate": 2.9228232017006864e-06, "loss": 0.1645, "step": 9122 }, { "epoch": 1.846387370977535, "grad_norm": 0.34278208017349243, "learning_rate": 2.915194251274844e-06, "loss": 0.1817, "step": 9123 }, { "epoch": 1.8465897591580651, "grad_norm": 0.3258562684059143, "learning_rate": 2.9075751228709312e-06, "loss": 0.2113, "step": 9124 }, { "epoch": 1.8467921473385953, "grad_norm": 0.3236311078071594, "learning_rate": 2.8999658172597975e-06, "loss": 0.1951, "step": 9125 }, { "epoch": 1.8469945355191257, "grad_norm": 0.2938937246799469, "learning_rate": 2.8923663352112606e-06, "loss": 0.1901, "step": 9126 }, { "epoch": 1.847196923699656, "grad_norm": 0.3299955725669861, "learning_rate": 2.884776677494161e-06, "loss": 0.1755, "step": 9127 }, { "epoch": 1.8473993118801864, "grad_norm": 0.26704564690589905, "learning_rate": 2.8771968448763396e-06, "loss": 0.197, "step": 9128 }, { "epoch": 1.8476017000607166, "grad_norm": 0.297905832529068, "learning_rate": 2.8696268381246394e-06, "loss": 0.1863, "step": 9129 }, { "epoch": 1.8478040882412468, "grad_norm": 0.38043296337127686, "learning_rate": 2.8620666580049247e-06, "loss": 0.216, "step": 9130 }, { "epoch": 1.848006476421777, "grad_norm": 0.2848310172557831, "learning_rate": 2.85451630528204e-06, "loss": 0.1913, "step": 9131 }, { "epoch": 1.8482088646023072, "grad_norm": 0.3236400783061981, "learning_rate": 2.8469757807198736e-06, "loss": 0.2242, "step": 9132 }, { "epoch": 1.8484112527828374, "grad_norm": 0.3123624920845032, "learning_rate": 2.8394450850812714e-06, "loss": 0.2231, "step": 9133 }, { "epoch": 1.8486136409633678, "grad_norm": 0.3278132975101471, "learning_rate": 2.8319242191281237e-06, "loss": 0.2337, "step": 9134 }, { "epoch": 
1.848816029143898, "grad_norm": 0.2576705515384674, "learning_rate": 2.8244131836213106e-06, "loss": 0.1761, "step": 9135 }, { "epoch": 1.8490184173244284, "grad_norm": 0.292070209980011, "learning_rate": 2.8169119793207134e-06, "loss": 0.1743, "step": 9136 }, { "epoch": 1.8492208055049586, "grad_norm": 0.31883808970451355, "learning_rate": 2.809420606985236e-06, "loss": 0.1821, "step": 9137 }, { "epoch": 1.8494231936854888, "grad_norm": 0.28848689794540405, "learning_rate": 2.801939067372783e-06, "loss": 0.1736, "step": 9138 }, { "epoch": 1.849625581866019, "grad_norm": 0.24862170219421387, "learning_rate": 2.794467361240238e-06, "loss": 0.1928, "step": 9139 }, { "epoch": 1.8498279700465492, "grad_norm": 0.3228076100349426, "learning_rate": 2.7870054893435194e-06, "loss": 0.173, "step": 9140 }, { "epoch": 1.8500303582270794, "grad_norm": 0.26641377806663513, "learning_rate": 2.7795534524375333e-06, "loss": 0.1851, "step": 9141 }, { "epoch": 1.8502327464076098, "grad_norm": 0.2720263600349426, "learning_rate": 2.7721112512762216e-06, "loss": 0.1717, "step": 9142 }, { "epoch": 1.85043513458814, "grad_norm": 0.29709115624427795, "learning_rate": 2.7646788866124817e-06, "loss": 0.1788, "step": 9143 }, { "epoch": 1.8506375227686704, "grad_norm": 0.27155938744544983, "learning_rate": 2.757256359198257e-06, "loss": 0.1862, "step": 9144 }, { "epoch": 1.8508399109492006, "grad_norm": 0.27506211400032043, "learning_rate": 2.74984366978448e-06, "loss": 0.2015, "step": 9145 }, { "epoch": 1.8510422991297308, "grad_norm": 0.30489999055862427, "learning_rate": 2.742440819121084e-06, "loss": 0.2147, "step": 9146 }, { "epoch": 1.851244687310261, "grad_norm": 0.3416946530342102, "learning_rate": 2.735047807957014e-06, "loss": 0.199, "step": 9147 }, { "epoch": 1.8514470754907912, "grad_norm": 0.3143892288208008, "learning_rate": 2.7276646370402172e-06, "loss": 0.1983, "step": 9148 }, { "epoch": 1.8516494636713214, "grad_norm": 0.27518364787101746, "learning_rate": 
2.7202913071176507e-06, "loss": 0.1964, "step": 9149 }, { "epoch": 1.8518518518518519, "grad_norm": 0.312863290309906, "learning_rate": 2.7129278189352512e-06, "loss": 0.179, "step": 9150 }, { "epoch": 1.8518518518518519, "eval_loss": 0.2572021782398224, "eval_runtime": 0.7386, "eval_samples_per_second": 6.77, "eval_steps_per_second": 1.354, "step": 9150 }, { "epoch": 1.852054240032382, "grad_norm": 0.2847476005554199, "learning_rate": 2.7055741732380012e-06, "loss": 0.1939, "step": 9151 }, { "epoch": 1.8522566282129125, "grad_norm": 0.3035951852798462, "learning_rate": 2.6982303707698607e-06, "loss": 0.2055, "step": 9152 }, { "epoch": 1.8524590163934427, "grad_norm": 0.24721133708953857, "learning_rate": 2.690896412273791e-06, "loss": 0.1724, "step": 9153 }, { "epoch": 1.8526614045739729, "grad_norm": 0.30687880516052246, "learning_rate": 2.6835722984917764e-06, "loss": 0.2138, "step": 9154 }, { "epoch": 1.852863792754503, "grad_norm": 0.26503756642341614, "learning_rate": 2.67625803016478e-06, "loss": 0.1726, "step": 9155 }, { "epoch": 1.8530661809350333, "grad_norm": 0.2724968492984772, "learning_rate": 2.668953608032798e-06, "loss": 0.1842, "step": 9156 }, { "epoch": 1.8532685691155637, "grad_norm": 0.3005676865577698, "learning_rate": 2.6616590328347958e-06, "loss": 0.1891, "step": 9157 }, { "epoch": 1.853470957296094, "grad_norm": 0.2763330638408661, "learning_rate": 2.6543743053087823e-06, "loss": 0.1864, "step": 9158 }, { "epoch": 1.8536733454766243, "grad_norm": 0.40457960963249207, "learning_rate": 2.6470994261917347e-06, "loss": 0.1893, "step": 9159 }, { "epoch": 1.8538757336571545, "grad_norm": 0.33288365602493286, "learning_rate": 2.639834396219654e-06, "loss": 0.2259, "step": 9160 }, { "epoch": 1.8540781218376847, "grad_norm": 0.28434813022613525, "learning_rate": 2.63257921612754e-06, "loss": 0.1996, "step": 9161 }, { "epoch": 1.854280510018215, "grad_norm": 0.3212646245956421, "learning_rate": 2.625333886649417e-06, "loss": 0.178, "step": 9162 }, { 
"epoch": 1.8544828981987451, "grad_norm": 0.2832084894180298, "learning_rate": 2.6180984085182547e-06, "loss": 0.1968, "step": 9163 }, { "epoch": 1.8546852863792753, "grad_norm": 0.26873666048049927, "learning_rate": 2.610872782466067e-06, "loss": 0.175, "step": 9164 }, { "epoch": 1.8548876745598057, "grad_norm": 0.28573164343833923, "learning_rate": 2.603657009223892e-06, "loss": 0.172, "step": 9165 }, { "epoch": 1.855090062740336, "grad_norm": 0.29353541135787964, "learning_rate": 2.596451089521734e-06, "loss": 0.1608, "step": 9166 }, { "epoch": 1.8552924509208664, "grad_norm": 0.2839539051055908, "learning_rate": 2.5892550240885995e-06, "loss": 0.1626, "step": 9167 }, { "epoch": 1.8554948391013966, "grad_norm": 0.41530534625053406, "learning_rate": 2.582068813652527e-06, "loss": 0.1724, "step": 9168 }, { "epoch": 1.8556972272819268, "grad_norm": 0.2706339359283447, "learning_rate": 2.5748924589405476e-06, "loss": 0.19, "step": 9169 }, { "epoch": 1.855899615462457, "grad_norm": 0.29047369956970215, "learning_rate": 2.5677259606786684e-06, "loss": 0.1829, "step": 9170 }, { "epoch": 1.8561020036429872, "grad_norm": 0.24929776787757874, "learning_rate": 2.5605693195919323e-06, "loss": 0.1668, "step": 9171 }, { "epoch": 1.8563043918235174, "grad_norm": 0.26415640115737915, "learning_rate": 2.5534225364043706e-06, "loss": 0.1611, "step": 9172 }, { "epoch": 1.8565067800040478, "grad_norm": 0.28007614612579346, "learning_rate": 2.5462856118390277e-06, "loss": 0.1789, "step": 9173 }, { "epoch": 1.856709168184578, "grad_norm": 0.2696763873100281, "learning_rate": 2.5391585466179257e-06, "loss": 0.2041, "step": 9174 }, { "epoch": 1.8569115563651084, "grad_norm": 0.25559699535369873, "learning_rate": 2.5320413414621323e-06, "loss": 0.17, "step": 9175 }, { "epoch": 1.8571139445456386, "grad_norm": 0.28708815574645996, "learning_rate": 2.524933997091661e-06, "loss": 0.1763, "step": 9176 }, { "epoch": 1.8573163327261688, "grad_norm": 0.25946274399757385, "learning_rate": 
2.51783651422558e-06, "loss": 0.1492, "step": 9177 }, { "epoch": 1.857518720906699, "grad_norm": 0.2698550224304199, "learning_rate": 2.510748893581938e-06, "loss": 0.1746, "step": 9178 }, { "epoch": 1.8577211090872292, "grad_norm": 0.27411776781082153, "learning_rate": 2.503671135877772e-06, "loss": 0.1906, "step": 9179 }, { "epoch": 1.8579234972677594, "grad_norm": 0.2979721426963806, "learning_rate": 2.496603241829143e-06, "loss": 0.2126, "step": 9180 }, { "epoch": 1.8581258854482898, "grad_norm": 0.24646952748298645, "learning_rate": 2.489545212151112e-06, "loss": 0.1719, "step": 9181 }, { "epoch": 1.85832827362882, "grad_norm": 0.25245559215545654, "learning_rate": 2.4824970475577305e-06, "loss": 0.1815, "step": 9182 }, { "epoch": 1.8585306618093504, "grad_norm": 0.23951876163482666, "learning_rate": 2.4754587487620494e-06, "loss": 0.1746, "step": 9183 }, { "epoch": 1.8587330499898806, "grad_norm": 0.33775365352630615, "learning_rate": 2.4684303164761556e-06, "loss": 0.2248, "step": 9184 }, { "epoch": 1.8589354381704108, "grad_norm": 0.34358781576156616, "learning_rate": 2.46141175141108e-06, "loss": 0.1786, "step": 9185 }, { "epoch": 1.859137826350941, "grad_norm": 0.3016970753669739, "learning_rate": 2.4544030542768993e-06, "loss": 0.2214, "step": 9186 }, { "epoch": 1.8593402145314712, "grad_norm": 0.2781483829021454, "learning_rate": 2.4474042257826902e-06, "loss": 0.1786, "step": 9187 }, { "epoch": 1.8595426027120017, "grad_norm": 0.2879065275192261, "learning_rate": 2.44041526663652e-06, "loss": 0.1992, "step": 9188 }, { "epoch": 1.8597449908925319, "grad_norm": 0.28307580947875977, "learning_rate": 2.433436177545445e-06, "loss": 0.1868, "step": 9189 }, { "epoch": 1.8599473790730623, "grad_norm": 0.29029884934425354, "learning_rate": 2.4264669592155454e-06, "loss": 0.2209, "step": 9190 }, { "epoch": 1.8601497672535925, "grad_norm": 0.2731107771396637, "learning_rate": 2.41950761235189e-06, "loss": 0.1885, "step": 9191 }, { "epoch": 1.8603521554341227, 
"grad_norm": 0.29840999841690063, "learning_rate": 2.412558137658549e-06, "loss": 0.1925, "step": 9192 }, { "epoch": 1.860554543614653, "grad_norm": 0.2856263220310211, "learning_rate": 2.4056185358386163e-06, "loss": 0.1725, "step": 9193 }, { "epoch": 1.860756931795183, "grad_norm": 0.264995276927948, "learning_rate": 2.3986888075941404e-06, "loss": 0.1689, "step": 9194 }, { "epoch": 1.8609593199757133, "grad_norm": 0.3229961395263672, "learning_rate": 2.3917689536262166e-06, "loss": 0.2027, "step": 9195 }, { "epoch": 1.8611617081562437, "grad_norm": 0.2580181658267975, "learning_rate": 2.384858974634918e-06, "loss": 0.1565, "step": 9196 }, { "epoch": 1.861364096336774, "grad_norm": 0.3221249282360077, "learning_rate": 2.37795887131933e-06, "loss": 0.2124, "step": 9197 }, { "epoch": 1.8615664845173043, "grad_norm": 0.22325314581394196, "learning_rate": 2.3710686443775165e-06, "loss": 0.1501, "step": 9198 }, { "epoch": 1.8617688726978345, "grad_norm": 0.292501300573349, "learning_rate": 2.364188294506575e-06, "loss": 0.2013, "step": 9199 }, { "epoch": 1.8619712608783647, "grad_norm": 0.35212448239326477, "learning_rate": 2.357317822402583e-06, "loss": 0.2097, "step": 9200 }, { "epoch": 1.8619712608783647, "eval_loss": 0.25725609064102173, "eval_runtime": 0.7405, "eval_samples_per_second": 6.752, "eval_steps_per_second": 1.35, "step": 9200 }, { "epoch": 1.862173649058895, "grad_norm": 0.2880713641643524, "learning_rate": 2.350457228760616e-06, "loss": 0.1991, "step": 9201 }, { "epoch": 1.8623760372394251, "grad_norm": 0.2760334014892578, "learning_rate": 2.3436065142747652e-06, "loss": 0.1861, "step": 9202 }, { "epoch": 1.8625784254199553, "grad_norm": 0.2789352834224701, "learning_rate": 2.336765679638109e-06, "loss": 0.1808, "step": 9203 }, { "epoch": 1.8627808136004858, "grad_norm": 0.27005985379219055, "learning_rate": 2.329934725542737e-06, "loss": 0.2044, "step": 9204 }, { "epoch": 1.862983201781016, "grad_norm": 0.2802645266056061, "learning_rate": 
2.3231136526797204e-06, "loss": 0.1483, "step": 9205 }, { "epoch": 1.8631855899615464, "grad_norm": 0.2909417450428009, "learning_rate": 2.3163024617391727e-06, "loss": 0.1796, "step": 9206 }, { "epoch": 1.8633879781420766, "grad_norm": 0.33910495042800903, "learning_rate": 2.309501153410143e-06, "loss": 0.183, "step": 9207 }, { "epoch": 1.8635903663226068, "grad_norm": 0.2519986629486084, "learning_rate": 2.3027097283807253e-06, "loss": 0.1665, "step": 9208 }, { "epoch": 1.863792754503137, "grad_norm": 0.2841980755329132, "learning_rate": 2.2959281873380146e-06, "loss": 0.2132, "step": 9209 }, { "epoch": 1.8639951426836672, "grad_norm": 0.29311954975128174, "learning_rate": 2.2891565309680952e-06, "loss": 0.2043, "step": 9210 }, { "epoch": 1.8641975308641974, "grad_norm": 0.3197910189628601, "learning_rate": 2.2823947599560525e-06, "loss": 0.1826, "step": 9211 }, { "epoch": 1.8643999190447278, "grad_norm": 0.35433757305145264, "learning_rate": 2.2756428749859728e-06, "loss": 0.1838, "step": 9212 }, { "epoch": 1.864602307225258, "grad_norm": 0.26283395290374756, "learning_rate": 2.2689008767409313e-06, "loss": 0.1839, "step": 9213 }, { "epoch": 1.8648046954057884, "grad_norm": 0.30381646752357483, "learning_rate": 2.2621687659030165e-06, "loss": 0.2045, "step": 9214 }, { "epoch": 1.8650070835863186, "grad_norm": 0.27693313360214233, "learning_rate": 2.2554465431533168e-06, "loss": 0.2015, "step": 9215 }, { "epoch": 1.8652094717668488, "grad_norm": 0.450054407119751, "learning_rate": 2.24873420917191e-06, "loss": 0.1661, "step": 9216 }, { "epoch": 1.865411859947379, "grad_norm": 0.2783445417881012, "learning_rate": 2.2420317646378864e-06, "loss": 0.1715, "step": 9217 }, { "epoch": 1.8656142481279092, "grad_norm": 0.27848151326179504, "learning_rate": 2.235339210229326e-06, "loss": 0.1761, "step": 9218 }, { "epoch": 1.8658166363084396, "grad_norm": 0.29417315125465393, "learning_rate": 2.2286565466233087e-06, "loss": 0.2073, "step": 9219 }, { "epoch": 
1.8660190244889698, "grad_norm": 0.3033013939857483, "learning_rate": 2.2219837744959283e-06, "loss": 0.2171, "step": 9220 }, { "epoch": 1.8662214126695003, "grad_norm": 0.27484026551246643, "learning_rate": 2.2153208945222436e-06, "loss": 0.1637, "step": 9221 }, { "epoch": 1.8664238008500305, "grad_norm": 0.3170156478881836, "learning_rate": 2.2086679073763607e-06, "loss": 0.2094, "step": 9222 }, { "epoch": 1.8666261890305607, "grad_norm": 0.2751418650150299, "learning_rate": 2.2020248137313405e-06, "loss": 0.21, "step": 9223 }, { "epoch": 1.8668285772110909, "grad_norm": 0.28255924582481384, "learning_rate": 2.195391614259257e-06, "loss": 0.2093, "step": 9224 }, { "epoch": 1.867030965391621, "grad_norm": 0.25517529249191284, "learning_rate": 2.1887683096312062e-06, "loss": 0.1716, "step": 9225 }, { "epoch": 1.8672333535721513, "grad_norm": 0.2793150842189789, "learning_rate": 2.182154900517264e-06, "loss": 0.1935, "step": 9226 }, { "epoch": 1.8674357417526817, "grad_norm": 0.30712810158729553, "learning_rate": 2.175551387586494e-06, "loss": 0.1864, "step": 9227 }, { "epoch": 1.8676381299332119, "grad_norm": 0.3432822823524475, "learning_rate": 2.1689577715069743e-06, "loss": 0.2281, "step": 9228 }, { "epoch": 1.8678405181137423, "grad_norm": 0.24391323328018188, "learning_rate": 2.1623740529457815e-06, "loss": 0.1517, "step": 9229 }, { "epoch": 1.8680429062942725, "grad_norm": 0.24822361767292023, "learning_rate": 2.155800232568972e-06, "loss": 0.1536, "step": 9230 }, { "epoch": 1.8682452944748027, "grad_norm": 0.28816094994544983, "learning_rate": 2.1492363110416357e-06, "loss": 0.1978, "step": 9231 }, { "epoch": 1.868447682655333, "grad_norm": 0.29633474349975586, "learning_rate": 2.142682289027831e-06, "loss": 0.1907, "step": 9232 }, { "epoch": 1.868650070835863, "grad_norm": 0.4840158522129059, "learning_rate": 2.1361381671906267e-06, "loss": 0.1782, "step": 9233 }, { "epoch": 1.8688524590163933, "grad_norm": 0.2775403559207916, "learning_rate": 
2.1296039461920825e-06, "loss": 0.1873, "step": 9234 }, { "epoch": 1.8690548471969237, "grad_norm": 0.2747441530227661, "learning_rate": 2.1230796266932694e-06, "loss": 0.1781, "step": 9235 }, { "epoch": 1.869257235377454, "grad_norm": 0.2542593777179718, "learning_rate": 2.1165652093542598e-06, "loss": 0.1633, "step": 9236 }, { "epoch": 1.8694596235579843, "grad_norm": 0.2847427725791931, "learning_rate": 2.1100606948340927e-06, "loss": 0.1835, "step": 9237 }, { "epoch": 1.8696620117385145, "grad_norm": 0.2818686366081238, "learning_rate": 2.103566083790842e-06, "loss": 0.1953, "step": 9238 }, { "epoch": 1.8698643999190447, "grad_norm": 0.23137100040912628, "learning_rate": 2.09708137688156e-06, "loss": 0.1449, "step": 9239 }, { "epoch": 1.870066788099575, "grad_norm": 0.2877149283885956, "learning_rate": 2.0906065747622994e-06, "loss": 0.2078, "step": 9240 }, { "epoch": 1.8702691762801051, "grad_norm": 0.301807165145874, "learning_rate": 2.084141678088114e-06, "loss": 0.1845, "step": 9241 }, { "epoch": 1.8704715644606353, "grad_norm": 0.28998205065727234, "learning_rate": 2.0776866875130586e-06, "loss": 0.184, "step": 9242 }, { "epoch": 1.8706739526411658, "grad_norm": 0.24933961033821106, "learning_rate": 2.0712416036901663e-06, "loss": 0.1612, "step": 9243 }, { "epoch": 1.870876340821696, "grad_norm": 0.3098524212837219, "learning_rate": 2.064806427271493e-06, "loss": 0.1954, "step": 9244 }, { "epoch": 1.8710787290022264, "grad_norm": 0.2629394829273224, "learning_rate": 2.0583811589080847e-06, "loss": 0.1805, "step": 9245 }, { "epoch": 1.8712811171827566, "grad_norm": 0.27097082138061523, "learning_rate": 2.0519657992499884e-06, "loss": 0.16, "step": 9246 }, { "epoch": 1.8714835053632868, "grad_norm": 0.28503546118736267, "learning_rate": 2.0455603489462405e-06, "loss": 0.188, "step": 9247 }, { "epoch": 1.871685893543817, "grad_norm": 0.30967095494270325, "learning_rate": 2.0391648086448556e-06, "loss": 0.2317, "step": 9248 }, { "epoch": 1.8718882817243472, 
"grad_norm": 0.30626732110977173, "learning_rate": 2.032779178992894e-06, "loss": 0.2115, "step": 9249 }, { "epoch": 1.8720906699048776, "grad_norm": 0.2859762907028198, "learning_rate": 2.0264034606363835e-06, "loss": 0.1828, "step": 9250 }, { "epoch": 1.8720906699048776, "eval_loss": 0.2571257948875427, "eval_runtime": 0.7387, "eval_samples_per_second": 6.768, "eval_steps_per_second": 1.354, "step": 9250 }, { "epoch": 1.8722930580854078, "grad_norm": 0.2790232300758362, "learning_rate": 2.0200376542203414e-06, "loss": 0.1982, "step": 9251 }, { "epoch": 1.8724954462659382, "grad_norm": 0.28410059213638306, "learning_rate": 2.013681760388797e-06, "loss": 0.1754, "step": 9252 }, { "epoch": 1.8726978344464684, "grad_norm": 0.2875385284423828, "learning_rate": 2.0073357797847694e-06, "loss": 0.1871, "step": 9253 }, { "epoch": 1.8729002226269986, "grad_norm": 0.28860172629356384, "learning_rate": 2.000999713050289e-06, "loss": 0.1987, "step": 9254 }, { "epoch": 1.8731026108075288, "grad_norm": 0.23642873764038086, "learning_rate": 1.9946735608263543e-06, "loss": 0.1455, "step": 9255 }, { "epoch": 1.873304998988059, "grad_norm": 0.3174664378166199, "learning_rate": 1.9883573237529985e-06, "loss": 0.1946, "step": 9256 }, { "epoch": 1.8735073871685892, "grad_norm": 0.29666104912757874, "learning_rate": 1.9820510024692206e-06, "loss": 0.1958, "step": 9257 }, { "epoch": 1.8737097753491196, "grad_norm": 0.32560938596725464, "learning_rate": 1.975754597613033e-06, "loss": 0.2262, "step": 9258 }, { "epoch": 1.8739121635296498, "grad_norm": 0.25239941477775574, "learning_rate": 1.9694681098214375e-06, "loss": 0.1736, "step": 9259 }, { "epoch": 1.8741145517101803, "grad_norm": 0.31177282333374023, "learning_rate": 1.9631915397304355e-06, "loss": 0.1845, "step": 9260 }, { "epoch": 1.8743169398907105, "grad_norm": 0.24597904086112976, "learning_rate": 1.956924887975031e-06, "loss": 0.1554, "step": 9261 }, { "epoch": 1.8745193280712407, "grad_norm": 0.2527812123298645, 
"learning_rate": 1.950668155189206e-06, "loss": 0.2053, "step": 9262 }, { "epoch": 1.8747217162517709, "grad_norm": 0.30786600708961487, "learning_rate": 1.944421342005964e-06, "loss": 0.1854, "step": 9263 }, { "epoch": 1.874924104432301, "grad_norm": 0.2886776626110077, "learning_rate": 1.9381844490572786e-06, "loss": 0.2074, "step": 9264 }, { "epoch": 1.8751264926128313, "grad_norm": 0.26758891344070435, "learning_rate": 1.9319574769741334e-06, "loss": 0.218, "step": 9265 }, { "epoch": 1.8753288807933617, "grad_norm": 0.26394400000572205, "learning_rate": 1.9257404263865244e-06, "loss": 0.167, "step": 9266 }, { "epoch": 1.8755312689738919, "grad_norm": 0.2897722125053406, "learning_rate": 1.919533297923415e-06, "loss": 0.1638, "step": 9267 }, { "epoch": 1.8757336571544223, "grad_norm": 0.266205370426178, "learning_rate": 1.9133360922127806e-06, "loss": 0.1758, "step": 9268 }, { "epoch": 1.8759360453349525, "grad_norm": 0.2297278642654419, "learning_rate": 1.907148809881587e-06, "loss": 0.1547, "step": 9269 }, { "epoch": 1.8761384335154827, "grad_norm": 0.24826155602931976, "learning_rate": 1.9009714515557997e-06, "loss": 0.1586, "step": 9270 }, { "epoch": 1.876340821696013, "grad_norm": 0.36072930693626404, "learning_rate": 1.8948040178603855e-06, "loss": 0.2092, "step": 9271 }, { "epoch": 1.876543209876543, "grad_norm": 0.3244743049144745, "learning_rate": 1.8886465094192896e-06, "loss": 0.2082, "step": 9272 }, { "epoch": 1.8767455980570733, "grad_norm": 0.27832773327827454, "learning_rate": 1.8824989268554805e-06, "loss": 0.1951, "step": 9273 }, { "epoch": 1.8769479862376037, "grad_norm": 0.2717227041721344, "learning_rate": 1.8763612707908828e-06, "loss": 0.1733, "step": 9274 }, { "epoch": 1.877150374418134, "grad_norm": 0.3183591365814209, "learning_rate": 1.8702335418464556e-06, "loss": 0.1655, "step": 9275 }, { "epoch": 1.8773527625986643, "grad_norm": 0.2324318289756775, "learning_rate": 1.864115740642125e-06, "loss": 0.1964, "step": 9276 }, { "epoch": 
1.8775551507791945, "grad_norm": 0.3259795308113098, "learning_rate": 1.8580078677968516e-06, "loss": 0.2207, "step": 9277 }, { "epoch": 1.8777575389597247, "grad_norm": 0.253817081451416, "learning_rate": 1.851909923928541e-06, "loss": 0.1631, "step": 9278 }, { "epoch": 1.877959927140255, "grad_norm": 0.28769010305404663, "learning_rate": 1.8458219096541218e-06, "loss": 0.1983, "step": 9279 }, { "epoch": 1.8781623153207851, "grad_norm": 0.2674994468688965, "learning_rate": 1.839743825589535e-06, "loss": 0.2164, "step": 9280 }, { "epoch": 1.8783647035013156, "grad_norm": 0.2848077416419983, "learning_rate": 1.8336756723496774e-06, "loss": 0.1822, "step": 9281 }, { "epoch": 1.8785670916818458, "grad_norm": 0.3198871910572052, "learning_rate": 1.8276174505484577e-06, "loss": 0.1854, "step": 9282 }, { "epoch": 1.8787694798623762, "grad_norm": 0.30895861983299255, "learning_rate": 1.8215691607988084e-06, "loss": 0.1675, "step": 9283 }, { "epoch": 1.8789718680429064, "grad_norm": 0.3190915882587433, "learning_rate": 1.8155308037126061e-06, "loss": 0.1914, "step": 9284 }, { "epoch": 1.8791742562234366, "grad_norm": 0.28208211064338684, "learning_rate": 1.8095023799007739e-06, "loss": 0.1668, "step": 9285 }, { "epoch": 1.8793766444039668, "grad_norm": 0.26353633403778076, "learning_rate": 1.8034838899731787e-06, "loss": 0.167, "step": 9286 }, { "epoch": 1.879579032584497, "grad_norm": 0.24951602518558502, "learning_rate": 1.797475334538723e-06, "loss": 0.1764, "step": 9287 }, { "epoch": 1.8797814207650272, "grad_norm": 0.3103049397468567, "learning_rate": 1.7914767142052758e-06, "loss": 0.209, "step": 9288 }, { "epoch": 1.8799838089455576, "grad_norm": 0.25802189111709595, "learning_rate": 1.7854880295797405e-06, "loss": 0.1699, "step": 9289 }, { "epoch": 1.8801861971260878, "grad_norm": 0.25156068801879883, "learning_rate": 1.7795092812679548e-06, "loss": 0.2003, "step": 9290 }, { "epoch": 1.8803885853066182, "grad_norm": 0.2825656235218048, "learning_rate": 
1.7735404698748237e-06, "loss": 0.1887, "step": 9291 }, { "epoch": 1.8805909734871484, "grad_norm": 0.2530002295970917, "learning_rate": 1.7675815960041752e-06, "loss": 0.1759, "step": 9292 }, { "epoch": 1.8807933616676786, "grad_norm": 0.24998198449611664, "learning_rate": 1.7616326602588828e-06, "loss": 0.175, "step": 9293 }, { "epoch": 1.8809957498482088, "grad_norm": 0.3051765263080597, "learning_rate": 1.7556936632407983e-06, "loss": 0.1972, "step": 9294 }, { "epoch": 1.881198138028739, "grad_norm": 0.25916385650634766, "learning_rate": 1.7497646055507633e-06, "loss": 0.1511, "step": 9295 }, { "epoch": 1.8814005262092692, "grad_norm": 0.3032218813896179, "learning_rate": 1.743845487788609e-06, "loss": 0.1982, "step": 9296 }, { "epoch": 1.8816029143897997, "grad_norm": 0.3098996579647064, "learning_rate": 1.73793631055319e-06, "loss": 0.2386, "step": 9297 }, { "epoch": 1.8818053025703299, "grad_norm": 0.3222119212150574, "learning_rate": 1.7320370744423165e-06, "loss": 0.1864, "step": 9298 }, { "epoch": 1.8820076907508603, "grad_norm": 0.31922316551208496, "learning_rate": 1.7261477800528114e-06, "loss": 0.204, "step": 9299 }, { "epoch": 1.8822100789313905, "grad_norm": 0.29419052600860596, "learning_rate": 1.7202684279805092e-06, "loss": 0.1628, "step": 9300 }, { "epoch": 1.8822100789313905, "eval_loss": 0.25728121399879456, "eval_runtime": 0.7402, "eval_samples_per_second": 6.755, "eval_steps_per_second": 1.351, "step": 9300 }, { "epoch": 1.8824124671119207, "grad_norm": 0.2934373915195465, "learning_rate": 1.7143990188202007e-06, "loss": 0.2056, "step": 9301 }, { "epoch": 1.8826148552924509, "grad_norm": 0.29484328627586365, "learning_rate": 1.708539553165711e-06, "loss": 0.1865, "step": 9302 }, { "epoch": 1.882817243472981, "grad_norm": 0.27377191185951233, "learning_rate": 1.7026900316098215e-06, "loss": 0.1692, "step": 9303 }, { "epoch": 1.8830196316535113, "grad_norm": 0.2880317270755768, "learning_rate": 1.6968504547443364e-06, "loss": 0.2183, "step": 
9304 }, { "epoch": 1.8832220198340417, "grad_norm": 0.2645496428012848, "learning_rate": 1.6910208231600389e-06, "loss": 0.1895, "step": 9305 }, { "epoch": 1.883424408014572, "grad_norm": 0.2487277090549469, "learning_rate": 1.6852011374467014e-06, "loss": 0.1905, "step": 9306 }, { "epoch": 1.8836267961951023, "grad_norm": 0.2735969126224518, "learning_rate": 1.6793913981931198e-06, "loss": 0.1663, "step": 9307 }, { "epoch": 1.8838291843756325, "grad_norm": 0.2630087733268738, "learning_rate": 1.6735916059870461e-06, "loss": 0.1747, "step": 9308 }, { "epoch": 1.8840315725561627, "grad_norm": 0.2995377779006958, "learning_rate": 1.6678017614152442e-06, "loss": 0.181, "step": 9309 }, { "epoch": 1.884233960736693, "grad_norm": 0.29509326815605164, "learning_rate": 1.6620218650634677e-06, "loss": 0.175, "step": 9310 }, { "epoch": 1.8844363489172231, "grad_norm": 0.25882208347320557, "learning_rate": 1.6562519175164827e-06, "loss": 0.1562, "step": 9311 }, { "epoch": 1.8846387370977535, "grad_norm": 0.2785455286502838, "learning_rate": 1.6504919193580105e-06, "loss": 0.176, "step": 9312 }, { "epoch": 1.8848411252782837, "grad_norm": 0.31672048568725586, "learning_rate": 1.6447418711707962e-06, "loss": 0.2095, "step": 9313 }, { "epoch": 1.8850435134588142, "grad_norm": 0.2423751801252365, "learning_rate": 1.6390017735365637e-06, "loss": 0.1446, "step": 9314 }, { "epoch": 1.8852459016393444, "grad_norm": 0.2900162935256958, "learning_rate": 1.6332716270360482e-06, "loss": 0.2023, "step": 9315 }, { "epoch": 1.8854482898198746, "grad_norm": 0.25335296988487244, "learning_rate": 1.6275514322489638e-06, "loss": 0.167, "step": 9316 }, { "epoch": 1.8856506780004048, "grad_norm": 0.26420995593070984, "learning_rate": 1.6218411897540252e-06, "loss": 0.2064, "step": 9317 }, { "epoch": 1.885853066180935, "grad_norm": 0.27242782711982727, "learning_rate": 1.616140900128904e-06, "loss": 0.1954, "step": 9318 }, { "epoch": 1.8860554543614652, "grad_norm": 0.24767664074897766, 
"learning_rate": 1.6104505639503276e-06, "loss": 0.1601, "step": 9319 }, { "epoch": 1.8862578425419956, "grad_norm": 0.38079896569252014, "learning_rate": 1.6047701817939687e-06, "loss": 0.2118, "step": 9320 }, { "epoch": 1.8864602307225258, "grad_norm": 0.24454255402088165, "learning_rate": 1.5990997542345121e-06, "loss": 0.1811, "step": 9321 }, { "epoch": 1.8866626189030562, "grad_norm": 0.2145971655845642, "learning_rate": 1.5934392818456323e-06, "loss": 0.1308, "step": 9322 }, { "epoch": 1.8868650070835864, "grad_norm": 0.3013664484024048, "learning_rate": 1.5877887652000045e-06, "loss": 0.1892, "step": 9323 }, { "epoch": 1.8870673952641166, "grad_norm": 0.25066500902175903, "learning_rate": 1.5821482048692716e-06, "loss": 0.1711, "step": 9324 }, { "epoch": 1.8872697834446468, "grad_norm": 0.31969258189201355, "learning_rate": 1.576517601424099e-06, "loss": 0.1989, "step": 9325 }, { "epoch": 1.887472171625177, "grad_norm": 0.2970082759857178, "learning_rate": 1.5708969554341424e-06, "loss": 0.1757, "step": 9326 }, { "epoch": 1.8876745598057072, "grad_norm": 0.25590500235557556, "learning_rate": 1.5652862674680136e-06, "loss": 0.1633, "step": 9327 }, { "epoch": 1.8878769479862376, "grad_norm": 0.27178964018821716, "learning_rate": 1.5596855380933584e-06, "loss": 0.1844, "step": 9328 }, { "epoch": 1.8880793361667678, "grad_norm": 0.27887216210365295, "learning_rate": 1.5540947678768013e-06, "loss": 0.1773, "step": 9329 }, { "epoch": 1.8882817243472982, "grad_norm": 0.2817661464214325, "learning_rate": 1.5485139573839569e-06, "loss": 0.1944, "step": 9330 }, { "epoch": 1.8884841125278284, "grad_norm": 0.30042463541030884, "learning_rate": 1.5429431071794175e-06, "loss": 0.1771, "step": 9331 }, { "epoch": 1.8886865007083586, "grad_norm": 0.30515265464782715, "learning_rate": 1.5373822178268105e-06, "loss": 0.1774, "step": 9332 }, { "epoch": 1.8888888888888888, "grad_norm": 0.2691422402858734, "learning_rate": 1.5318312898887078e-06, "loss": 0.1824, "step": 9333 }, { 
"epoch": 1.889091277069419, "grad_norm": 0.28377440571784973, "learning_rate": 1.5262903239267045e-06, "loss": 0.1677, "step": 9334 }, { "epoch": 1.8892936652499492, "grad_norm": 0.2840723693370819, "learning_rate": 1.5207593205013748e-06, "loss": 0.186, "step": 9335 }, { "epoch": 1.8894960534304797, "grad_norm": 0.30614086985588074, "learning_rate": 1.515238280172282e-06, "loss": 0.2031, "step": 9336 }, { "epoch": 1.88969844161101, "grad_norm": 0.27492690086364746, "learning_rate": 1.5097272034979904e-06, "loss": 0.1852, "step": 9337 }, { "epoch": 1.8899008297915403, "grad_norm": 0.34836092591285706, "learning_rate": 1.504226091036054e-06, "loss": 0.1754, "step": 9338 }, { "epoch": 1.8901032179720705, "grad_norm": 0.27250251173973083, "learning_rate": 1.4987349433430165e-06, "loss": 0.1595, "step": 9339 }, { "epoch": 1.8903056061526007, "grad_norm": 0.276806503534317, "learning_rate": 1.4932537609744112e-06, "loss": 0.1792, "step": 9340 }, { "epoch": 1.8905079943331309, "grad_norm": 0.26508834958076477, "learning_rate": 1.4877825444847838e-06, "loss": 0.1708, "step": 9341 }, { "epoch": 1.890710382513661, "grad_norm": 0.28936338424682617, "learning_rate": 1.4823212944276243e-06, "loss": 0.191, "step": 9342 }, { "epoch": 1.8909127706941915, "grad_norm": 0.31687653064727783, "learning_rate": 1.47687001135548e-06, "loss": 0.1835, "step": 9343 }, { "epoch": 1.8911151588747217, "grad_norm": 0.2686740458011627, "learning_rate": 1.471428695819821e-06, "loss": 0.1719, "step": 9344 }, { "epoch": 1.8913175470552521, "grad_norm": 0.28184860944747925, "learning_rate": 1.4659973483711732e-06, "loss": 0.1813, "step": 9345 }, { "epoch": 1.8915199352357823, "grad_norm": 0.2862120568752289, "learning_rate": 1.460575969558997e-06, "loss": 0.1912, "step": 9346 }, { "epoch": 1.8917223234163125, "grad_norm": 0.2824765145778656, "learning_rate": 1.4551645599317876e-06, "loss": 0.1711, "step": 9347 }, { "epoch": 1.8919247115968427, "grad_norm": 0.2883681356906891, "learning_rate": 
1.4497631200370066e-06, "loss": 0.1805, "step": 9348 }, { "epoch": 1.892127099777373, "grad_norm": 0.26486730575561523, "learning_rate": 1.4443716504211168e-06, "loss": 0.1736, "step": 9349 }, { "epoch": 1.8923294879579031, "grad_norm": 0.20992836356163025, "learning_rate": 1.4389901516295713e-06, "loss": 0.0989, "step": 9350 }, { "epoch": 1.8923294879579031, "eval_loss": 0.25722378492355347, "eval_runtime": 0.7396, "eval_samples_per_second": 6.76, "eval_steps_per_second": 1.352, "step": 9350 }, { "epoch": 1.8925318761384335, "grad_norm": 0.27398180961608887, "learning_rate": 1.4336186242068117e-06, "loss": 0.1426, "step": 9351 }, { "epoch": 1.8927342643189637, "grad_norm": 0.2692059874534607, "learning_rate": 1.4282570686962705e-06, "loss": 0.2077, "step": 9352 }, { "epoch": 1.8929366524994942, "grad_norm": 0.3125195801258087, "learning_rate": 1.422905485640391e-06, "loss": 0.1772, "step": 9353 }, { "epoch": 1.8931390406800244, "grad_norm": 0.3143872916698456, "learning_rate": 1.417563875580563e-06, "loss": 0.1803, "step": 9354 }, { "epoch": 1.8933414288605546, "grad_norm": 0.28542307019233704, "learning_rate": 1.4122322390572096e-06, "loss": 0.1928, "step": 9355 }, { "epoch": 1.8935438170410848, "grad_norm": 0.30504897236824036, "learning_rate": 1.4069105766097323e-06, "loss": 0.2005, "step": 9356 }, { "epoch": 1.893746205221615, "grad_norm": 0.2561946511268616, "learning_rate": 1.401598888776523e-06, "loss": 0.1427, "step": 9357 }, { "epoch": 1.8939485934021452, "grad_norm": 0.29000937938690186, "learning_rate": 1.3962971760949518e-06, "loss": 0.2019, "step": 9358 }, { "epoch": 1.8941509815826756, "grad_norm": 0.29577577114105225, "learning_rate": 1.3910054391014005e-06, "loss": 0.1778, "step": 9359 }, { "epoch": 1.8943533697632058, "grad_norm": 0.26925763487815857, "learning_rate": 1.385723678331219e-06, "loss": 0.1734, "step": 9360 }, { "epoch": 1.8945557579437362, "grad_norm": 0.29018691182136536, "learning_rate": 1.3804518943187683e-06, "loss": 0.2129, 
"step": 9361 }, { "epoch": 1.8947581461242664, "grad_norm": 0.28104546666145325, "learning_rate": 1.3751900875974e-06, "loss": 0.1839, "step": 9362 }, { "epoch": 1.8949605343047966, "grad_norm": 0.25722119212150574, "learning_rate": 1.3699382586994325e-06, "loss": 0.183, "step": 9363 }, { "epoch": 1.8951629224853268, "grad_norm": 0.2599545419216156, "learning_rate": 1.3646964081561964e-06, "loss": 0.1856, "step": 9364 }, { "epoch": 1.895365310665857, "grad_norm": 0.2868998944759369, "learning_rate": 1.359464536498023e-06, "loss": 0.2232, "step": 9365 }, { "epoch": 1.8955676988463872, "grad_norm": 0.31910818815231323, "learning_rate": 1.3542426442541889e-06, "loss": 0.1922, "step": 9366 }, { "epoch": 1.8957700870269176, "grad_norm": 0.3114806115627289, "learning_rate": 1.349030731953016e-06, "loss": 0.2096, "step": 9367 }, { "epoch": 1.895972475207448, "grad_norm": 0.28310254216194153, "learning_rate": 1.3438288001217714e-06, "loss": 0.1975, "step": 9368 }, { "epoch": 1.8961748633879782, "grad_norm": 0.2802981436252594, "learning_rate": 1.3386368492867451e-06, "loss": 0.2046, "step": 9369 }, { "epoch": 1.8963772515685084, "grad_norm": 0.3002050220966339, "learning_rate": 1.333454879973206e-06, "loss": 0.1872, "step": 9370 }, { "epoch": 1.8965796397490386, "grad_norm": 0.2511701285839081, "learning_rate": 1.3282828927054015e-06, "loss": 0.1808, "step": 9371 }, { "epoch": 1.8967820279295688, "grad_norm": 0.29675614833831787, "learning_rate": 1.3231208880065794e-06, "loss": 0.1823, "step": 9372 }, { "epoch": 1.896984416110099, "grad_norm": 0.2509481608867645, "learning_rate": 1.3179688663989886e-06, "loss": 0.1539, "step": 9373 }, { "epoch": 1.8971868042906295, "grad_norm": 0.27059900760650635, "learning_rate": 1.3128268284038347e-06, "loss": 0.1443, "step": 9374 }, { "epoch": 1.8973891924711597, "grad_norm": 0.2761165499687195, "learning_rate": 1.307694774541368e-06, "loss": 0.1681, "step": 9375 }, { "epoch": 1.89759158065169, "grad_norm": 0.33413439989089966, 
"learning_rate": 1.3025727053307624e-06, "loss": 0.2027, "step": 9376 }, { "epoch": 1.8977939688322203, "grad_norm": 0.2996768355369568, "learning_rate": 1.2974606212902473e-06, "loss": 0.2213, "step": 9377 }, { "epoch": 1.8979963570127505, "grad_norm": 0.3011522591114044, "learning_rate": 1.2923585229369762e-06, "loss": 0.1947, "step": 9378 }, { "epoch": 1.8981987451932807, "grad_norm": 0.34616440534591675, "learning_rate": 1.2872664107871467e-06, "loss": 0.2339, "step": 9379 }, { "epoch": 1.898401133373811, "grad_norm": 0.25703170895576477, "learning_rate": 1.2821842853559252e-06, "loss": 0.1944, "step": 9380 }, { "epoch": 1.898603521554341, "grad_norm": 0.2764713168144226, "learning_rate": 1.2771121471574555e-06, "loss": 0.1975, "step": 9381 }, { "epoch": 1.8988059097348715, "grad_norm": 0.2664838135242462, "learning_rate": 1.2720499967049048e-06, "loss": 0.1742, "step": 9382 }, { "epoch": 1.8990082979154017, "grad_norm": 0.30014511942863464, "learning_rate": 1.2669978345103861e-06, "loss": 0.1966, "step": 9383 }, { "epoch": 1.8992106860959321, "grad_norm": 0.2880495488643646, "learning_rate": 1.2619556610850346e-06, "loss": 0.2124, "step": 9384 }, { "epoch": 1.8994130742764623, "grad_norm": 0.24450325965881348, "learning_rate": 1.2569234769389648e-06, "loss": 0.1728, "step": 9385 }, { "epoch": 1.8996154624569925, "grad_norm": 0.28422245383262634, "learning_rate": 1.2519012825812804e-06, "loss": 0.1947, "step": 9386 }, { "epoch": 1.8998178506375227, "grad_norm": 0.27359914779663086, "learning_rate": 1.2468890785200637e-06, "loss": 0.1601, "step": 9387 }, { "epoch": 1.900020238818053, "grad_norm": 0.28732553124427795, "learning_rate": 1.2418868652624093e-06, "loss": 0.2172, "step": 9388 }, { "epoch": 1.9002226269985831, "grad_norm": 0.3102622628211975, "learning_rate": 1.2368946433143792e-06, "loss": 0.1719, "step": 9389 }, { "epoch": 1.9004250151791136, "grad_norm": 0.2415602207183838, "learning_rate": 1.2319124131810468e-06, "loss": 0.1882, "step": 9390 }, { 
"epoch": 1.9006274033596438, "grad_norm": 0.29216253757476807, "learning_rate": 1.2269401753664533e-06, "loss": 0.1517, "step": 9391 }, { "epoch": 1.9008297915401742, "grad_norm": 0.2574816346168518, "learning_rate": 1.2219779303736412e-06, "loss": 0.173, "step": 9392 }, { "epoch": 1.9010321797207044, "grad_norm": 0.2932865023612976, "learning_rate": 1.2170256787046308e-06, "loss": 0.1946, "step": 9393 }, { "epoch": 1.9012345679012346, "grad_norm": 0.24139507114887238, "learning_rate": 1.2120834208604436e-06, "loss": 0.1737, "step": 9394 }, { "epoch": 1.9014369560817648, "grad_norm": 0.28557559847831726, "learning_rate": 1.2071511573410909e-06, "loss": 0.1958, "step": 9395 }, { "epoch": 1.901639344262295, "grad_norm": 0.4012734293937683, "learning_rate": 1.2022288886455512e-06, "loss": 0.1906, "step": 9396 }, { "epoch": 1.9018417324428254, "grad_norm": 0.27204346656799316, "learning_rate": 1.1973166152718262e-06, "loss": 0.1798, "step": 9397 }, { "epoch": 1.9020441206233556, "grad_norm": 0.29930251836776733, "learning_rate": 1.192414337716885e-06, "loss": 0.1721, "step": 9398 }, { "epoch": 1.902246508803886, "grad_norm": 0.2984878122806549, "learning_rate": 1.1875220564766865e-06, "loss": 0.1977, "step": 9399 }, { "epoch": 1.9024488969844162, "grad_norm": 0.29395338892936707, "learning_rate": 1.182639772046179e-06, "loss": 0.2079, "step": 9400 }, { "epoch": 1.9024488969844162, "eval_loss": 0.2572198510169983, "eval_runtime": 0.7401, "eval_samples_per_second": 6.756, "eval_steps_per_second": 1.351, "step": 9400 }, { "epoch": 1.9026512851649464, "grad_norm": 0.27311617136001587, "learning_rate": 1.17776748491929e-06, "loss": 0.1829, "step": 9401 }, { "epoch": 1.9028536733454766, "grad_norm": 0.26347020268440247, "learning_rate": 1.1729051955889692e-06, "loss": 0.1704, "step": 9402 }, { "epoch": 1.9030560615260068, "grad_norm": 0.296898752450943, "learning_rate": 1.1680529045471123e-06, "loss": 0.2021, "step": 9403 }, { "epoch": 1.903258449706537, "grad_norm": 
0.311983197927475, "learning_rate": 1.1632106122846375e-06, "loss": 0.1891, "step": 9404 }, { "epoch": 1.9034608378870674, "grad_norm": 0.29881778359413147, "learning_rate": 1.15837831929142e-06, "loss": 0.193, "step": 9405 }, { "epoch": 1.9036632260675976, "grad_norm": 0.2944910228252411, "learning_rate": 1.1535560260563683e-06, "loss": 0.1827, "step": 9406 }, { "epoch": 1.903865614248128, "grad_norm": 0.3343166708946228, "learning_rate": 1.1487437330673146e-06, "loss": 0.2165, "step": 9407 }, { "epoch": 1.9040680024286583, "grad_norm": 0.28629180788993835, "learning_rate": 1.143941440811147e-06, "loss": 0.181, "step": 9408 }, { "epoch": 1.9042703906091885, "grad_norm": 0.24283328652381897, "learning_rate": 1.1391491497736995e-06, "loss": 0.1493, "step": 9409 }, { "epoch": 1.9044727787897187, "grad_norm": 0.25752711296081543, "learning_rate": 1.134366860439795e-06, "loss": 0.197, "step": 9410 }, { "epoch": 1.9046751669702489, "grad_norm": 0.30904826521873474, "learning_rate": 1.12959457329328e-06, "loss": 0.1888, "step": 9411 }, { "epoch": 1.904877555150779, "grad_norm": 0.2787197232246399, "learning_rate": 1.124832288816946e-06, "loss": 0.1935, "step": 9412 }, { "epoch": 1.9050799433313095, "grad_norm": 0.37014058232307434, "learning_rate": 1.1200800074925855e-06, "loss": 0.2135, "step": 9413 }, { "epoch": 1.9052823315118397, "grad_norm": 0.31408366560935974, "learning_rate": 1.1153377298010138e-06, "loss": 0.172, "step": 9414 }, { "epoch": 1.90548471969237, "grad_norm": 0.3022351861000061, "learning_rate": 1.1106054562219691e-06, "loss": 0.199, "step": 9415 }, { "epoch": 1.9056871078729003, "grad_norm": 0.2721509635448456, "learning_rate": 1.1058831872342357e-06, "loss": 0.1842, "step": 9416 }, { "epoch": 1.9058894960534305, "grad_norm": 0.2608672082424164, "learning_rate": 1.1011709233155532e-06, "loss": 0.1817, "step": 9417 }, { "epoch": 1.9060918842339607, "grad_norm": 0.2617418169975281, "learning_rate": 1.0964686649426736e-06, "loss": 0.2253, "step": 9418 
}, { "epoch": 1.906294272414491, "grad_norm": 0.2891494631767273, "learning_rate": 1.0917764125913055e-06, "loss": 0.2057, "step": 9419 }, { "epoch": 1.906496660595021, "grad_norm": 0.22889705002307892, "learning_rate": 1.087094166736169e-06, "loss": 0.1462, "step": 9420 }, { "epoch": 1.9066990487755515, "grad_norm": 0.240357905626297, "learning_rate": 1.0824219278509518e-06, "loss": 0.1588, "step": 9421 }, { "epoch": 1.9069014369560817, "grad_norm": 0.23243705928325653, "learning_rate": 1.0777596964083647e-06, "loss": 0.1482, "step": 9422 }, { "epoch": 1.9071038251366121, "grad_norm": 0.2566569447517395, "learning_rate": 1.073107472880075e-06, "loss": 0.1832, "step": 9423 }, { "epoch": 1.9073062133171423, "grad_norm": 0.26307451725006104, "learning_rate": 1.0684652577367394e-06, "loss": 0.1853, "step": 9424 }, { "epoch": 1.9075086014976725, "grad_norm": 0.2929825484752655, "learning_rate": 1.0638330514480154e-06, "loss": 0.2002, "step": 9425 }, { "epoch": 1.9077109896782027, "grad_norm": 0.23552408814430237, "learning_rate": 1.0592108544825286e-06, "loss": 0.1549, "step": 9426 }, { "epoch": 1.907913377858733, "grad_norm": 0.30096322298049927, "learning_rate": 1.0545986673079155e-06, "loss": 0.2031, "step": 9427 }, { "epoch": 1.9081157660392634, "grad_norm": 0.2977411150932312, "learning_rate": 1.0499964903908033e-06, "loss": 0.2103, "step": 9428 }, { "epoch": 1.9083181542197936, "grad_norm": 0.25064200162887573, "learning_rate": 1.0454043241967636e-06, "loss": 0.152, "step": 9429 }, { "epoch": 1.908520542400324, "grad_norm": 0.33590954542160034, "learning_rate": 1.040822169190392e-06, "loss": 0.2018, "step": 9430 }, { "epoch": 1.9087229305808542, "grad_norm": 0.2594203054904938, "learning_rate": 1.0362500258352725e-06, "loss": 0.179, "step": 9431 }, { "epoch": 1.9089253187613844, "grad_norm": 0.2659510672092438, "learning_rate": 1.0316878945939579e-06, "loss": 0.1853, "step": 9432 }, { "epoch": 1.9091277069419146, "grad_norm": 0.3166869282722473, "learning_rate": 
1.027135775928001e-06, "loss": 0.2172, "step": 9433 }, { "epoch": 1.9093300951224448, "grad_norm": 0.2751993238925934, "learning_rate": 1.0225936702979333e-06, "loss": 0.1769, "step": 9434 }, { "epoch": 1.909532483302975, "grad_norm": 0.3165026903152466, "learning_rate": 1.0180615781632762e-06, "loss": 0.2001, "step": 9435 }, { "epoch": 1.9097348714835054, "grad_norm": 0.296625018119812, "learning_rate": 1.013539499982552e-06, "loss": 0.2236, "step": 9436 }, { "epoch": 1.9099372596640356, "grad_norm": 0.256110817193985, "learning_rate": 1.009027436213239e-06, "loss": 0.1433, "step": 9437 }, { "epoch": 1.910139647844566, "grad_norm": 0.22374269366264343, "learning_rate": 1.0045253873118387e-06, "loss": 0.1515, "step": 9438 }, { "epoch": 1.9103420360250962, "grad_norm": 0.2797078788280487, "learning_rate": 1.0000333537337981e-06, "loss": 0.1829, "step": 9439 }, { "epoch": 1.9105444242056264, "grad_norm": 0.2668952941894531, "learning_rate": 9.955513359335978e-07, "loss": 0.213, "step": 9440 }, { "epoch": 1.9107468123861566, "grad_norm": 0.30301371216773987, "learning_rate": 9.910793343646751e-07, "loss": 0.204, "step": 9441 }, { "epoch": 1.9109492005666868, "grad_norm": 0.26662030816078186, "learning_rate": 9.866173494794462e-07, "loss": 0.1984, "step": 9442 }, { "epoch": 1.911151588747217, "grad_norm": 0.33214861154556274, "learning_rate": 9.821653817293498e-07, "loss": 0.1967, "step": 9443 }, { "epoch": 1.9113539769277474, "grad_norm": 0.29728424549102783, "learning_rate": 9.7772343156477e-07, "loss": 0.202, "step": 9444 }, { "epoch": 1.9115563651082776, "grad_norm": 0.26122555136680603, "learning_rate": 9.732914994351029e-07, "loss": 0.1525, "step": 9445 }, { "epoch": 1.911758753288808, "grad_norm": 0.26041847467422485, "learning_rate": 9.688695857887343e-07, "loss": 0.1637, "step": 9446 }, { "epoch": 1.9119611414693383, "grad_norm": 0.3184111714363098, "learning_rate": 9.644576910730174e-07, "loss": 0.1806, "step": 9447 }, { "epoch": 1.9121635296498685, 
"grad_norm": 0.37006789445877075, "learning_rate": 9.600558157342955e-07, "loss": 0.1855, "step": 9448 }, { "epoch": 1.9123659178303987, "grad_norm": 0.24486856162548065, "learning_rate": 9.556639602179229e-07, "loss": 0.1956, "step": 9449 }, { "epoch": 1.9125683060109289, "grad_norm": 0.2659735679626465, "learning_rate": 9.512821249682002e-07, "loss": 0.1842, "step": 9450 }, { "epoch": 1.9125683060109289, "eval_loss": 0.2572857439517975, "eval_runtime": 0.7408, "eval_samples_per_second": 6.75, "eval_steps_per_second": 1.35, "step": 9450 }, { "epoch": 1.912770694191459, "grad_norm": 0.2926306426525116, "learning_rate": 9.469103104284505e-07, "loss": 0.1947, "step": 9451 }, { "epoch": 1.9129730823719895, "grad_norm": 0.31365111470222473, "learning_rate": 9.425485170409642e-07, "loss": 0.2003, "step": 9452 }, { "epoch": 1.9131754705525197, "grad_norm": 0.27288708090782166, "learning_rate": 9.381967452470219e-07, "loss": 0.1724, "step": 9453 }, { "epoch": 1.91337785873305, "grad_norm": 0.3712238371372223, "learning_rate": 9.338549954868825e-07, "loss": 0.1556, "step": 9454 }, { "epoch": 1.9135802469135803, "grad_norm": 0.32234710454940796, "learning_rate": 9.295232681998167e-07, "loss": 0.2452, "step": 9455 }, { "epoch": 1.9137826350941105, "grad_norm": 0.3090566396713257, "learning_rate": 9.252015638240408e-07, "loss": 0.2055, "step": 9456 }, { "epoch": 1.9139850232746407, "grad_norm": 0.26599106192588806, "learning_rate": 9.208898827967938e-07, "loss": 0.1852, "step": 9457 }, { "epoch": 1.914187411455171, "grad_norm": 0.27205586433410645, "learning_rate": 9.165882255542824e-07, "loss": 0.1567, "step": 9458 }, { "epoch": 1.9143897996357013, "grad_norm": 0.23347721993923187, "learning_rate": 9.122965925317029e-07, "loss": 0.1577, "step": 9459 }, { "epoch": 1.9145921878162315, "grad_norm": 0.24848300218582153, "learning_rate": 9.080149841632523e-07, "loss": 0.1838, "step": 9460 }, { "epoch": 1.914794575996762, "grad_norm": 0.30119454860687256, "learning_rate": 
9.037434008820733e-07, "loss": 0.2033, "step": 9461 }, { "epoch": 1.9149969641772921, "grad_norm": 0.30918315052986145, "learning_rate": 8.994818431203311e-07, "loss": 0.1813, "step": 9462 }, { "epoch": 1.9151993523578223, "grad_norm": 0.2923755347728729, "learning_rate": 8.952303113091697e-07, "loss": 0.2069, "step": 9463 }, { "epoch": 1.9154017405383525, "grad_norm": 0.29802224040031433, "learning_rate": 8.909888058787008e-07, "loss": 0.1863, "step": 9464 }, { "epoch": 1.9156041287188827, "grad_norm": 0.2790274918079376, "learning_rate": 8.867573272580587e-07, "loss": 0.2113, "step": 9465 }, { "epoch": 1.915806516899413, "grad_norm": 0.2592965066432953, "learning_rate": 8.825358758753232e-07, "loss": 0.1744, "step": 9466 }, { "epoch": 1.9160089050799434, "grad_norm": 0.2811659276485443, "learning_rate": 8.783244521575751e-07, "loss": 0.1633, "step": 9467 }, { "epoch": 1.9162112932604736, "grad_norm": 0.30291372537612915, "learning_rate": 8.741230565308956e-07, "loss": 0.187, "step": 9468 }, { "epoch": 1.916413681441004, "grad_norm": 0.2720467150211334, "learning_rate": 8.699316894203224e-07, "loss": 0.1707, "step": 9469 }, { "epoch": 1.9166160696215342, "grad_norm": 0.36524778604507446, "learning_rate": 8.657503512499055e-07, "loss": 0.2233, "step": 9470 }, { "epoch": 1.9168184578020644, "grad_norm": 0.3040582537651062, "learning_rate": 8.615790424426618e-07, "loss": 0.2155, "step": 9471 }, { "epoch": 1.9170208459825946, "grad_norm": 0.25449779629707336, "learning_rate": 8.574177634205982e-07, "loss": 0.185, "step": 9472 }, { "epoch": 1.9172232341631248, "grad_norm": 0.2561799883842468, "learning_rate": 8.532665146047225e-07, "loss": 0.1641, "step": 9473 }, { "epoch": 1.917425622343655, "grad_norm": 0.28597721457481384, "learning_rate": 8.491252964149987e-07, "loss": 0.1991, "step": 9474 }, { "epoch": 1.9176280105241854, "grad_norm": 0.2718763053417206, "learning_rate": 8.449941092704027e-07, "loss": 0.1786, "step": 9475 }, { "epoch": 1.9178303987047156, 
"grad_norm": 0.29240909218788147, "learning_rate": 8.408729535888893e-07, "loss": 0.1701, "step": 9476 }, { "epoch": 1.918032786885246, "grad_norm": 0.2624104619026184, "learning_rate": 8.367618297873692e-07, "loss": 0.1887, "step": 9477 }, { "epoch": 1.9182351750657762, "grad_norm": 0.26697927713394165, "learning_rate": 8.326607382817875e-07, "loss": 0.1584, "step": 9478 }, { "epoch": 1.9184375632463064, "grad_norm": 0.2614873945713043, "learning_rate": 8.285696794870457e-07, "loss": 0.1583, "step": 9479 }, { "epoch": 1.9186399514268366, "grad_norm": 0.2983810305595398, "learning_rate": 8.244886538170238e-07, "loss": 0.1853, "step": 9480 }, { "epoch": 1.9188423396073668, "grad_norm": 0.276559978723526, "learning_rate": 8.204176616846026e-07, "loss": 0.1827, "step": 9481 }, { "epoch": 1.919044727787897, "grad_norm": 0.2633993923664093, "learning_rate": 8.163567035016417e-07, "loss": 0.148, "step": 9482 }, { "epoch": 1.9192471159684275, "grad_norm": 0.24610120058059692, "learning_rate": 8.123057796789901e-07, "loss": 0.1881, "step": 9483 }, { "epoch": 1.9194495041489577, "grad_norm": 0.2540454566478729, "learning_rate": 8.082648906264756e-07, "loss": 0.1712, "step": 9484 }, { "epoch": 1.919651892329488, "grad_norm": 0.27683067321777344, "learning_rate": 8.042340367529155e-07, "loss": 0.2107, "step": 9485 }, { "epoch": 1.9198542805100183, "grad_norm": 0.2888847589492798, "learning_rate": 8.002132184660949e-07, "loss": 0.1725, "step": 9486 }, { "epoch": 1.9200566686905485, "grad_norm": 0.2787870764732361, "learning_rate": 7.962024361728216e-07, "loss": 0.1869, "step": 9487 }, { "epoch": 1.9202590568710787, "grad_norm": 0.3224725127220154, "learning_rate": 7.922016902788488e-07, "loss": 0.2191, "step": 9488 }, { "epoch": 1.9204614450516089, "grad_norm": 0.2715289294719696, "learning_rate": 7.882109811889304e-07, "loss": 0.1768, "step": 9489 }, { "epoch": 1.9206638332321393, "grad_norm": 0.26549339294433594, "learning_rate": 7.842303093068105e-07, "loss": 0.1786, 
"step": 9490 }, { "epoch": 1.9208662214126695, "grad_norm": 0.3416059911251068, "learning_rate": 7.802596750351998e-07, "loss": 0.2178, "step": 9491 }, { "epoch": 1.9210686095932, "grad_norm": 0.2986985743045807, "learning_rate": 7.762990787758217e-07, "loss": 0.1809, "step": 9492 }, { "epoch": 1.9212709977737301, "grad_norm": 0.3280656039714813, "learning_rate": 7.723485209293668e-07, "loss": 0.215, "step": 9493 }, { "epoch": 1.9214733859542603, "grad_norm": 0.3719618320465088, "learning_rate": 7.684080018954931e-07, "loss": 0.2052, "step": 9494 }, { "epoch": 1.9216757741347905, "grad_norm": 0.2605191469192505, "learning_rate": 7.644775220728817e-07, "loss": 0.1855, "step": 9495 }, { "epoch": 1.9218781623153207, "grad_norm": 0.3293147385120392, "learning_rate": 7.60557081859159e-07, "loss": 0.2265, "step": 9496 }, { "epoch": 1.922080550495851, "grad_norm": 0.29550546407699585, "learning_rate": 7.566466816509743e-07, "loss": 0.2152, "step": 9497 }, { "epoch": 1.9222829386763813, "grad_norm": 0.28164321184158325, "learning_rate": 7.527463218439223e-07, "loss": 0.1796, "step": 9498 }, { "epoch": 1.9224853268569115, "grad_norm": 0.27279436588287354, "learning_rate": 7.488560028326097e-07, "loss": 0.1865, "step": 9499 }, { "epoch": 1.922687715037442, "grad_norm": 0.2678142189979553, "learning_rate": 7.449757250106105e-07, "loss": 0.1624, "step": 9500 }, { "epoch": 1.922687715037442, "eval_loss": 0.25728070735931396, "eval_runtime": 0.7406, "eval_samples_per_second": 6.751, "eval_steps_per_second": 1.35, "step": 9500 } ], "logging_steps": 1, "max_steps": 9882, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.1164472942121943e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }