diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,68053 @@ +{ + "best_metric": 0.2571257948875427, + "best_model_checkpoint": "./model_outputs/checkpoint-9250", + "epoch": 1.922687715037442, + "eval_steps": 50, + "global_step": 9500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00020238818053025704, + "grad_norm": 4.267368316650391, + "learning_rate": 4e-05, + "loss": 2.089, + "step": 1 + }, + { + "epoch": 0.0004047763610605141, + "grad_norm": 4.084916114807129, + "learning_rate": 8e-05, + "loss": 2.0306, + "step": 2 + }, + { + "epoch": 0.0006071645415907711, + "grad_norm": 1.5448576211929321, + "learning_rate": 0.00012, + "loss": 1.8954, + "step": 3 + }, + { + "epoch": 0.0008095527221210282, + "grad_norm": NaN, + "learning_rate": 0.00012, + "loss": 2.0132, + "step": 4 + }, + { + "epoch": 0.0010119409026512851, + "grad_norm": NaN, + "learning_rate": 0.00012, + "loss": 1.9737, + "step": 5 + }, + { + "epoch": 0.0012143290831815423, + "grad_norm": 22.60654067993164, + "learning_rate": 0.00016, + "loss": 2.0251, + "step": 6 + }, + { + "epoch": 0.0014167172637117992, + "grad_norm": 1.1628669500350952, + "learning_rate": 0.0002, + "loss": 1.7159, + "step": 7 + }, + { + "epoch": 0.0016191054442420564, + "grad_norm": 0.7777935862541199, + "learning_rate": 0.00019999999494152464, + "loss": 1.6294, + "step": 8 + }, + { + "epoch": 0.0018214936247723133, + "grad_norm": 0.9375593066215515, + "learning_rate": 0.0001999999797660991, + "loss": 1.5124, + "step": 9 + }, + { + "epoch": 0.0020238818053025702, + "grad_norm": 0.7026050090789795, + "learning_rate": 0.00019999995447372488, + "loss": 1.462, + "step": 10 + }, + { + "epoch": 0.002226269985832827, + "grad_norm": 0.6454700231552124, + "learning_rate": 0.00019999991906440454, + "loss": 1.4071, + "step": 11 + }, + { + "epoch": 0.0024286581663630845, + "grad_norm": 0.7032003998756409, + "learning_rate": 0.0001999998735381417, + "loss": 1.3653, + "step": 12 + }, + { + "epoch": 0.0026310463468933415, + "grad_norm": 0.5892922282218933, + "learning_rate": 0.00019999981789494092, + "loss": 1.435, + "step": 13 + }, + { + "epoch": 0.0028334345274235984, + "grad_norm": 0.5047839283943176, + "learning_rate": 0.00019999975213480785, + "loss": 1.354, + "step": 14 + }, + { + "epoch": 0.0030358227079538553, + "grad_norm": 0.5063280463218689, + "learning_rate": 0.00019999967625774917, + "loss": 1.3813, + "step": 15 + }, + { + "epoch": 0.0032382108884841127, + "grad_norm": 0.4734111726284027, + "learning_rate": 0.00019999959026377253, + "loss": 1.3379, + "step": 16 + }, + { + "epoch": 0.0034405990690143697, + "grad_norm": 0.5824829339981079, + "learning_rate": 0.0001999994941528866, + "loss": 1.3633, + "step": 17 + }, + { + "epoch": 0.0036429872495446266, + "grad_norm": 0.880375325679779, + "learning_rate": 0.00019999938792510116, + "loss": 1.3669, + "step": 18 + }, + { + "epoch": 0.0038453754300748835, + "grad_norm": 0.7805163264274597, + "learning_rate": 0.00019999927158042695, + "loss": 1.311, + "step": 19 + }, + { + "epoch": 0.0040477636106051405, + "grad_norm": 0.6171135902404785, + "learning_rate": 0.00019999914511887568, + "loss": 1.2861, + "step": 20 + }, + { + "epoch": 0.004250151791135397, + "grad_norm": 0.837437093257904, + "learning_rate": 0.00019999900854046022, + "loss": 1.2161, + "step": 21 + }, + { + "epoch": 0.004452539971665654, + "grad_norm": 0.6754611134529114, + "learning_rate": 0.0001999988618451943, + "loss": 1.2434, + "step": 22 + }, + { + "epoch": 0.004654928152195912, + "grad_norm": 0.6884850263595581, + "learning_rate": 0.0001999987050330929, + "loss": 1.2502, + "step": 23 + }, + { + "epoch": 0.004857316332726169, + "grad_norm": 0.7658131718635559, + "learning_rate": 0.00019999853810417174, + "loss": 1.2025, + "step": 24 + }, + { + "epoch": 0.005059704513256426, + "grad_norm": 0.6967308521270752, + "learning_rate": 0.00019999836105844777, + "loss": 1.2064, + "step": 25 + }, + { + "epoch": 0.005262092693786683, + "grad_norm": 0.7907235026359558, + "learning_rate": 0.0001999981738959389, + "loss": 1.1801, + "step": 26 + }, + { + "epoch": 0.00546448087431694, + "grad_norm": 0.6149998903274536, + "learning_rate": 0.00019999797661666407, + "loss": 1.1459, + "step": 27 + }, + { + "epoch": 0.005666869054847197, + "grad_norm": 0.6044988036155701, + "learning_rate": 0.00019999776922064323, + "loss": 1.1821, + "step": 28 + }, + { + "epoch": 0.005869257235377454, + "grad_norm": 0.687698245048523, + "learning_rate": 0.00019999755170789735, + "loss": 1.1662, + "step": 29 + }, + { + "epoch": 0.006071645415907711, + "grad_norm": 0.5864324569702148, + "learning_rate": 0.0001999973240784485, + "loss": 1.1516, + "step": 30 + }, + { + "epoch": 0.006274033596437968, + "grad_norm": 0.628131628036499, + "learning_rate": 0.00019999708633231962, + "loss": 1.1252, + "step": 31 + }, + { + "epoch": 0.0064764217769682254, + "grad_norm": 0.5752449035644531, + "learning_rate": 0.0001999968384695348, + "loss": 1.0934, + "step": 32 + }, + { + "epoch": 0.006678809957498482, + "grad_norm": 0.5566519498825073, + "learning_rate": 0.00019999658049011916, + "loss": 1.0986, + "step": 33 + }, + { + "epoch": 0.006881198138028739, + "grad_norm": 0.5842620730400085, + "learning_rate": 0.0001999963123940987, + "loss": 1.1155, + "step": 34 + }, + { + "epoch": 0.007083586318558996, + "grad_norm": 0.6081081628799438, + "learning_rate": 0.00019999603418150065, + "loss": 1.0845, + "step": 35 + }, + { + "epoch": 0.007285974499089253, + "grad_norm": 0.6527591347694397, + "learning_rate": 0.0001999957458523531, + "loss": 1.0682, + "step": 36 + }, + { + "epoch": 0.00748836267961951, + "grad_norm": 0.5430381894111633, + "learning_rate": 0.0001999954474066852, + "loss": 1.1304, + "step": 37 + }, + { + "epoch": 0.007690750860149767, + "grad_norm": 0.6899943947792053, + "learning_rate": 0.0001999951388445272, + "loss": 1.0612, + "step": 38 + }, + { + "epoch": 0.007893139040680024, + "grad_norm": 0.5707619190216064, + "learning_rate": 0.00019999482016591028, + "loss": 1.1377, + "step": 39 + }, + { + "epoch": 0.008095527221210281, + "grad_norm": 0.583000898361206, + "learning_rate": 0.00019999449137086668, + "loss": 1.1237, + "step": 40 + }, + { + "epoch": 0.008297915401740538, + "grad_norm": 0.5590953826904297, + "learning_rate": 0.00019999415245942968, + "loss": 1.0657, + "step": 41 + }, + { + "epoch": 0.008500303582270795, + "grad_norm": 0.5293363928794861, + "learning_rate": 0.00019999380343163354, + "loss": 1.1134, + "step": 42 + }, + { + "epoch": 0.008702691762801052, + "grad_norm": 0.5645998120307922, + "learning_rate": 0.0001999934442875136, + "loss": 1.0908, + "step": 43 + }, + { + "epoch": 0.008905079943331309, + "grad_norm": 0.5520191192626953, + "learning_rate": 0.0001999930750271062, + "loss": 1.0689, + "step": 44 + }, + { + "epoch": 0.009107468123861567, + "grad_norm": 0.594061017036438, + "learning_rate": 0.0001999926956504487, + "loss": 1.1155, + "step": 45 + }, + { + "epoch": 0.009309856304391824, + "grad_norm": 0.5787252187728882, + "learning_rate": 0.00019999230615757942, + "loss": 1.0742, + "step": 46 + }, + { + "epoch": 0.009512244484922081, + "grad_norm": 0.5889455676078796, + "learning_rate": 0.00019999190654853785, + "loss": 1.0397, + "step": 47 + }, + { + "epoch": 0.009714632665452338, + "grad_norm": 0.5654120445251465, + "learning_rate": 0.00019999149682336435, + "loss": 1.0794, + "step": 48 + }, + { + "epoch": 0.009917020845982595, + "grad_norm": 0.6551898121833801, + "learning_rate": 0.0001999910769821004, + "loss": 1.0397, + "step": 49 + }, + { + "epoch": 0.010119409026512852, + "grad_norm": 0.5888538360595703, + "learning_rate": 0.00019999064702478853, + "loss": 1.0434, + "step": 50 + }, + { + "epoch": 0.010119409026512852, + "eval_loss": 1.1003731489181519, + "eval_runtime": 0.7794, + "eval_samples_per_second": 6.415, + "eval_steps_per_second": 1.283, + "step": 50 + }, + { + "epoch": 0.010321797207043109, + "grad_norm": 0.581838846206665, + "learning_rate": 0.00019999020695147214, + "loss": 0.9748, + "step": 51 + }, + { + "epoch": 0.010524185387573366, + "grad_norm": 0.5969087481498718, + "learning_rate": 0.00019998975676219582, + "loss": 1.0262, + "step": 52 + }, + { + "epoch": 0.010726573568103623, + "grad_norm": 0.691875696182251, + "learning_rate": 0.00019998929645700505, + "loss": 0.9956, + "step": 53 + }, + { + "epoch": 0.01092896174863388, + "grad_norm": 0.522814929485321, + "learning_rate": 0.00019998882603594647, + "loss": 1.0366, + "step": 54 + }, + { + "epoch": 0.011131349929164137, + "grad_norm": 0.5945473313331604, + "learning_rate": 0.00019998834549906765, + "loss": 0.9809, + "step": 55 + }, + { + "epoch": 0.011333738109694394, + "grad_norm": 0.4513246715068817, + "learning_rate": 0.00019998785484641717, + "loss": 1.0609, + "step": 56 + }, + { + "epoch": 0.01153612629022465, + "grad_norm": 0.5156210064888, + "learning_rate": 0.0001999873540780447, + "loss": 1.0375, + "step": 57 + }, + { + "epoch": 0.011738514470754908, + "grad_norm": 0.5518094897270203, + "learning_rate": 0.00019998684319400093, + "loss": 1.1117, + "step": 58 + }, + { + "epoch": 0.011940902651285164, + "grad_norm": 0.5378175973892212, + "learning_rate": 0.00019998632219433749, + "loss": 1.0078, + "step": 59 + }, + { + "epoch": 0.012143290831815421, + "grad_norm": 0.5768158435821533, + "learning_rate": 0.00019998579107910713, + "loss": 1.1274, + "step": 60 + }, + { + "epoch": 0.012345679012345678, + "grad_norm": 0.5991278290748596, + "learning_rate": 0.00019998524984836356, + "loss": 1.018, + "step": 61 + }, + { + "epoch": 0.012548067192875935, + "grad_norm": 0.5535243153572083, + "learning_rate": 0.00019998469850216152, + "loss": 1.0184, + "step": 62 + }, + { + "epoch": 0.012750455373406194, + "grad_norm": 0.5592597723007202, + "learning_rate": 0.00019998413704055686, + "loss": 1.0218, + "step": 63 + }, + { + "epoch": 0.012952843553936451, + "grad_norm": 0.5260300040245056, + "learning_rate": 0.0001999835654636063, + "loss": 1.0673, + "step": 64 + }, + { + "epoch": 0.013155231734466708, + "grad_norm": 0.5915527939796448, + "learning_rate": 0.00019998298377136772, + "loss": 0.9527, + "step": 65 + }, + { + "epoch": 0.013357619914996965, + "grad_norm": 0.6970841288566589, + "learning_rate": 0.00019998239196389995, + "loss": 1.0103, + "step": 66 + }, + { + "epoch": 0.013560008095527222, + "grad_norm": 0.5182502865791321, + "learning_rate": 0.00019998179004126286, + "loss": 0.9288, + "step": 67 + }, + { + "epoch": 0.013762396276057479, + "grad_norm": 0.8228877186775208, + "learning_rate": 0.00019998117800351734, + "loss": 0.9993, + "step": 68 + }, + { + "epoch": 0.013964784456587736, + "grad_norm": 0.5726291537284851, + "learning_rate": 0.00019998055585072533, + "loss": 0.9788, + "step": 69 + }, + { + "epoch": 0.014167172637117992, + "grad_norm": 0.8187235593795776, + "learning_rate": 0.00019997992358294976, + "loss": 1.004, + "step": 70 + }, + { + "epoch": 0.01436956081764825, + "grad_norm": 0.6061872839927673, + "learning_rate": 0.00019997928120025463, + "loss": 1.0284, + "step": 71 + }, + { + "epoch": 0.014571948998178506, + "grad_norm": 0.7122519612312317, + "learning_rate": 0.00019997862870270488, + "loss": 1.0282, + "step": 72 + }, + { + "epoch": 0.014774337178708763, + "grad_norm": 0.5993272066116333, + "learning_rate": 0.0001999779660903665, + "loss": 1.0159, + "step": 73 + }, + { + "epoch": 0.01497672535923902, + "grad_norm": 0.6778062582015991, + "learning_rate": 0.00019997729336330663, + "loss": 0.9764, + "step": 74 + }, + { + "epoch": 0.015179113539769277, + "grad_norm": 0.5996508002281189, + "learning_rate": 0.00019997661052159323, + "loss": 0.9797, + "step": 75 + }, + { + "epoch": 0.015381501720299534, + "grad_norm": 0.6299217343330383, + "learning_rate": 0.00019997591756529541, + "loss": 0.9712, + "step": 76 + }, + { + "epoch": 0.015583889900829791, + "grad_norm": 0.7457549571990967, + "learning_rate": 0.00019997521449448331, + "loss": 1.0071, + "step": 77 + }, + { + "epoch": 0.015786278081360048, + "grad_norm": 0.6136026382446289, + "learning_rate": 0.00019997450130922802, + "loss": 0.9881, + "step": 78 + }, + { + "epoch": 0.015988666261890307, + "grad_norm": 0.6008905172348022, + "learning_rate": 0.00019997377800960172, + "loss": 0.9617, + "step": 79 + }, + { + "epoch": 0.016191054442420562, + "grad_norm": 0.5873702764511108, + "learning_rate": 0.0001999730445956776, + "loss": 0.9714, + "step": 80 + }, + { + "epoch": 0.01639344262295082, + "grad_norm": 0.5964879989624023, + "learning_rate": 0.0001999723010675298, + "loss": 1.0452, + "step": 81 + }, + { + "epoch": 0.016595830803481076, + "grad_norm": 0.6384466886520386, + "learning_rate": 0.00019997154742523358, + "loss": 0.9848, + "step": 82 + }, + { + "epoch": 0.016798218984011334, + "grad_norm": 0.5400208234786987, + "learning_rate": 0.00019997078366886518, + "loss": 0.9715, + "step": 83 + }, + { + "epoch": 0.01700060716454159, + "grad_norm": 0.5620110630989075, + "learning_rate": 0.00019997000979850188, + "loss": 0.9445, + "step": 84 + }, + { + "epoch": 0.01720299534507185, + "grad_norm": 0.7637978792190552, + "learning_rate": 0.00019996922581422196, + "loss": 1.0477, + "step": 85 + }, + { + "epoch": 0.017405383525602103, + "grad_norm": 0.5261138677597046, + "learning_rate": 0.0001999684317161047, + "loss": 0.9975, + "step": 86 + }, + { + "epoch": 0.017607771706132362, + "grad_norm": 0.6341800689697266, + "learning_rate": 0.00019996762750423052, + "loss": 0.9253, + "step": 87 + }, + { + "epoch": 0.017810159886662617, + "grad_norm": 0.5864059329032898, + "learning_rate": 0.0001999668131786807, + "loss": 0.9491, + "step": 88 + }, + { + "epoch": 0.018012548067192876, + "grad_norm": 0.5787160992622375, + "learning_rate": 0.0001999659887395377, + "loss": 0.8889, + "step": 89 + }, + { + "epoch": 0.018214936247723135, + "grad_norm": 0.6294256448745728, + "learning_rate": 0.0001999651541868849, + "loss": 0.9698, + "step": 90 + }, + { + "epoch": 0.01841732442825339, + "grad_norm": 0.5774893760681152, + "learning_rate": 0.0001999643095208067, + "loss": 0.9502, + "step": 91 + }, + { + "epoch": 0.01861971260878365, + "grad_norm": 0.6308877468109131, + "learning_rate": 0.00019996345474138858, + "loss": 0.9221, + "step": 92 + }, + { + "epoch": 0.018822100789313904, + "grad_norm": 0.6650766730308533, + "learning_rate": 0.000199962589848717, + "loss": 1.0439, + "step": 93 + }, + { + "epoch": 0.019024488969844162, + "grad_norm": 0.5616105198860168, + "learning_rate": 0.0001999617148428795, + "loss": 0.9611, + "step": 94 + }, + { + "epoch": 0.019226877150374418, + "grad_norm": 0.6718441843986511, + "learning_rate": 0.00019996082972396456, + "loss": 0.832, + "step": 95 + }, + { + "epoch": 0.019429265330904676, + "grad_norm": 0.6764196157455444, + "learning_rate": 0.00019995993449206174, + "loss": 0.9261, + "step": 96 + }, + { + "epoch": 0.01963165351143493, + "grad_norm": 0.6313098073005676, + "learning_rate": 0.0001999590291472616, + "loss": 0.9441, + "step": 97 + }, + { + "epoch": 0.01983404169196519, + "grad_norm": 0.5967774987220764, + "learning_rate": 0.00019995811368965578, + "loss": 0.9431, + "step": 98 + }, + { + "epoch": 0.020036429872495445, + "grad_norm": 0.6203646063804626, + "learning_rate": 0.00019995718811933685, + "loss": 0.9719, + "step": 99 + }, + { + "epoch": 0.020238818053025704, + "grad_norm": 0.5203957557678223, + "learning_rate": 0.0001999562524363985, + "loss": 0.9755, + "step": 100 + }, + { + "epoch": 0.020238818053025704, + "eval_loss": 0.9987107515335083, + "eval_runtime": 0.7376, + "eval_samples_per_second": 6.779, + "eval_steps_per_second": 1.356, + "step": 100 + }, + { + "epoch": 0.02044120623355596, + "grad_norm": 0.5949529409408569, + "learning_rate": 0.00019995530664093533, + "loss": 0.9831, + "step": 101 + }, + { + "epoch": 0.020643594414086218, + "grad_norm": 0.6670980453491211, + "learning_rate": 0.00019995435073304305, + "loss": 0.8731, + "step": 102 + }, + { + "epoch": 0.020845982594616473, + "grad_norm": 0.6265388131141663, + "learning_rate": 0.00019995338471281838, + "loss": 0.9228, + "step": 103 + }, + { + "epoch": 0.021048370775146732, + "grad_norm": 0.6850742697715759, + "learning_rate": 0.000199952408580359, + "loss": 1.0626, + "step": 104 + }, + { + "epoch": 0.021250758955676987, + "grad_norm": 0.6624189019203186, + "learning_rate": 0.00019995142233576377, + "loss": 1.0332, + "step": 105 + }, + { + "epoch": 0.021453147136207246, + "grad_norm": 0.6240122318267822, + "learning_rate": 0.0001999504259791324, + "loss": 0.9175, + "step": 106 + }, + { + "epoch": 0.0216555353167375, + "grad_norm": 0.6845401525497437, + "learning_rate": 0.00019994941951056568, + "loss": 0.9271, + "step": 107 + }, + { + "epoch": 0.02185792349726776, + "grad_norm": 0.5907098054885864, + "learning_rate": 0.00019994840293016545, + "loss": 0.8607, + "step": 108 + }, + { + "epoch": 0.022060311677798018, + "grad_norm": 0.7503966093063354, + "learning_rate": 0.00019994737623803456, + "loss": 0.9726, + "step": 109 + }, + { + "epoch": 0.022262699858328273, + "grad_norm": 0.677544891834259, + "learning_rate": 0.00019994633943427688, + "loss": 1.004, + "step": 110 + }, + { + "epoch": 0.022465088038858532, + "grad_norm": 0.6157703399658203, + "learning_rate": 0.0001999452925189973, + "loss": 0.9558, + "step": 111 + }, + { + "epoch": 0.022667476219388787, + "grad_norm": 0.5559118986129761, + "learning_rate": 0.00019994423549230173, + "loss": 0.981, + "step": 112 + }, + { + "epoch": 0.022869864399919046, + "grad_norm": 0.5934160947799683, + "learning_rate": 0.00019994316835429714, + "loss": 0.8549, + "step": 113 + }, + { + "epoch": 0.0230722525804493, + "grad_norm": 0.5715833902359009, + "learning_rate": 0.00019994209110509145, + "loss": 0.9944, + "step": 114 + }, + { + "epoch": 0.02327464076097956, + "grad_norm": 0.5714934468269348, + "learning_rate": 0.00019994100374479365, + "loss": 0.91, + "step": 115 + }, + { + "epoch": 0.023477028941509815, + "grad_norm": 0.613496720790863, + "learning_rate": 0.0001999399062735138, + "loss": 0.8761, + "step": 116 + }, + { + "epoch": 0.023679417122040074, + "grad_norm": 0.6440281271934509, + "learning_rate": 0.00019993879869136284, + "loss": 0.9539, + "step": 117 + }, + { + "epoch": 0.02388180530257033, + "grad_norm": 0.6116454601287842, + "learning_rate": 0.0001999376809984529, + "loss": 0.9385, + "step": 118 + }, + { + "epoch": 0.024084193483100588, + "grad_norm": 0.6269987225532532, + "learning_rate": 0.00019993655319489704, + "loss": 0.9607, + "step": 119 + }, + { + "epoch": 0.024286581663630843, + "grad_norm": 0.6190333366394043, + "learning_rate": 0.00019993541528080932, + "loss": 0.9325, + "step": 120 + }, + { + "epoch": 0.0244889698441611, + "grad_norm": 0.5847398042678833, + "learning_rate": 0.00019993426725630492, + "loss": 0.8744, + "step": 121 + }, + { + "epoch": 0.024691358024691357, + "grad_norm": 0.614730715751648, + "learning_rate": 0.00019993310912149996, + "loss": 0.965, + "step": 122 + }, + { + "epoch": 0.024893746205221615, + "grad_norm": 0.6264632940292358, + "learning_rate": 0.00019993194087651158, + "loss": 0.8992, + "step": 123 + }, + { + "epoch": 0.02509613438575187, + "grad_norm": 0.6432466506958008, + "learning_rate": 0.00019993076252145802, + "loss": 0.9558, + "step": 124 + }, + { + "epoch": 0.02529852256628213, + "grad_norm": 0.6524417996406555, + "learning_rate": 0.00019992957405645846, + "loss": 0.9221, + "step": 125 + }, + { + "epoch": 0.025500910746812388, + "grad_norm": 0.6003560423851013, + "learning_rate": 0.00019992837548163316, + "loss": 0.9247, + "step": 126 + }, + { + "epoch": 0.025703298927342643, + "grad_norm": 0.5921458601951599, + "learning_rate": 0.00019992716679710334, + "loss": 0.963, + "step": 127 + }, + { + "epoch": 0.025905687107872902, + "grad_norm": 0.5762323141098022, + "learning_rate": 0.00019992594800299131, + "loss": 0.9888, + "step": 128 + }, + { + "epoch": 0.026108075288403157, + "grad_norm": 0.5996343493461609, + "learning_rate": 0.00019992471909942042, + "loss": 0.8842, + "step": 129 + }, + { + "epoch": 0.026310463468933416, + "grad_norm": 0.6580969095230103, + "learning_rate": 0.00019992348008651488, + "loss": 0.9383, + "step": 130 + }, + { + "epoch": 0.02651285164946367, + "grad_norm": 0.646142303943634, + "learning_rate": 0.00019992223096440014, + "loss": 0.9079, + "step": 131 + }, + { + "epoch": 0.02671523982999393, + "grad_norm": 0.6356844305992126, + "learning_rate": 0.00019992097173320255, + "loss": 0.9245, + "step": 132 + }, + { + "epoch": 0.026917628010524185, + "grad_norm": 0.5754806995391846, + "learning_rate": 0.0001999197023930495, + "loss": 0.9361, + "step": 133 + }, + { + "epoch": 0.027120016191054443, + "grad_norm": 0.6589515805244446, + "learning_rate": 0.0001999184229440694, + "loss": 0.8845, + "step": 134 + }, + { + "epoch": 0.0273224043715847, + "grad_norm": 0.555176854133606, + "learning_rate": 0.0001999171333863917, + "loss": 0.9382, + "step": 135 + }, + { + "epoch": 0.027524792552114957, + "grad_norm": 0.6286085844039917, + "learning_rate": 0.00019991583372014687, + "loss": 0.9328, + "step": 136 + }, + { + "epoch": 0.027727180732645212, + "grad_norm": 0.6221725344657898, + "learning_rate": 0.00019991452394546637, + "loss": 0.9229, + "step": 137 + }, + { + "epoch": 0.02792956891317547, + "grad_norm": 0.5152042508125305, + "learning_rate": 0.00019991320406248275, + "loss": 0.9372, + "step": 138 + }, + { + "epoch": 0.028131957093705726, + "grad_norm": 0.602803647518158, + "learning_rate": 0.0001999118740713295, + "loss": 0.8926, + "step": 139 + }, + { + "epoch": 0.028334345274235985, + "grad_norm": 0.5117329955101013, + "learning_rate": 0.00019991053397214122, + "loss": 1.0066, + "step": 140 + }, + { + "epoch": 0.02853673345476624, + "grad_norm": 0.5424915552139282, + "learning_rate": 0.00019990918376505343, + "loss": 0.9543, + "step": 141 + }, + { + "epoch": 0.0287391216352965, + "grad_norm": 0.5950594544410706, + "learning_rate": 0.00019990782345020275, + "loss": 0.8684, + "step": 142 + }, + { + "epoch": 0.028941509815826754, + "grad_norm": 0.7003294825553894, + "learning_rate": 0.00019990645302772687, + "loss": 0.8938, + "step": 143 + }, + { + "epoch": 0.029143897996357013, + "grad_norm": 0.6386146545410156, + "learning_rate": 0.00019990507249776433, + "loss": 0.9564, + "step": 144 + }, + { + "epoch": 0.02934628617688727, + "grad_norm": 0.597125768661499, + "learning_rate": 0.0001999036818604549, + "loss": 0.9712, + "step": 145 + }, + { + "epoch": 0.029548674357417527, + "grad_norm": 0.6054093837738037, + "learning_rate": 0.00019990228111593919, + "loss": 0.9828, + "step": 146 + }, + { + "epoch": 0.029751062537947785, + "grad_norm": 0.6329893469810486, + "learning_rate": 0.0001999008702643589, + "loss": 0.94, + "step": 147 + }, + { + "epoch": 0.02995345071847804, + "grad_norm": 0.7184765934944153, + "learning_rate": 0.00019989944930585683, + "loss": 0.9074, + "step": 148 + }, + { + "epoch": 0.0301558388990083, + "grad_norm": 0.6599225401878357, + "learning_rate": 0.00019989801824057675, + "loss": 0.8902, + "step": 149 + }, + { + "epoch": 0.030358227079538554, + "grad_norm": 0.7230977416038513, + "learning_rate": 0.0001998965770686634, + "loss": 0.8653, + "step": 150 + }, + { + "epoch": 0.030358227079538554, + "eval_loss": 0.9306267499923706, + "eval_runtime": 0.7367, + "eval_samples_per_second": 6.787, + "eval_steps_per_second": 1.357, + "step": 150 + }, + { + "epoch": 0.030560615260068813, + "grad_norm": 0.7149731516838074, + "learning_rate": 0.00019989512579026252, + "loss": 0.9479, + "step": 151 + }, + { + "epoch": 0.030763003440599068, + "grad_norm": 0.5758787989616394, + "learning_rate": 0.00019989366440552103, + "loss": 0.9579, + "step": 152 + }, + { + "epoch": 0.030965391621129327, + "grad_norm": 0.6900405287742615, + "learning_rate": 0.00019989219291458677, + "loss": 0.8796, + "step": 153 + }, + { + "epoch": 0.031167779801659582, + "grad_norm": 0.6407442688941956, + "learning_rate": 0.0001998907113176086, + "loss": 0.8888, + "step": 154 + }, + { + "epoch": 0.03137016798218984, + "grad_norm": 0.5915456414222717, + "learning_rate": 0.00019988921961473633, + "loss": 0.9336, + "step": 155 + }, + { + "epoch": 0.031572556162720096, + "grad_norm": 0.6366065740585327, + "learning_rate": 0.000199887717806121, + "loss": 0.9005, + "step": 156 + }, + { + "epoch": 0.03177494434325035, + "grad_norm": 0.7416878342628479, + "learning_rate": 0.0001998862058919145, + "loss": 0.9671, + "step": 157 + }, + { + "epoch": 0.03197733252378061, + "grad_norm": 0.6920793652534485, + "learning_rate": 0.00019988468387226974, + "loss": 0.9283, + "step": 158 + }, + { + "epoch": 0.03217972070431087, + "grad_norm": 0.577690064907074, + "learning_rate": 0.00019988315174734078, + "loss": 0.932, + "step": 159 + }, + { + "epoch": 0.032382108884841124, + "grad_norm": 0.5838318467140198, + "learning_rate": 0.0001998816095172826, + "loss": 0.8758, + "step": 160 + }, + { + "epoch": 0.03258449706537138, + "grad_norm": 0.6302077174186707, + "learning_rate": 0.00019988005718225117, + "loss": 0.8934, + "step": 161 + }, + { + "epoch": 0.03278688524590164, + "grad_norm": 0.5870795845985413, + "learning_rate": 0.0001998784947424036, + "loss": 0.8459, + "step": 162 + }, + { + "epoch": 0.032989273426431896, + "grad_norm": 0.6889308094978333, + "learning_rate": 0.00019987692219789794, + "loss": 0.8667, + "step": 163 + }, + { + "epoch": 0.03319166160696215, + "grad_norm": 0.6996739506721497, + "learning_rate": 0.0001998753395488933, + "loss": 0.8623, + "step": 164 + }, + { + "epoch": 0.033394049787492414, + "grad_norm": 0.8726261854171753, + "learning_rate": 0.00019987374679554979, + "loss": 0.905, + "step": 165 + }, + { + "epoch": 0.03359643796802267, + "grad_norm": 0.597016453742981, + "learning_rate": 0.00019987214393802854, + "loss": 0.8773, + "step": 166 + }, + { + "epoch": 0.033798826148552924, + "grad_norm": 0.8725893497467041, + "learning_rate": 0.00019987053097649172, + "loss": 0.8896, + "step": 167 + }, + { + "epoch": 0.03400121432908318, + "grad_norm": 0.6847428679466248, + "learning_rate": 0.0001998689079111025, + "loss": 0.8226, + "step": 168 + }, + { + "epoch": 0.03420360250961344, + "grad_norm": 0.613000750541687, + "learning_rate": 0.00019986727474202506, + "loss": 0.8768, + "step": 169 + }, + { + "epoch": 0.0344059906901437, + "grad_norm": 0.7318368554115295, + "learning_rate": 0.00019986563146942468, + "loss": 0.9174, + "step": 170 + }, + { + "epoch": 0.03460837887067395, + "grad_norm": 0.6846932172775269, + "learning_rate": 0.0001998639780934676, + "loss": 0.8588, + "step": 171 + }, + { + "epoch": 0.03481076705120421, + "grad_norm": 0.6385796666145325, + "learning_rate": 0.00019986231461432106, + "loss": 0.9034, + "step": 172 + }, + { + "epoch": 0.03501315523173447, + "grad_norm": 0.6725485324859619, + "learning_rate": 0.00019986064103215339, + "loss": 0.8191, + "step": 173 + }, + { + "epoch": 0.035215543412264724, + "grad_norm": 0.6478608846664429, + "learning_rate": 0.00019985895734713386, + "loss": 0.8056, + "step": 174 + }, + { + "epoch": 0.03541793159279498, + "grad_norm": 0.6572886109352112, + "learning_rate": 0.00019985726355943283, + "loss": 0.8324, + "step": 175 + }, + { + "epoch": 0.035620319773325235, + "grad_norm": 0.7601284980773926, + "learning_rate": 0.00019985555966922167, + "loss": 0.8534, + "step": 176 + }, + { + "epoch": 0.0358227079538555, + "grad_norm": 0.7233675122261047, + "learning_rate": 0.00019985384567667279, + "loss": 0.8744, + "step": 177 + }, + { + "epoch": 0.03602509613438575, + "grad_norm": 0.731354296207428, + "learning_rate": 0.00019985212158195952, + "loss": 0.7982, + "step": 178 + }, + { + "epoch": 0.03622748431491601, + "grad_norm": 0.7109301090240479, + "learning_rate": 0.00019985038738525634, + "loss": 0.7729, + "step": 179 + }, + { + "epoch": 0.03642987249544627, + "grad_norm": 0.7613882422447205, + "learning_rate": 0.00019984864308673867, + "loss": 0.8453, + "step": 180 + }, + { + "epoch": 0.036632260675976525, + "grad_norm": 0.8680360317230225, + "learning_rate": 0.000199846888686583, + "loss": 0.788, + "step": 181 + }, + { + "epoch": 0.03683464885650678, + "grad_norm": 1.017104983329773, + "learning_rate": 0.00019984512418496682, + "loss": 0.8349, + "step": 182 + }, + { + "epoch": 0.037037037037037035, + "grad_norm": 0.6914694905281067, + "learning_rate": 0.00019984334958206862, + "loss": 0.8457, + "step": 183 + }, + { + "epoch": 0.0372394252175673, + "grad_norm": 0.8141409158706665, + "learning_rate": 0.00019984156487806799, + "loss": 0.7979, + "step": 184 + }, + { + "epoch": 0.03744181339809755, + "grad_norm": 0.840029239654541, + "learning_rate": 0.00019983977007314544, + "loss": 0.8428, + "step": 185 + }, + { + "epoch": 0.03764420157862781, + "grad_norm": 0.8123815655708313, + "learning_rate": 0.00019983796516748252, + "loss": 0.7008, + "step": 186 + }, + { + "epoch": 0.03784658975915806, + "grad_norm": 0.8500047326087952, + "learning_rate": 0.00019983615016126193, + "loss": 0.7789, + "step": 187 + }, + { + "epoch": 0.038048977939688325, + "grad_norm": 0.809622585773468, + "learning_rate": 0.00019983432505466718, + "loss": 0.7469, + "step": 188 + }, + { + "epoch": 0.03825136612021858, + "grad_norm": 0.7599986791610718, + "learning_rate": 0.00019983248984788303, + "loss": 0.8498, + "step": 189 + }, + { + "epoch": 0.038453754300748835, + "grad_norm": 1.0785133838653564, + "learning_rate": 0.00019983064454109505, + "loss": 0.7924, + "step": 190 + }, + { + "epoch": 0.03865614248127909, + "grad_norm": 1.275370717048645, + "learning_rate": 0.00019982878913448997, + "loss": 0.8975, + "step": 191 + }, + { + "epoch": 0.03885853066180935, + "grad_norm": 1.2063267230987549, + "learning_rate": 0.0001998269236282555, + "loss": 0.7749, + "step": 192 + }, + { + "epoch": 0.03906091884233961, + "grad_norm": 1.0798983573913574, + "learning_rate": 0.00019982504802258037, + "loss": 0.8307, + "step": 193 + }, + { + "epoch": 0.03926330702286986, + "grad_norm": 1.2071725130081177, + "learning_rate": 0.00019982316231765431, + "loss": 0.8576, + "step": 194 + }, + { + "epoch": 0.03946569520340012, + "grad_norm": 0.8345642685890198, + "learning_rate": 0.00019982126651366816, + "loss": 0.8485, + "step": 195 + }, + { + "epoch": 0.03966808338393038, + "grad_norm": 0.9822136759757996, + "learning_rate": 0.00019981936061081365, + "loss": 0.8924, + "step": 196 + }, + { + "epoch": 0.039870471564460636, + "grad_norm": 1.0564842224121094, + "learning_rate": 0.0001998174446092836, + "loss": 0.9139, + "step": 197 + }, + { + "epoch": 0.04007285974499089, + "grad_norm": 0.8812574148178101, + "learning_rate": 0.00019981551850927195, + "loss": 0.7048, + "step": 198 + }, + { + "epoch": 0.04027524792552115, + "grad_norm": 1.013770580291748, + "learning_rate": 0.00019981358231097344, + "loss": 0.8904, + "step": 199 + }, + { + "epoch": 0.04047763610605141, + "grad_norm": 1.1347284317016602, + "learning_rate": 0.00019981163601458403, + "loss": 0.7091, + "step": 200 + }, + { + "epoch": 0.04047763610605141, + "eval_loss": 0.7794874906539917, + "eval_runtime": 0.7382, + "eval_samples_per_second": 6.773, + "eval_steps_per_second": 1.355, + "step": 200 + }, + { + "epoch": 0.04068002428658166, + "grad_norm": 0.9756168127059937, + "learning_rate": 0.00019980967962030056, + "loss": 0.7894, + "step": 201 + }, + { + "epoch": 0.04088241246711192, + "grad_norm": 1.3444828987121582, + "learning_rate": 0.00019980771312832105, + "loss": 0.8338, + "step": 202 + }, + { + "epoch": 0.04108480064764218, + "grad_norm": 1.014088749885559, + "learning_rate": 0.00019980573653884435, + "loss": 0.7022, + "step": 203 + }, + { + "epoch": 0.041287188828172436, + "grad_norm": 1.1845611333847046, + "learning_rate": 0.0001998037498520705, + "loss": 0.7177, + "step": 204 + }, + { + "epoch": 0.04148957700870269, + "grad_norm": 1.521763801574707, + "learning_rate": 0.00019980175306820046, + "loss": 0.8397, + "step": 205 + }, + { + "epoch": 0.041691965189232946, + "grad_norm": 1.1498165130615234, + "learning_rate": 0.00019979974618743626, + "loss": 0.8348, + "step": 206 + }, + { + "epoch": 0.04189435336976321, + "grad_norm": 1.7773405313491821, + "learning_rate": 0.00019979772920998093, + "loss": 0.7595, + "step": 207 + }, + { + "epoch": 0.042096741550293464, + "grad_norm": 1.1249701976776123, + "learning_rate": 0.0001997957021360385, + "loss": 0.747, + "step": 208 + }, + { + "epoch": 0.04229912973082372, + "grad_norm": 1.0314749479293823, + "learning_rate": 0.00019979366496581408, + "loss": 0.7159, + "step": 209 + }, + { + "epoch": 0.042501517911353974, + "grad_norm": 1.208742618560791, + "learning_rate": 0.00019979161769951377, + "loss": 0.5837, + "step": 210 + }, + { + "epoch": 0.042703906091884236, + "grad_norm": 1.2238479852676392, + "learning_rate": 0.00019978956033734471, + "loss": 0.6674, + "step": 211 + }, + { + "epoch": 0.04290629427241449, + "grad_norm": 1.3177680969238281, + "learning_rate": 0.00019978749287951497, + "loss": 0.6537, + "step": 212 + }, + { + "epoch": 0.04310868245294475, + "grad_norm": 1.5105770826339722, + "learning_rate": 0.00019978541532623379, + "loss": 0.7395, + "step": 213 + }, + { + "epoch": 0.043311070633475, + "grad_norm": 1.1323720216751099, + "learning_rate": 0.0001997833276777113, + "loss": 0.6217, + "step": 214 + }, + { + "epoch": 0.043513458814005264, + "grad_norm": 1.1617493629455566, + "learning_rate": 0.00019978122993415874, + "loss": 0.6077, + "step": 215 + }, + { + "epoch": 0.04371584699453552, + "grad_norm": 1.0218966007232666, + "learning_rate": 0.00019977912209578834, + "loss": 0.5779, + "step": 216 + }, + { + "epoch": 0.043918235175065774, + "grad_norm": 1.3140915632247925, + "learning_rate": 0.00019977700416281332, + "loss": 0.5813, + "step": 217 + }, + { + "epoch": 0.044120623355596036, + "grad_norm": 1.1791021823883057, + "learning_rate": 0.00019977487613544797, + "loss": 0.5821, + "step": 218 + }, + { + "epoch": 0.04432301153612629, + "grad_norm": 1.0278735160827637, + "learning_rate": 0.00019977273801390758, + "loss": 0.5312, + "step": 219 + }, + { + "epoch": 0.04452539971665655, + "grad_norm": 1.026606559753418, + "learning_rate": 0.00019977058979840848, + "loss": 0.4905, + "step": 220 + }, + { + "epoch": 0.0447277878971868, + "grad_norm": 1.115782380104065, + "learning_rate": 0.00019976843148916795, + "loss": 0.567, + "step": 221 + }, + { + "epoch": 0.044930176077717064, + "grad_norm": 0.9572044610977173, + "learning_rate": 0.0001997662630864044, + "loss": 0.4892, + "step": 222 + }, + { + "epoch": 0.04513256425824732, + "grad_norm": 1.084818959236145, + "learning_rate": 0.0001997640845903372, + "loss": 0.5356, + "step": 223 + }, + { + "epoch": 0.045334952438777575, + "grad_norm": 0.996979832649231, + "learning_rate": 0.0001997618960011867, + "loss": 0.5158, + "step": 224 + }, + { + "epoch": 0.04553734061930783, + "grad_norm": 0.9665465354919434, + "learning_rate": 0.0001997596973191744, + "loss": 0.4367, + "step": 225 + }, + { + "epoch": 0.04573972879983809, + "grad_norm": 1.0468083620071411, + "learning_rate": 0.00019975748854452263, + "loss": 0.4481, + "step": 226 + }, + { + "epoch": 0.04594211698036835, + "grad_norm": 0.9373990297317505, + "learning_rate": 0.00019975526967745496, + "loss": 0.4282, + "step": 227 + }, + { + "epoch": 0.0461445051608986, + "grad_norm": 0.7844080328941345, + "learning_rate": 0.0001997530407181958, + "loss": 0.4338, + "step": 228 + }, + { + "epoch": 0.04634689334142886, + "grad_norm": 0.8212091326713562, + "learning_rate": 0.00019975080166697068, + "loss": 0.4436, + "step": 229 + }, + { + "epoch": 0.04654928152195912, + "grad_norm": 0.8424368500709534, + "learning_rate": 0.00019974855252400615, + "loss": 0.419, + "step": 230 + }, + { + "epoch": 0.046751669702489375, + "grad_norm": 0.8820065855979919, + "learning_rate": 0.00019974629328952967, + "loss": 0.3786, + "step": 231 + }, + { + "epoch": 0.04695405788301963, + "grad_norm": 0.8870169520378113, + "learning_rate": 0.00019974402396376992, + "loss": 0.3783, + "step": 232 + }, + { + "epoch": 0.047156446063549885, + "grad_norm": 0.9616879820823669, + "learning_rate": 0.0001997417445469564, + "loss": 0.5076, + "step": 233 + }, + { + "epoch": 0.04735883424408015, + "grad_norm": 1.1007381677627563, + "learning_rate": 0.00019973945503931972, + "loss": 0.4013, + "step": 234 + }, + { + "epoch": 0.0475612224246104, + "grad_norm": 0.9436114430427551, + "learning_rate": 0.00019973715544109157, + "loss": 0.3849, + "step": 235 + }, + { + "epoch": 0.04776361060514066, + "grad_norm": 0.9205026626586914, + "learning_rate": 0.00019973484575250457, + "loss": 0.4035, + "step": 236 + }, + { + "epoch": 0.04796599878567092, + "grad_norm": 0.8011429905891418, + "learning_rate": 0.00019973252597379234, + "loss": 0.4059, + "step": 237 + }, + { + "epoch": 0.048168386966201175, + "grad_norm": 0.8591095805168152, + "learning_rate": 0.00019973019610518966, + "loss": 0.3347, + "step": 238 + }, + { + "epoch": 0.04837077514673143, + "grad_norm": 1.0715107917785645, + "learning_rate": 0.00019972785614693215, + "loss": 0.3903, + "step": 239 + }, + { + "epoch": 0.048573163327261686, + "grad_norm": 0.7088459134101868, + "learning_rate": 0.00019972550609925662, + "loss": 0.3701, + "step": 240 + }, + { + "epoch": 0.04877555150779195, + "grad_norm": 1.190577507019043, + "learning_rate": 0.00019972314596240076, + "loss": 0.4032, + "step": 241 + }, + { + "epoch": 0.0489779396883222, + "grad_norm": 0.8017274141311646, + "learning_rate": 0.00019972077573660342, + "loss": 0.3818, + "step": 242 + }, + { + "epoch": 0.04918032786885246, + "grad_norm": 0.844630241394043, + "learning_rate": 0.00019971839542210434, + "loss": 0.3827, + "step": 243 + }, + { + "epoch": 0.04938271604938271, + "grad_norm": 1.1047987937927246, + "learning_rate": 0.00019971600501914432, + "loss": 0.3752, + "step": 244 + }, + { + "epoch": 0.049585104229912975, + "grad_norm": 0.7282317876815796, + "learning_rate": 0.00019971360452796522, + "loss": 0.3373, + "step": 245 + }, + { + "epoch": 0.04978749241044323, + "grad_norm": 0.8181321620941162, + "learning_rate": 0.00019971119394880988, + "loss": 0.3282, + "step": 246 + }, + { + "epoch": 0.049989880590973486, + "grad_norm": 0.9017817378044128, + "learning_rate": 0.00019970877328192224, + "loss": 0.3949, + "step": 247 + }, + { + "epoch": 0.05019226877150374, + "grad_norm": 0.5751867890357971, + "learning_rate": 0.0001997063425275471, + "loss": 0.3443, + "step": 248 + }, + { + "epoch": 0.050394656952034, + "grad_norm": 0.5587199926376343, + "learning_rate": 0.0001997039016859305, + "loss": 0.3545, + "step": 249 + }, + { + "epoch": 0.05059704513256426, + "grad_norm": 0.6850367188453674, + "learning_rate": 0.00019970145075731926, + "loss": 0.3835, + "step": 250 + }, + { + "epoch": 0.05059704513256426, + "eval_loss": 0.39252138137817383, + "eval_runtime": 0.7379, + "eval_samples_per_second": 6.776, + "eval_steps_per_second": 1.355, + "step": 250 + }, + { + "epoch": 0.050799433313094514, + "grad_norm": 0.706892728805542, + "learning_rate": 0.0001996989897419614, + "loss": 0.3413, + "step": 251 + }, + { + "epoch": 0.051001821493624776, + "grad_norm": 0.6600444912910461, + "learning_rate": 0.00019969651864010587, + "loss": 0.3494, + "step": 252 + }, + { + "epoch": 0.05120420967415503, + "grad_norm": 0.6261760592460632, + "learning_rate": 0.0001996940374520027, + "loss": 0.3519, + "step": 253 + }, + { + "epoch": 0.051406597854685286, + "grad_norm": 0.932579517364502, + "learning_rate": 0.00019969154617790292, + "loss": 0.4178, + "step": 254 + }, + { + "epoch": 0.05160898603521554, + "grad_norm": 0.5545374751091003, + "learning_rate": 0.00019968904481805852, + "loss": 0.3525, + "step": 255 + }, + { + "epoch": 0.051811374215745803, + "grad_norm": 0.6660155057907104, + "learning_rate": 0.00019968653337272261, + "loss": 0.3532, + "step": 256 + }, + { + "epoch": 0.05201376239627606, + "grad_norm": 0.576330304145813, + "learning_rate": 0.00019968401184214924, + "loss": 0.3391, + "step": 257 + }, + { + "epoch": 0.052216150576806314, + "grad_norm": 0.7081141471862793, + "learning_rate": 0.00019968148022659352, + "loss": 0.3385, + "step": 258 + }, + { + "epoch": 0.05241853875733657, + "grad_norm": 0.687074601650238, + "learning_rate": 0.00019967893852631158, + "loss": 0.3058, + "step": 259 + }, + { + "epoch": 0.05262092693786683, + "grad_norm": 0.5989205241203308, + "learning_rate": 0.00019967638674156057, + "loss": 0.3554, + "step": 260 + }, + { + "epoch": 0.052823315118397086, + "grad_norm": 0.6394159197807312, + "learning_rate": 0.00019967382487259865, + "loss": 0.3622, + "step": 261 + }, + { + "epoch": 0.05302570329892734, + "grad_norm": 0.6000388264656067, + "learning_rate": 0.00019967125291968496, + "loss": 0.3167, + "step": 262 + }, + { + "epoch": 0.0532280914794576, + "grad_norm": 0.4961284101009369, + "learning_rate": 0.00019966867088307976, + "loss": 0.3073, + "step": 263 + }, + { + "epoch": 0.05343047965998786, + "grad_norm": 0.7699441909790039, + "learning_rate": 0.00019966607876304427, + "loss": 0.3673, + "step": 264 + }, + { + "epoch": 0.053632867840518114, + "grad_norm": 0.4595373272895813, + "learning_rate": 0.00019966347655984068, + "loss": 0.3183, + "step": 265 + }, + { + "epoch": 0.05383525602104837, + "grad_norm": 0.5160897970199585, + "learning_rate": 0.00019966086427373233, + "loss": 0.3031, + "step": 266 + }, + { + "epoch": 0.054037644201578625, + "grad_norm": 0.7531689405441284, + "learning_rate": 0.00019965824190498342, + "loss": 0.3117, + "step": 267 + }, + { + "epoch": 0.05424003238210889, + "grad_norm": 0.6607016324996948, + "learning_rate": 0.0001996556094538593, + "loss": 0.2863, + "step": 268 + }, + { + "epoch": 0.05444242056263914, + "grad_norm": 0.7021990418434143, + "learning_rate": 0.0001996529669206263, + "loss": 0.3234, + "step": 269 + }, + { + "epoch": 0.0546448087431694, + "grad_norm": 0.9091718792915344, + "learning_rate": 0.00019965031430555177, + "loss": 0.2699, + "step": 270 + }, + { + "epoch": 0.05484719692369966, + "grad_norm": 0.5182921886444092, + "learning_rate": 0.00019964765160890405, + "loss": 0.2926, + "step": 271 + }, + { + "epoch": 0.055049585104229914, + "grad_norm": 0.5352545380592346, + "learning_rate": 0.0001996449788309525, + "loss": 0.3292, + "step": 272 + }, + { + "epoch": 0.05525197328476017, + "grad_norm": 0.5082312226295471, + "learning_rate": 0.00019964229597196757, + "loss": 0.3404, + "step": 273 + }, + { + "epoch": 0.055454361465290425, + "grad_norm": 0.6266235113143921, + "learning_rate": 0.0001996396030322207, + "loss": 0.3178, + "step": 274 + }, + { + "epoch": 0.05565674964582069, + "grad_norm": 0.5914604663848877, + "learning_rate": 0.00019963690001198426, + "loss": 0.3164, + "step": 275 + }, + { + "epoch": 0.05585913782635094, + "grad_norm": 0.6798471212387085, + "learning_rate": 0.00019963418691153176, + "loss": 0.3606, + "step": 276 + }, + { + "epoch": 0.0560615260068812, + "grad_norm": 0.68598872423172, + "learning_rate": 0.0001996314637311377, + "loss": 0.3397, + "step": 277 + }, + { + "epoch": 0.05626391418741145, + "grad_norm": 0.5811251401901245, + "learning_rate": 0.00019962873047107757, + "loss": 0.3073, + "step": 278 + }, + { + "epoch": 0.056466302367941715, + "grad_norm": 0.5858151912689209, + "learning_rate": 0.00019962598713162786, + "loss": 0.3172, + "step": 279 + }, + { + "epoch": 0.05666869054847197, + "grad_norm": 0.5117138028144836, + "learning_rate": 0.00019962323371306616, + "loss": 0.3322, + "step": 280 + }, + { + "epoch": 0.056871078729002225, + "grad_norm": 0.6146894097328186, + "learning_rate": 0.000199620470215671, + "loss": 0.3634, + "step": 281 + }, + { + "epoch": 0.05707346690953248, + "grad_norm": 0.5277398228645325, + "learning_rate": 0.00019961769663972195, + "loss": 0.3092, + "step": 282 + }, + { + "epoch": 0.05727585509006274, + "grad_norm": 0.6759743094444275, + "learning_rate": 0.00019961491298549962, + "loss": 0.2589, + "step": 283 + }, + { + "epoch": 0.057478243270593, + "grad_norm": 0.45867061614990234, + "learning_rate": 0.00019961211925328566, + "loss": 0.3521, + "step": 284 + }, + { + "epoch": 0.05768063145112325, + "grad_norm": 0.7512937188148499, + "learning_rate": 0.0001996093154433627, + "loss": 0.3041, + "step": 285 + }, + { + "epoch": 0.05788301963165351, + "grad_norm": 0.5749024748802185, + "learning_rate": 0.00019960650155601437, + "loss": 0.328, + "step": 286 + }, + { + "epoch": 0.05808540781218377, + "grad_norm": 0.6303148865699768, + "learning_rate": 0.00019960367759152542, + "loss": 0.3481, + "step": 287 + }, + { + "epoch": 0.058287795992714025, + "grad_norm": 0.6051326990127563, + "learning_rate": 0.00019960084355018145, + "loss": 0.3371, + "step": 288 + }, + { + "epoch": 0.05849018417324428, + "grad_norm": 0.5055463910102844, + "learning_rate": 0.00019959799943226924, + "loss": 0.3593, + "step": 289 + }, + { + "epoch": 0.05869257235377454, + "grad_norm": 0.5214646458625793, + "learning_rate": 0.00019959514523807653, + "loss": 0.2492, + "step": 290 + }, + { + "epoch": 0.0588949605343048, + "grad_norm": 0.5222678780555725, + "learning_rate": 0.00019959228096789206, + "loss": 0.2845, + "step": 291 + }, + { + "epoch": 0.05909734871483505, + "grad_norm": 0.4554538130760193, + "learning_rate": 0.00019958940662200561, + "loss": 0.2753, + "step": 292 + }, + { + "epoch": 0.05929973689536531, + "grad_norm": 0.6560251116752625, + "learning_rate": 0.00019958652220070796, + "loss": 0.2887, + "step": 293 + }, + { + "epoch": 0.05950212507589557, + "grad_norm": 0.5200150012969971, + "learning_rate": 0.00019958362770429097, + "loss": 0.3094, + "step": 294 + }, + { + "epoch": 0.059704513256425826, + "grad_norm": 0.5180234909057617, + "learning_rate": 0.00019958072313304746, + "loss": 0.3173, + "step": 295 + }, + { + "epoch": 0.05990690143695608, + "grad_norm": 0.5091387033462524, + "learning_rate": 0.00019957780848727123, + "loss": 0.3008, + "step": 296 + }, + { + "epoch": 0.060109289617486336, + "grad_norm": 0.4643876254558563, + "learning_rate": 0.0001995748837672572, + "loss": 0.284, + "step": 297 + }, + { + "epoch": 0.0603116777980166, + "grad_norm": 0.6453856825828552, + "learning_rate": 0.00019957194897330128, + "loss": 0.3132, + "step": 298 + }, + { + "epoch": 0.060514065978546853, + "grad_norm": 0.5977994203567505, + "learning_rate": 0.00019956900410570037, + "loss": 0.3169, + "step": 299 + }, + { + "epoch": 0.06071645415907711, + "grad_norm": 0.5075947642326355, + "learning_rate": 0.00019956604916475235, + "loss": 0.2862, + "step": 300 + }, + { + "epoch": 0.06071645415907711, + "eval_loss": 0.3347070813179016, + "eval_runtime": 0.742, + "eval_samples_per_second": 6.739, + "eval_steps_per_second": 1.348, + "step": 300 + }, + { + "epoch": 0.060918842339607364, + "grad_norm": 0.7045861482620239, + "learning_rate": 0.00019956308415075626, + "loss": 0.2855, + "step": 301 + }, + { + "epoch": 0.061121230520137626, + "grad_norm": 0.5203765034675598, + "learning_rate": 0.000199560109064012, + "loss": 0.3551, + "step": 302 + }, + { + "epoch": 0.06132361870066788, + "grad_norm": 0.39355793595314026, + "learning_rate": 0.00019955712390482057, + "loss": 0.3208, + "step": 303 + }, + { + "epoch": 0.061526006881198136, + "grad_norm": 0.7209200859069824, + "learning_rate": 0.00019955412867348398, + "loss": 0.2472, + "step": 304 + }, + { + "epoch": 0.06172839506172839, + "grad_norm": 0.489533007144928, + "learning_rate": 0.00019955112337030525, + "loss": 0.2821, + "step": 305 + }, + { + "epoch": 0.061930783242258654, + "grad_norm": 0.4404822587966919, + "learning_rate": 0.00019954810799558846, + "loss": 0.3182, + "step": 306 + }, + { + "epoch": 0.06213317142278891, + "grad_norm": 0.5157645344734192, + "learning_rate": 0.00019954508254963865, + "loss": 0.3032, + "step": 307 + }, + { + "epoch": 0.062335559603319164, + "grad_norm": 0.5006906390190125, + "learning_rate": 0.0001995420470327619, + "loss": 0.2992, + "step": 308 + }, + { + "epoch": 0.06253794778384943, + "grad_norm": 0.4116860330104828, + "learning_rate": 0.00019953900144526528, + "loss": 0.3001, + "step": 309 + }, + { + "epoch": 0.06274033596437968, + "grad_norm": 0.49453118443489075, + "learning_rate": 0.00019953594578745698, + "loss": 0.2938, + "step": 310 + }, + { + "epoch": 0.06294272414490994, + "grad_norm": 0.5103276371955872, + "learning_rate": 0.0001995328800596461, + "loss": 0.3285, + "step": 311 + }, + { + "epoch": 0.06314511232544019, + "grad_norm": 0.47672298550605774, + "learning_rate": 0.00019952980426214277, + "loss": 0.3173, + "step": 312 + }, + { + "epoch": 0.06334750050597045, + "grad_norm": 0.45488908886909485, + "learning_rate": 0.00019952671839525824, + "loss": 0.355, + "step": 313 + }, + { + "epoch": 0.0635498886865007, + "grad_norm": 0.47179242968559265, + "learning_rate": 0.00019952362245930467, + "loss": 0.2831, + "step": 314 + }, + { + "epoch": 0.06375227686703097, + "grad_norm": 0.5481370687484741, + "learning_rate": 0.00019952051645459525, + "loss": 0.3201, + "step": 315 + }, + { + "epoch": 0.06395466504756123, + "grad_norm": 0.44173574447631836, + "learning_rate": 0.00019951740038144422, + "loss": 0.3181, + "step": 316 + }, + { + "epoch": 0.06415705322809148, + "grad_norm": 0.5051449537277222, + "learning_rate": 0.00019951427424016687, + "loss": 0.3416, + "step": 317 + }, + { + "epoch": 0.06435944140862174, + "grad_norm": 0.49225544929504395, + "learning_rate": 0.0001995111380310794, + "loss": 0.2848, + "step": 318 + }, + { + "epoch": 0.06456182958915199, + "grad_norm": 0.472187340259552, + "learning_rate": 0.00019950799175449922, + "loss": 0.2474, + "step": 319 + }, + { + "epoch": 0.06476421776968225, + "grad_norm": 0.5424083471298218, + "learning_rate": 0.0001995048354107445, + "loss": 0.3382, + "step": 320 + }, + { + "epoch": 0.0649666059502125, + "grad_norm": 0.5043047070503235, + "learning_rate": 0.00019950166900013463, + "loss": 0.2834, + "step": 321 + }, + { + "epoch": 0.06516899413074276, + "grad_norm": 0.3971298038959503, + "learning_rate": 0.00019949849252298994, + "loss": 0.2451, + "step": 322 + }, + { + "epoch": 0.06537138231127303, + "grad_norm": 0.51171875, + "learning_rate": 0.00019949530597963185, + "loss": 0.3206, + "step": 323 + }, + { + "epoch": 0.06557377049180328, + "grad_norm": 0.4314689040184021, + "learning_rate": 0.00019949210937038266, + "loss": 0.2662, + "step": 324 + }, + { + "epoch": 0.06577615867233354, + "grad_norm": 0.5465297102928162, + "learning_rate": 0.00019948890269556578, + "loss": 0.335, + "step": 325 + }, + { + "epoch": 0.06597854685286379, + "grad_norm": 0.731951892375946, + "learning_rate": 0.0001994856859555057, + "loss": 0.2778, + "step": 326 + }, + { + "epoch": 0.06618093503339405, + "grad_norm": 0.6282134652137756, + "learning_rate": 0.00019948245915052778, + "loss": 0.3051, + "step": 327 + }, + { + "epoch": 0.0663833232139243, + "grad_norm": 0.5031090378761292, + "learning_rate": 0.0001994792222809585, + "loss": 0.2717, + "step": 328 + }, + { + "epoch": 0.06658571139445456, + "grad_norm": 0.5202815532684326, + "learning_rate": 0.00019947597534712531, + "loss": 0.2698, + "step": 329 + }, + { + "epoch": 0.06678809957498483, + "grad_norm": 0.42810511589050293, + "learning_rate": 0.00019947271834935677, + "loss": 0.2872, + "step": 330 + }, + { + "epoch": 0.06699048775551508, + "grad_norm": 0.5623170137405396, + "learning_rate": 0.00019946945128798232, + "loss": 0.2862, + "step": 331 + }, + { + "epoch": 0.06719287593604534, + "grad_norm": 0.4195396602153778, + "learning_rate": 0.00019946617416333252, + "loss": 0.2956, + "step": 332 + }, + { + "epoch": 0.06739526411657559, + "grad_norm": 0.5211588144302368, + "learning_rate": 0.0001994628869757389, + "loss": 0.2792, + "step": 333 + }, + { + "epoch": 0.06759765229710585, + "grad_norm": 0.39513328671455383, + "learning_rate": 0.00019945958972553403, + "loss": 0.2431, + "step": 334 + }, + { + "epoch": 0.0678000404776361, + "grad_norm": 0.6008070707321167, + "learning_rate": 0.00019945628241305148, + "loss": 0.271, + "step": 335 + }, + { + "epoch": 0.06800242865816636, + "grad_norm": 0.46714478731155396, + "learning_rate": 0.00019945296503862586, + "loss": 0.2815, + "step": 336 + }, + { + "epoch": 0.06820481683869661, + "grad_norm": 0.3947727680206299, + "learning_rate": 0.0001994496376025928, + "loss": 0.2492, + "step": 337 + }, + { + "epoch": 0.06840720501922688, + "grad_norm": 0.38070622086524963, + "learning_rate": 0.00019944630010528891, + "loss": 0.2768, + "step": 338 + }, + { + "epoch": 0.06860959319975714, + "grad_norm": 0.602689802646637, + "learning_rate": 0.00019944295254705185, + "loss": 0.2819, + "step": 339 + }, + { + "epoch": 0.0688119813802874, + "grad_norm": 0.7967174649238586, + "learning_rate": 0.0001994395949282203, + "loss": 0.2612, + "step": 340 + }, + { + "epoch": 0.06901436956081765, + "grad_norm": 0.4428558647632599, + "learning_rate": 0.00019943622724913395, + "loss": 0.3029, + "step": 341 + }, + { + "epoch": 0.0692167577413479, + "grad_norm": 0.5136562585830688, + "learning_rate": 0.00019943284951013347, + "loss": 0.2933, + "step": 342 + }, + { + "epoch": 0.06941914592187816, + "grad_norm": 0.5629222989082336, + "learning_rate": 0.00019942946171156063, + "loss": 0.3204, + "step": 343 + }, + { + "epoch": 0.06962153410240841, + "grad_norm": 0.4126991927623749, + "learning_rate": 0.00019942606385375816, + "loss": 0.3101, + "step": 344 + }, + { + "epoch": 0.06982392228293868, + "grad_norm": 0.5552066564559937, + "learning_rate": 0.0001994226559370698, + "loss": 0.3645, + "step": 345 + }, + { + "epoch": 0.07002631046346894, + "grad_norm": 0.42373156547546387, + "learning_rate": 0.0001994192379618404, + "loss": 0.2591, + "step": 346 + }, + { + "epoch": 0.0702286986439992, + "grad_norm": 0.3356430232524872, + "learning_rate": 0.0001994158099284156, + "loss": 0.2292, + "step": 347 + }, + { + "epoch": 0.07043108682452945, + "grad_norm": 0.5170024037361145, + "learning_rate": 0.0001994123718371424, + "loss": 0.2949, + "step": 348 + }, + { + "epoch": 0.0706334750050597, + "grad_norm": 0.5372616052627563, + "learning_rate": 0.0001994089236883685, + "loss": 0.2594, + "step": 349 + }, + { + "epoch": 0.07083586318558996, + "grad_norm": 0.48990383744239807, + "learning_rate": 0.00019940546548244278, + "loss": 0.2907, + "step": 350 + }, + { + "epoch": 0.07083586318558996, + "eval_loss": 0.31532177329063416, + "eval_runtime": 0.7386, + "eval_samples_per_second": 6.77, + "eval_steps_per_second": 1.354, + "step": 350 + }, + { + "epoch": 0.07103825136612021, + "grad_norm": 0.6602955460548401, + "learning_rate": 0.00019940199721971515, + "loss": 0.301, + "step": 351 + }, + { + "epoch": 0.07124063954665047, + "grad_norm": 0.5398226380348206, + "learning_rate": 0.00019939851890053643, + "loss": 0.2974, + "step": 352 + }, + { + "epoch": 0.07144302772718074, + "grad_norm": 0.44971731305122375, + "learning_rate": 0.00019939503052525853, + "loss": 0.2919, + "step": 353 + }, + { + "epoch": 0.071645415907711, + "grad_norm": 0.49475017189979553, + "learning_rate": 0.00019939153209423438, + "loss": 0.2624, + "step": 354 + }, + { + "epoch": 0.07184780408824125, + "grad_norm": 0.5040010809898376, + "learning_rate": 0.00019938802360781795, + "loss": 0.3241, + "step": 355 + }, + { + "epoch": 0.0720501922687715, + "grad_norm": 0.35557180643081665, + "learning_rate": 0.00019938450506636413, + "loss": 0.2849, + "step": 356 + }, + { + "epoch": 0.07225258044930176, + "grad_norm": 0.4988034665584564, + "learning_rate": 0.00019938097647022893, + "loss": 0.2742, + "step": 357 + }, + { + "epoch": 0.07245496862983201, + "grad_norm": 0.5231152772903442, + "learning_rate": 0.0001993774378197693, + "loss": 0.2937, + "step": 358 + }, + { + "epoch": 0.07265735681036227, + "grad_norm": 0.5258811712265015, + "learning_rate": 0.00019937388911534328, + "loss": 0.3157, + "step": 359 + }, + { + "epoch": 0.07285974499089254, + "grad_norm": 0.42325958609580994, + "learning_rate": 0.0001993703303573099, + "loss": 0.2782, + "step": 360 + }, + { + "epoch": 0.0730621331714228, + "grad_norm": 0.5115485191345215, + "learning_rate": 0.00019936676154602915, + "loss": 0.3047, + "step": 361 + }, + { + "epoch": 0.07326452135195305, + "grad_norm": 0.4342189133167267, + "learning_rate": 0.00019936318268186213, + "loss": 0.3054, + "step": 362 + }, + { + "epoch": 0.0734669095324833, + "grad_norm": 0.41122326254844666, + "learning_rate": 0.00019935959376517087, + "loss": 0.3087, + "step": 363 + }, + { + "epoch": 0.07366929771301356, + "grad_norm": 0.35741302371025085, + "learning_rate": 0.0001993559947963185, + "loss": 0.2805, + "step": 364 + }, + { + "epoch": 0.07387168589354381, + "grad_norm": 0.3425546884536743, + "learning_rate": 0.0001993523857756691, + "loss": 0.2627, + "step": 365 + }, + { + "epoch": 0.07407407407407407, + "grad_norm": 0.668941080570221, + "learning_rate": 0.0001993487667035878, + "loss": 0.275, + "step": 366 + }, + { + "epoch": 0.07427646225460433, + "grad_norm": 0.4595528542995453, + "learning_rate": 0.00019934513758044074, + "loss": 0.2705, + "step": 367 + }, + { + "epoch": 0.0744788504351346, + "grad_norm": 0.5246752500534058, + "learning_rate": 0.00019934149840659506, + "loss": 0.3243, + "step": 368 + }, + { + "epoch": 0.07468123861566485, + "grad_norm": 0.4581606388092041, + "learning_rate": 0.00019933784918241897, + "loss": 0.2529, + "step": 369 + }, + { + "epoch": 0.0748836267961951, + "grad_norm": 0.40944111347198486, + "learning_rate": 0.00019933418990828163, + "loss": 0.2986, + "step": 370 + }, + { + "epoch": 0.07508601497672536, + "grad_norm": 0.3873656392097473, + "learning_rate": 0.00019933052058455325, + "loss": 0.2519, + "step": 371 + }, + { + "epoch": 0.07528840315725562, + "grad_norm": 0.4861612915992737, + "learning_rate": 0.0001993268412116051, + "loss": 0.2279, + "step": 372 + }, + { + "epoch": 0.07549079133778587, + "grad_norm": 0.5027487277984619, + "learning_rate": 0.00019932315178980935, + "loss": 0.26, + "step": 373 + }, + { + "epoch": 0.07569317951831613, + "grad_norm": 0.42377370595932007, + "learning_rate": 0.00019931945231953927, + "loss": 0.2662, + "step": 374 + }, + { + "epoch": 0.07589556769884638, + "grad_norm": 0.4166119396686554, + "learning_rate": 0.0001993157428011692, + "loss": 0.2813, + "step": 375 + }, + { + "epoch": 0.07609795587937665, + "grad_norm": 0.4710562825202942, + "learning_rate": 0.00019931202323507434, + "loss": 0.2973, + "step": 376 + }, + { + "epoch": 0.0763003440599069, + "grad_norm": 0.41344717144966125, + "learning_rate": 0.00019930829362163106, + "loss": 0.2842, + "step": 377 + }, + { + "epoch": 0.07650273224043716, + "grad_norm": 0.567787766456604, + "learning_rate": 0.00019930455396121666, + "loss": 0.2918, + "step": 378 + }, + { + "epoch": 0.07670512042096742, + "grad_norm": 0.39079976081848145, + "learning_rate": 0.00019930080425420947, + "loss": 0.2845, + "step": 379 + }, + { + "epoch": 0.07690750860149767, + "grad_norm": 0.32150888442993164, + "learning_rate": 0.00019929704450098889, + "loss": 0.2704, + "step": 380 + }, + { + "epoch": 0.07710989678202793, + "grad_norm": 0.4112628996372223, + "learning_rate": 0.0001992932747019352, + "loss": 0.2013, + "step": 381 + }, + { + "epoch": 0.07731228496255818, + "grad_norm": 0.4239389896392822, + "learning_rate": 0.0001992894948574299, + "loss": 0.2387, + "step": 382 + }, + { + "epoch": 0.07751467314308845, + "grad_norm": 0.4737512767314911, + "learning_rate": 0.00019928570496785533, + "loss": 0.334, + "step": 383 + }, + { + "epoch": 0.0777170613236187, + "grad_norm": 0.36126965284347534, + "learning_rate": 0.0001992819050335949, + "loss": 0.2539, + "step": 384 + }, + { + "epoch": 0.07791944950414896, + "grad_norm": 0.4831307828426361, + "learning_rate": 0.00019927809505503307, + "loss": 0.2965, + "step": 385 + }, + { + "epoch": 0.07812183768467922, + "grad_norm": 0.5115605592727661, + "learning_rate": 0.00019927427503255534, + "loss": 0.2953, + "step": 386 + }, + { + "epoch": 0.07832422586520947, + "grad_norm": 0.38006240129470825, + "learning_rate": 0.0001992704449665481, + "loss": 0.2978, + "step": 387 + }, + { + "epoch": 0.07852661404573973, + "grad_norm": 0.4983793795108795, + "learning_rate": 0.00019926660485739887, + "loss": 0.2788, + "step": 388 + }, + { + "epoch": 0.07872900222626998, + "grad_norm": 0.41345569491386414, + "learning_rate": 0.00019926275470549617, + "loss": 0.2738, + "step": 389 + }, + { + "epoch": 0.07893139040680024, + "grad_norm": 0.4223555326461792, + "learning_rate": 0.00019925889451122953, + "loss": 0.2921, + "step": 390 + }, + { + "epoch": 0.0791337785873305, + "grad_norm": 0.5941314697265625, + "learning_rate": 0.00019925502427498942, + "loss": 0.2642, + "step": 391 + }, + { + "epoch": 0.07933616676786076, + "grad_norm": 0.4179031252861023, + "learning_rate": 0.0001992511439971674, + "loss": 0.2429, + "step": 392 + }, + { + "epoch": 0.07953855494839102, + "grad_norm": 0.4050491750240326, + "learning_rate": 0.0001992472536781561, + "loss": 0.2923, + "step": 393 + }, + { + "epoch": 0.07974094312892127, + "grad_norm": 0.38059109449386597, + "learning_rate": 0.00019924335331834904, + "loss": 0.2807, + "step": 394 + }, + { + "epoch": 0.07994333130945153, + "grad_norm": 0.42187055945396423, + "learning_rate": 0.00019923944291814084, + "loss": 0.2692, + "step": 395 + }, + { + "epoch": 0.08014571948998178, + "grad_norm": 0.42098861932754517, + "learning_rate": 0.0001992355224779271, + "loss": 0.3065, + "step": 396 + }, + { + "epoch": 0.08034810767051204, + "grad_norm": 0.38886579871177673, + "learning_rate": 0.00019923159199810453, + "loss": 0.2607, + "step": 397 + }, + { + "epoch": 0.0805504958510423, + "grad_norm": 0.4102109968662262, + "learning_rate": 0.00019922765147907065, + "loss": 0.292, + "step": 398 + }, + { + "epoch": 0.08075288403157256, + "grad_norm": 0.5397422909736633, + "learning_rate": 0.0001992237009212242, + "loss": 0.3344, + "step": 399 + }, + { + "epoch": 0.08095527221210282, + "grad_norm": 0.45712950825691223, + "learning_rate": 0.0001992197403249648, + "loss": 0.2833, + "step": 400 + }, + { + "epoch": 0.08095527221210282, + "eval_loss": 0.3104330003261566, + "eval_runtime": 0.7392, + "eval_samples_per_second": 6.764, + "eval_steps_per_second": 1.353, + "step": 400 + }, + { + "epoch": 0.08115766039263307, + "grad_norm": 0.3736235499382019, + "learning_rate": 0.0001992157696906932, + "loss": 0.2656, + "step": 401 + }, + { + "epoch": 0.08136004857316333, + "grad_norm": 0.3134549558162689, + "learning_rate": 0.0001992117890188111, + "loss": 0.2562, + "step": 402 + }, + { + "epoch": 0.08156243675369358, + "grad_norm": 0.3223460614681244, + "learning_rate": 0.00019920779830972116, + "loss": 0.2533, + "step": 403 + }, + { + "epoch": 0.08176482493422384, + "grad_norm": 0.39283445477485657, + "learning_rate": 0.0001992037975638272, + "loss": 0.2602, + "step": 404 + }, + { + "epoch": 0.08196721311475409, + "grad_norm": 0.44545242190361023, + "learning_rate": 0.00019919978678153391, + "loss": 0.2675, + "step": 405 + }, + { + "epoch": 0.08216960129528436, + "grad_norm": 0.373585045337677, + "learning_rate": 0.00019919576596324709, + "loss": 0.2244, + "step": 406 + }, + { + "epoch": 0.08237198947581462, + "grad_norm": 0.38392719626426697, + "learning_rate": 0.00019919173510937356, + "loss": 0.2645, + "step": 407 + }, + { + "epoch": 0.08257437765634487, + "grad_norm": 0.43317118287086487, + "learning_rate": 0.00019918769422032102, + "loss": 0.2755, + "step": 408 + }, + { + "epoch": 0.08277676583687513, + "grad_norm": 0.4535873234272003, + "learning_rate": 0.00019918364329649837, + "loss": 0.2632, + "step": 409 + }, + { + "epoch": 0.08297915401740538, + "grad_norm": 0.32678788900375366, + "learning_rate": 0.00019917958233831538, + "loss": 0.2825, + "step": 410 + }, + { + "epoch": 0.08318154219793564, + "grad_norm": 0.4875165522098541, + "learning_rate": 0.00019917551134618298, + "loss": 0.2603, + "step": 411 + }, + { + "epoch": 0.08338393037846589, + "grad_norm": 0.4667682647705078, + "learning_rate": 0.00019917143032051297, + "loss": 0.262, + "step": 412 + }, + { + "epoch": 0.08358631855899615, + "grad_norm": 0.32662510871887207, + "learning_rate": 0.00019916733926171823, + "loss": 0.222, + "step": 413 + }, + { + "epoch": 0.08378870673952642, + "grad_norm": 0.3929062485694885, + "learning_rate": 0.00019916323817021264, + "loss": 0.2742, + "step": 414 + }, + { + "epoch": 0.08399109492005667, + "grad_norm": 0.3036748170852661, + "learning_rate": 0.00019915912704641112, + "loss": 0.2677, + "step": 415 + }, + { + "epoch": 0.08419348310058693, + "grad_norm": 0.28849026560783386, + "learning_rate": 0.00019915500589072962, + "loss": 0.2248, + "step": 416 + }, + { + "epoch": 0.08439587128111718, + "grad_norm": 0.4197934567928314, + "learning_rate": 0.00019915087470358502, + "loss": 0.3324, + "step": 417 + }, + { + "epoch": 0.08459825946164744, + "grad_norm": 0.3673010766506195, + "learning_rate": 0.00019914673348539529, + "loss": 0.2377, + "step": 418 + }, + { + "epoch": 0.08480064764217769, + "grad_norm": 0.4578173756599426, + "learning_rate": 0.00019914258223657942, + "loss": 0.2526, + "step": 419 + }, + { + "epoch": 0.08500303582270795, + "grad_norm": 0.3941150903701782, + "learning_rate": 0.00019913842095755735, + "loss": 0.2522, + "step": 420 + }, + { + "epoch": 0.08520542400323822, + "grad_norm": 0.42108628153800964, + "learning_rate": 0.00019913424964875009, + "loss": 0.2516, + "step": 421 + }, + { + "epoch": 0.08540781218376847, + "grad_norm": 0.3706568777561188, + "learning_rate": 0.00019913006831057969, + "loss": 0.2809, + "step": 422 + }, + { + "epoch": 0.08561020036429873, + "grad_norm": 0.37976858019828796, + "learning_rate": 0.00019912587694346912, + "loss": 0.2495, + "step": 423 + }, + { + "epoch": 0.08581258854482898, + "grad_norm": 0.3249634802341461, + "learning_rate": 0.00019912167554784246, + "loss": 0.2631, + "step": 424 + }, + { + "epoch": 0.08601497672535924, + "grad_norm": 0.35132887959480286, + "learning_rate": 0.0001991174641241247, + "loss": 0.28, + "step": 425 + }, + { + "epoch": 0.0862173649058895, + "grad_norm": 0.3174639046192169, + "learning_rate": 0.00019911324267274196, + "loss": 0.272, + "step": 426 + }, + { + "epoch": 0.08641975308641975, + "grad_norm": 0.3432652950286865, + "learning_rate": 0.0001991090111941213, + "loss": 0.2545, + "step": 427 + }, + { + "epoch": 0.08662214126695, + "grad_norm": 0.4003385305404663, + "learning_rate": 0.00019910476968869082, + "loss": 0.2637, + "step": 428 + }, + { + "epoch": 0.08682452944748027, + "grad_norm": 0.2984203100204468, + "learning_rate": 0.0001991005181568797, + "loss": 0.2719, + "step": 429 + }, + { + "epoch": 0.08702691762801053, + "grad_norm": 0.5444194078445435, + "learning_rate": 0.00019909625659911794, + "loss": 0.2601, + "step": 430 + }, + { + "epoch": 0.08722930580854078, + "grad_norm": 0.43773773312568665, + "learning_rate": 0.00019909198501583678, + "loss": 0.2497, + "step": 431 + }, + { + "epoch": 0.08743169398907104, + "grad_norm": 0.3558596074581146, + "learning_rate": 0.00019908770340746829, + "loss": 0.2708, + "step": 432 + }, + { + "epoch": 0.0876340821696013, + "grad_norm": 0.37624043226242065, + "learning_rate": 0.00019908341177444575, + "loss": 0.2568, + "step": 433 + }, + { + "epoch": 0.08783647035013155, + "grad_norm": 0.37379008531570435, + "learning_rate": 0.0001990791101172032, + "loss": 0.2937, + "step": 434 + }, + { + "epoch": 0.0880388585306618, + "grad_norm": 0.44534832239151, + "learning_rate": 0.00019907479843617597, + "loss": 0.2665, + "step": 435 + }, + { + "epoch": 0.08824124671119207, + "grad_norm": 0.356768935918808, + "learning_rate": 0.00019907047673180023, + "loss": 0.2367, + "step": 436 + }, + { + "epoch": 0.08844363489172233, + "grad_norm": 0.3663882315158844, + "learning_rate": 0.00019906614500451314, + "loss": 0.2213, + "step": 437 + }, + { + "epoch": 0.08864602307225258, + "grad_norm": 0.6157559156417847, + "learning_rate": 0.000199061803254753, + "loss": 0.2592, + "step": 438 + }, + { + "epoch": 0.08884841125278284, + "grad_norm": 0.3663040101528168, + "learning_rate": 0.00019905745148295905, + "loss": 0.2704, + "step": 439 + }, + { + "epoch": 0.0890507994333131, + "grad_norm": 0.41940397024154663, + "learning_rate": 0.00019905308968957156, + "loss": 0.2783, + "step": 440 + }, + { + "epoch": 0.08925318761384335, + "grad_norm": 0.28495171666145325, + "learning_rate": 0.0001990487178750318, + "loss": 0.2551, + "step": 441 + }, + { + "epoch": 0.0894555757943736, + "grad_norm": 0.33689308166503906, + "learning_rate": 0.0001990443360397821, + "loss": 0.2662, + "step": 442 + }, + { + "epoch": 0.08965796397490386, + "grad_norm": 0.34871089458465576, + "learning_rate": 0.00019903994418426571, + "loss": 0.2518, + "step": 443 + }, + { + "epoch": 0.08986035215543413, + "grad_norm": 0.3919788897037506, + "learning_rate": 0.000199035542308927, + "loss": 0.2686, + "step": 444 + }, + { + "epoch": 0.09006274033596438, + "grad_norm": 0.33044981956481934, + "learning_rate": 0.00019903113041421126, + "loss": 0.2436, + "step": 445 + }, + { + "epoch": 0.09026512851649464, + "grad_norm": 0.4084889888763428, + "learning_rate": 0.0001990267085005649, + "loss": 0.2694, + "step": 446 + }, + { + "epoch": 0.0904675166970249, + "grad_norm": 0.43821752071380615, + "learning_rate": 0.00019902227656843523, + "loss": 0.286, + "step": 447 + }, + { + "epoch": 0.09066990487755515, + "grad_norm": 0.41732826828956604, + "learning_rate": 0.00019901783461827066, + "loss": 0.2943, + "step": 448 + }, + { + "epoch": 0.0908722930580854, + "grad_norm": 0.6563799381256104, + "learning_rate": 0.00019901338265052056, + "loss": 0.249, + "step": 449 + }, + { + "epoch": 0.09107468123861566, + "grad_norm": 0.3646370768547058, + "learning_rate": 0.0001990089206656353, + "loss": 0.2762, + "step": 450 + }, + { + "epoch": 0.09107468123861566, + "eval_loss": 0.3083243668079376, + "eval_runtime": 0.7375, + "eval_samples_per_second": 6.78, + "eval_steps_per_second": 1.356, + "step": 450 + }, + { + "epoch": 0.09127706941914593, + "grad_norm": 0.3965453803539276, + "learning_rate": 0.0001990044486640664, + "loss": 0.2632, + "step": 451 + }, + { + "epoch": 0.09147945759967618, + "grad_norm": 0.4205472469329834, + "learning_rate": 0.0001989999666462662, + "loss": 0.2831, + "step": 452 + }, + { + "epoch": 0.09168184578020644, + "grad_norm": 0.4374144971370697, + "learning_rate": 0.00019899547461268817, + "loss": 0.2215, + "step": 453 + }, + { + "epoch": 0.0918842339607367, + "grad_norm": 0.6188966631889343, + "learning_rate": 0.00019899097256378677, + "loss": 0.2578, + "step": 454 + }, + { + "epoch": 0.09208662214126695, + "grad_norm": 0.34634727239608765, + "learning_rate": 0.00019898646050001747, + "loss": 0.2623, + "step": 455 + }, + { + "epoch": 0.0922890103217972, + "grad_norm": 0.326874315738678, + "learning_rate": 0.00019898193842183672, + "loss": 0.2283, + "step": 456 + }, + { + "epoch": 0.09249139850232746, + "grad_norm": 0.3612661361694336, + "learning_rate": 0.00019897740632970207, + "loss": 0.2627, + "step": 457 + }, + { + "epoch": 0.09269378668285772, + "grad_norm": 0.3630461096763611, + "learning_rate": 0.00019897286422407203, + "loss": 0.2503, + "step": 458 + }, + { + "epoch": 0.09289617486338798, + "grad_norm": 0.4167366027832031, + "learning_rate": 0.00019896831210540605, + "loss": 0.2897, + "step": 459 + }, + { + "epoch": 0.09309856304391824, + "grad_norm": 0.41102683544158936, + "learning_rate": 0.00019896374997416475, + "loss": 0.2091, + "step": 460 + }, + { + "epoch": 0.0933009512244485, + "grad_norm": 0.4077226221561432, + "learning_rate": 0.0001989591778308096, + "loss": 0.3367, + "step": 461 + }, + { + "epoch": 0.09350333940497875, + "grad_norm": 0.5037345886230469, + "learning_rate": 0.00019895459567580325, + "loss": 0.2544, + "step": 462 + }, + { + "epoch": 0.093705727585509, + "grad_norm": 0.40945005416870117, + "learning_rate": 0.00019895000350960923, + "loss": 0.2801, + "step": 463 + }, + { + "epoch": 0.09390811576603926, + "grad_norm": 0.561182975769043, + "learning_rate": 0.00019894540133269208, + "loss": 0.2737, + "step": 464 + }, + { + "epoch": 0.09411050394656952, + "grad_norm": 0.33685287833213806, + "learning_rate": 0.00019894078914551748, + "loss": 0.2281, + "step": 465 + }, + { + "epoch": 0.09431289212709977, + "grad_norm": 0.3741171360015869, + "learning_rate": 0.000198936166948552, + "loss": 0.3131, + "step": 466 + }, + { + "epoch": 0.09451528030763004, + "grad_norm": 0.4491271674633026, + "learning_rate": 0.00019893153474226328, + "loss": 0.2873, + "step": 467 + }, + { + "epoch": 0.0947176684881603, + "grad_norm": 0.36082473397254944, + "learning_rate": 0.00019892689252711993, + "loss": 0.2676, + "step": 468 + }, + { + "epoch": 0.09492005666869055, + "grad_norm": 0.39217400550842285, + "learning_rate": 0.00019892224030359165, + "loss": 0.2614, + "step": 469 + }, + { + "epoch": 0.0951224448492208, + "grad_norm": 0.45073944330215454, + "learning_rate": 0.00019891757807214905, + "loss": 0.2643, + "step": 470 + }, + { + "epoch": 0.09532483302975106, + "grad_norm": 0.32423001527786255, + "learning_rate": 0.00019891290583326385, + "loss": 0.2128, + "step": 471 + }, + { + "epoch": 0.09552722121028132, + "grad_norm": 0.47926604747772217, + "learning_rate": 0.0001989082235874087, + "loss": 0.2651, + "step": 472 + }, + { + "epoch": 0.09572960939081157, + "grad_norm": 0.8147411346435547, + "learning_rate": 0.00019890353133505734, + "loss": 0.2591, + "step": 473 + }, + { + "epoch": 0.09593199757134184, + "grad_norm": 0.422296941280365, + "learning_rate": 0.00019889882907668445, + "loss": 0.2747, + "step": 474 + }, + { + "epoch": 0.0961343857518721, + "grad_norm": 0.4397691786289215, + "learning_rate": 0.00019889411681276578, + "loss": 0.2674, + "step": 475 + }, + { + "epoch": 0.09633677393240235, + "grad_norm": 0.6375408172607422, + "learning_rate": 0.00019888939454377805, + "loss": 0.3053, + "step": 476 + }, + { + "epoch": 0.0965391621129326, + "grad_norm": 0.40631526708602905, + "learning_rate": 0.00019888466227019902, + "loss": 0.302, + "step": 477 + }, + { + "epoch": 0.09674155029346286, + "grad_norm": 0.4207494556903839, + "learning_rate": 0.00019887991999250742, + "loss": 0.3136, + "step": 478 + }, + { + "epoch": 0.09694393847399312, + "grad_norm": 0.41882240772247314, + "learning_rate": 0.00019887516771118307, + "loss": 0.3117, + "step": 479 + }, + { + "epoch": 0.09714632665452337, + "grad_norm": 0.3637535572052002, + "learning_rate": 0.00019887040542670672, + "loss": 0.2458, + "step": 480 + }, + { + "epoch": 0.09734871483505363, + "grad_norm": 0.5529453158378601, + "learning_rate": 0.0001988656331395602, + "loss": 0.3039, + "step": 481 + }, + { + "epoch": 0.0975511030155839, + "grad_norm": 0.36648619174957275, + "learning_rate": 0.00019886085085022632, + "loss": 0.323, + "step": 482 + }, + { + "epoch": 0.09775349119611415, + "grad_norm": 0.3846886157989502, + "learning_rate": 0.00019885605855918885, + "loss": 0.2829, + "step": 483 + }, + { + "epoch": 0.0979558793766444, + "grad_norm": 0.4345422089099884, + "learning_rate": 0.0001988512562669327, + "loss": 0.2507, + "step": 484 + }, + { + "epoch": 0.09815826755717466, + "grad_norm": 0.39755547046661377, + "learning_rate": 0.00019884644397394366, + "loss": 0.245, + "step": 485 + }, + { + "epoch": 0.09836065573770492, + "grad_norm": 0.36251211166381836, + "learning_rate": 0.0001988416216807086, + "loss": 0.2837, + "step": 486 + }, + { + "epoch": 0.09856304391823517, + "grad_norm": 0.3217640221118927, + "learning_rate": 0.00019883678938771538, + "loss": 0.2498, + "step": 487 + }, + { + "epoch": 0.09876543209876543, + "grad_norm": 0.3786596953868866, + "learning_rate": 0.0001988319470954529, + "loss": 0.2828, + "step": 488 + }, + { + "epoch": 0.0989678202792957, + "grad_norm": 0.2914827764034271, + "learning_rate": 0.00019882709480441104, + "loss": 0.2648, + "step": 489 + }, + { + "epoch": 0.09917020845982595, + "grad_norm": 0.5327249765396118, + "learning_rate": 0.00019882223251508073, + "loss": 0.3445, + "step": 490 + }, + { + "epoch": 0.0993725966403562, + "grad_norm": 0.30911511182785034, + "learning_rate": 0.00019881736022795383, + "loss": 0.2566, + "step": 491 + }, + { + "epoch": 0.09957498482088646, + "grad_norm": 0.8316447734832764, + "learning_rate": 0.00019881247794352333, + "loss": 0.266, + "step": 492 + }, + { + "epoch": 0.09977737300141672, + "grad_norm": 0.32144424319267273, + "learning_rate": 0.00019880758566228314, + "loss": 0.2761, + "step": 493 + }, + { + "epoch": 0.09997976118194697, + "grad_norm": 0.4167412221431732, + "learning_rate": 0.00019880268338472819, + "loss": 0.2849, + "step": 494 + }, + { + "epoch": 0.10018214936247723, + "grad_norm": 0.39232102036476135, + "learning_rate": 0.00019879777111135444, + "loss": 0.253, + "step": 495 + }, + { + "epoch": 0.10038453754300748, + "grad_norm": 0.34585121273994446, + "learning_rate": 0.0001987928488426589, + "loss": 0.2682, + "step": 496 + }, + { + "epoch": 0.10058692572353775, + "grad_norm": 0.41705626249313354, + "learning_rate": 0.00019878791657913957, + "loss": 0.2431, + "step": 497 + }, + { + "epoch": 0.100789313904068, + "grad_norm": 0.38251325488090515, + "learning_rate": 0.0001987829743212954, + "loss": 0.3012, + "step": 498 + }, + { + "epoch": 0.10099170208459826, + "grad_norm": 0.43135866522789, + "learning_rate": 0.00019877802206962639, + "loss": 0.2738, + "step": 499 + }, + { + "epoch": 0.10119409026512852, + "grad_norm": 0.3900761902332306, + "learning_rate": 0.00019877305982463357, + "loss": 0.2732, + "step": 500 + }, + { + "epoch": 0.10119409026512852, + "eval_loss": 0.30053189396858215, + "eval_runtime": 0.7369, + "eval_samples_per_second": 6.785, + "eval_steps_per_second": 1.357, + "step": 500 + }, + { + "epoch": 0.10139647844565877, + "grad_norm": 0.3199058473110199, + "learning_rate": 0.00019876808758681897, + "loss": 0.2594, + "step": 501 + }, + { + "epoch": 0.10159886662618903, + "grad_norm": 0.3172077238559723, + "learning_rate": 0.00019876310535668564, + "loss": 0.2061, + "step": 502 + }, + { + "epoch": 0.10180125480671928, + "grad_norm": 0.3133200705051422, + "learning_rate": 0.00019875811313473763, + "loss": 0.2348, + "step": 503 + }, + { + "epoch": 0.10200364298724955, + "grad_norm": 0.331843763589859, + "learning_rate": 0.00019875311092147998, + "loss": 0.2242, + "step": 504 + }, + { + "epoch": 0.1022060311677798, + "grad_norm": 0.4921395778656006, + "learning_rate": 0.00019874809871741876, + "loss": 0.2463, + "step": 505 + }, + { + "epoch": 0.10240841934831006, + "grad_norm": 0.4361092746257782, + "learning_rate": 0.00019874307652306106, + "loss": 0.2927, + "step": 506 + }, + { + "epoch": 0.10261080752884032, + "grad_norm": 0.3385670781135559, + "learning_rate": 0.00019873804433891498, + "loss": 0.2534, + "step": 507 + }, + { + "epoch": 0.10281319570937057, + "grad_norm": 0.36069896817207336, + "learning_rate": 0.0001987330021654896, + "loss": 0.2575, + "step": 508 + }, + { + "epoch": 0.10301558388990083, + "grad_norm": 0.3283306062221527, + "learning_rate": 0.0001987279500032951, + "loss": 0.2374, + "step": 509 + }, + { + "epoch": 0.10321797207043108, + "grad_norm": 0.4866870939731598, + "learning_rate": 0.00019872288785284257, + "loss": 0.3021, + "step": 510 + }, + { + "epoch": 0.10342036025096134, + "grad_norm": 0.3630296587944031, + "learning_rate": 0.0001987178157146441, + "loss": 0.2506, + "step": 511 + }, + { + "epoch": 0.10362274843149161, + "grad_norm": 0.5312589406967163, + "learning_rate": 0.00019871273358921284, + "loss": 0.322, + "step": 512 + }, + { + "epoch": 0.10382513661202186, + "grad_norm": 0.3144959807395935, + "learning_rate": 0.00019870764147706304, + "loss": 0.2577, + "step": 513 + }, + { + "epoch": 0.10402752479255212, + "grad_norm": 0.32503482699394226, + "learning_rate": 0.00019870253937870978, + "loss": 0.2335, + "step": 514 + }, + { + "epoch": 0.10422991297308237, + "grad_norm": 0.34515443444252014, + "learning_rate": 0.00019869742729466925, + "loss": 0.2417, + "step": 515 + }, + { + "epoch": 0.10443230115361263, + "grad_norm": 0.43555840849876404, + "learning_rate": 0.00019869230522545866, + "loss": 0.239, + "step": 516 + }, + { + "epoch": 0.10463468933414288, + "grad_norm": 0.3596293032169342, + "learning_rate": 0.00019868717317159617, + "loss": 0.2569, + "step": 517 + }, + { + "epoch": 0.10483707751467314, + "grad_norm": 0.7044296264648438, + "learning_rate": 0.00019868203113360103, + "loss": 0.2603, + "step": 518 + }, + { + "epoch": 0.1050394656952034, + "grad_norm": 0.32274308800697327, + "learning_rate": 0.0001986768791119934, + "loss": 0.2647, + "step": 519 + }, + { + "epoch": 0.10524185387573366, + "grad_norm": 0.4142962396144867, + "learning_rate": 0.00019867171710729462, + "loss": 0.2608, + "step": 520 + }, + { + "epoch": 0.10544424205626392, + "grad_norm": 0.5066673159599304, + "learning_rate": 0.00019866654512002682, + "loss": 0.2298, + "step": 521 + }, + { + "epoch": 0.10564663023679417, + "grad_norm": 0.36532062292099, + "learning_rate": 0.00019866136315071326, + "loss": 0.2578, + "step": 522 + }, + { + "epoch": 0.10584901841732443, + "grad_norm": 0.30255767703056335, + "learning_rate": 0.00019865617119987824, + "loss": 0.2639, + "step": 523 + }, + { + "epoch": 0.10605140659785468, + "grad_norm": 0.5272563099861145, + "learning_rate": 0.000198650969268047, + "loss": 0.2291, + "step": 524 + }, + { + "epoch": 0.10625379477838494, + "grad_norm": 0.2946690022945404, + "learning_rate": 0.00019864575735574583, + "loss": 0.2495, + "step": 525 + }, + { + "epoch": 0.1064561829589152, + "grad_norm": 0.3070518374443054, + "learning_rate": 0.000198640535463502, + "loss": 0.2251, + "step": 526 + }, + { + "epoch": 0.10665857113944546, + "grad_norm": 0.37028443813323975, + "learning_rate": 0.00019863530359184381, + "loss": 0.2619, + "step": 527 + }, + { + "epoch": 0.10686095931997572, + "grad_norm": 0.3705368936061859, + "learning_rate": 0.00019863006174130056, + "loss": 0.2826, + "step": 528 + }, + { + "epoch": 0.10706334750050597, + "grad_norm": 0.3553207814693451, + "learning_rate": 0.0001986248099124026, + "loss": 0.2731, + "step": 529 + }, + { + "epoch": 0.10726573568103623, + "grad_norm": 0.306058406829834, + "learning_rate": 0.00019861954810568123, + "loss": 0.2201, + "step": 530 + }, + { + "epoch": 0.10746812386156648, + "grad_norm": 0.30540916323661804, + "learning_rate": 0.00019861427632166879, + "loss": 0.2476, + "step": 531 + }, + { + "epoch": 0.10767051204209674, + "grad_norm": 0.28308695554733276, + "learning_rate": 0.0001986089945608986, + "loss": 0.2123, + "step": 532 + }, + { + "epoch": 0.107872900222627, + "grad_norm": 0.3797146677970886, + "learning_rate": 0.00019860370282390505, + "loss": 0.2896, + "step": 533 + }, + { + "epoch": 0.10807528840315725, + "grad_norm": 0.40093159675598145, + "learning_rate": 0.0001985984011112235, + "loss": 0.2256, + "step": 534 + }, + { + "epoch": 0.10827767658368752, + "grad_norm": 0.5490695238113403, + "learning_rate": 0.00019859308942339027, + "loss": 0.2718, + "step": 535 + }, + { + "epoch": 0.10848006476421777, + "grad_norm": 0.3820257782936096, + "learning_rate": 0.00019858776776094278, + "loss": 0.2431, + "step": 536 + }, + { + "epoch": 0.10868245294474803, + "grad_norm": 0.3355390727519989, + "learning_rate": 0.00019858243612441945, + "loss": 0.2208, + "step": 537 + }, + { + "epoch": 0.10888484112527828, + "grad_norm": 0.28901511430740356, + "learning_rate": 0.00019857709451435963, + "loss": 0.2429, + "step": 538 + }, + { + "epoch": 0.10908722930580854, + "grad_norm": 0.3239997625350952, + "learning_rate": 0.00019857174293130375, + "loss": 0.2538, + "step": 539 + }, + { + "epoch": 0.1092896174863388, + "grad_norm": 0.37941160798072815, + "learning_rate": 0.0001985663813757932, + "loss": 0.2529, + "step": 540 + }, + { + "epoch": 0.10949200566686905, + "grad_norm": 0.39999866485595703, + "learning_rate": 0.00019856100984837042, + "loss": 0.2871, + "step": 541 + }, + { + "epoch": 0.10969439384739932, + "grad_norm": 0.3257283866405487, + "learning_rate": 0.0001985556283495789, + "loss": 0.2384, + "step": 542 + }, + { + "epoch": 0.10989678202792957, + "grad_norm": 0.7400651574134827, + "learning_rate": 0.000198550236879963, + "loss": 0.2667, + "step": 543 + }, + { + "epoch": 0.11009917020845983, + "grad_norm": 0.40268924832344055, + "learning_rate": 0.00019854483544006821, + "loss": 0.2802, + "step": 544 + }, + { + "epoch": 0.11030155838899008, + "grad_norm": 0.44967809319496155, + "learning_rate": 0.000198539424030441, + "loss": 0.2922, + "step": 545 + }, + { + "epoch": 0.11050394656952034, + "grad_norm": 0.3962949812412262, + "learning_rate": 0.00019853400265162883, + "loss": 0.2769, + "step": 546 + }, + { + "epoch": 0.1107063347500506, + "grad_norm": 0.388681560754776, + "learning_rate": 0.00019852857130418019, + "loss": 0.2319, + "step": 547 + }, + { + "epoch": 0.11090872293058085, + "grad_norm": 0.5428779125213623, + "learning_rate": 0.00019852312998864453, + "loss": 0.2121, + "step": 548 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.3685462176799774, + "learning_rate": 0.00019851767870557237, + "loss": 0.2442, + "step": 549 + }, + { + "epoch": 0.11131349929164137, + "grad_norm": 0.4083040952682495, + "learning_rate": 0.00019851221745551524, + "loss": 0.2762, + "step": 550 + }, + { + "epoch": 0.11131349929164137, + "eval_loss": 0.30158162117004395, + "eval_runtime": 0.7395, + "eval_samples_per_second": 6.761, + "eval_steps_per_second": 1.352, + "step": 550 + }, + { + "epoch": 0.11151588747217163, + "grad_norm": 0.43971139192581177, + "learning_rate": 0.00019850674623902558, + "loss": 0.2168, + "step": 551 + }, + { + "epoch": 0.11171827565270188, + "grad_norm": 0.4038715958595276, + "learning_rate": 0.00019850126505665698, + "loss": 0.2518, + "step": 552 + }, + { + "epoch": 0.11192066383323214, + "grad_norm": 0.5503129959106445, + "learning_rate": 0.00019849577390896396, + "loss": 0.2501, + "step": 553 + }, + { + "epoch": 0.1121230520137624, + "grad_norm": 0.779593825340271, + "learning_rate": 0.00019849027279650203, + "loss": 0.294, + "step": 554 + }, + { + "epoch": 0.11232544019429265, + "grad_norm": 0.29945191740989685, + "learning_rate": 0.00019848476171982772, + "loss": 0.2523, + "step": 555 + }, + { + "epoch": 0.1125278283748229, + "grad_norm": 0.33953744173049927, + "learning_rate": 0.00019847924067949862, + "loss": 0.2657, + "step": 556 + }, + { + "epoch": 0.11273021655535317, + "grad_norm": 0.4823399484157562, + "learning_rate": 0.00019847370967607331, + "loss": 0.2603, + "step": 557 + }, + { + "epoch": 0.11293260473588343, + "grad_norm": 0.330797016620636, + "learning_rate": 0.00019846816871011132, + "loss": 0.2241, + "step": 558 + }, + { + "epoch": 0.11313499291641368, + "grad_norm": 0.31722167134284973, + "learning_rate": 0.0001984626177821732, + "loss": 0.2284, + "step": 559 + }, + { + "epoch": 0.11333738109694394, + "grad_norm": 0.3729718327522278, + "learning_rate": 0.0001984570568928206, + "loss": 0.3019, + "step": 560 + }, + { + "epoch": 0.1135397692774742, + "grad_norm": 0.29263293743133545, + "learning_rate": 0.00019845148604261605, + "loss": 0.2791, + "step": 561 + }, + { + "epoch": 0.11374215745800445, + "grad_norm": 0.6067195534706116, + "learning_rate": 0.0001984459052321232, + "loss": 0.2651, + "step": 562 + }, + { + "epoch": 0.1139445456385347, + "grad_norm": 0.5350626111030579, + "learning_rate": 0.00019844031446190666, + "loss": 0.294, + "step": 563 + }, + { + "epoch": 0.11414693381906496, + "grad_norm": 0.346068412065506, + "learning_rate": 0.00019843471373253202, + "loss": 0.2733, + "step": 564 + }, + { + "epoch": 0.11434932199959523, + "grad_norm": 0.3487130105495453, + "learning_rate": 0.00019842910304456587, + "loss": 0.2378, + "step": 565 + }, + { + "epoch": 0.11455171018012549, + "grad_norm": 0.24237936735153198, + "learning_rate": 0.0001984234823985759, + "loss": 0.2055, + "step": 566 + }, + { + "epoch": 0.11475409836065574, + "grad_norm": 0.3237369656562805, + "learning_rate": 0.00019841785179513073, + "loss": 0.2398, + "step": 567 + }, + { + "epoch": 0.114956486541186, + "grad_norm": 0.3716401755809784, + "learning_rate": 0.0001984122112348, + "loss": 0.2309, + "step": 568 + }, + { + "epoch": 0.11515887472171625, + "grad_norm": 0.5387085676193237, + "learning_rate": 0.00019840656071815437, + "loss": 0.2603, + "step": 569 + }, + { + "epoch": 0.1153612629022465, + "grad_norm": 0.5174623131752014, + "learning_rate": 0.0001984009002457655, + "loss": 0.2898, + "step": 570 + }, + { + "epoch": 0.11556365108277676, + "grad_norm": 0.3925536274909973, + "learning_rate": 0.00019839522981820605, + "loss": 0.2316, + "step": 571 + }, + { + "epoch": 0.11576603926330702, + "grad_norm": 0.42717403173446655, + "learning_rate": 0.0001983895494360497, + "loss": 0.2339, + "step": 572 + }, + { + "epoch": 0.11596842744383729, + "grad_norm": 0.3932379484176636, + "learning_rate": 0.0001983838590998711, + "loss": 0.2916, + "step": 573 + }, + { + "epoch": 0.11617081562436754, + "grad_norm": 0.3760148584842682, + "learning_rate": 0.000198378158810246, + "loss": 0.2469, + "step": 574 + }, + { + "epoch": 0.1163732038048978, + "grad_norm": 0.48632335662841797, + "learning_rate": 0.00019837244856775102, + "loss": 0.27, + "step": 575 + }, + { + "epoch": 0.11657559198542805, + "grad_norm": 0.44857627153396606, + "learning_rate": 0.00019836672837296394, + "loss": 0.253, + "step": 576 + }, + { + "epoch": 0.1167779801659583, + "grad_norm": 0.37322884798049927, + "learning_rate": 0.00019836099822646342, + "loss": 0.2575, + "step": 577 + }, + { + "epoch": 0.11698036834648856, + "grad_norm": 0.3037974238395691, + "learning_rate": 0.00019835525812882923, + "loss": 0.2609, + "step": 578 + }, + { + "epoch": 0.11718275652701882, + "grad_norm": 0.5397683382034302, + "learning_rate": 0.000198349508080642, + "loss": 0.2463, + "step": 579 + }, + { + "epoch": 0.11738514470754909, + "grad_norm": 0.44430914521217346, + "learning_rate": 0.00019834374808248351, + "loss": 0.2565, + "step": 580 + }, + { + "epoch": 0.11758753288807934, + "grad_norm": 0.7803641557693481, + "learning_rate": 0.00019833797813493655, + "loss": 0.2483, + "step": 581 + }, + { + "epoch": 0.1177899210686096, + "grad_norm": 0.3826069235801697, + "learning_rate": 0.00019833219823858477, + "loss": 0.2909, + "step": 582 + }, + { + "epoch": 0.11799230924913985, + "grad_norm": 0.3330939710140228, + "learning_rate": 0.00019832640839401297, + "loss": 0.2606, + "step": 583 + }, + { + "epoch": 0.1181946974296701, + "grad_norm": 0.35879385471343994, + "learning_rate": 0.00019832060860180688, + "loss": 0.2447, + "step": 584 + }, + { + "epoch": 0.11839708561020036, + "grad_norm": 0.7940172553062439, + "learning_rate": 0.0001983147988625533, + "loss": 0.2826, + "step": 585 + }, + { + "epoch": 0.11859947379073062, + "grad_norm": 0.4618555009365082, + "learning_rate": 0.00019830897917683997, + "loss": 0.2495, + "step": 586 + }, + { + "epoch": 0.11880186197126087, + "grad_norm": 0.2834542691707611, + "learning_rate": 0.0001983031495452557, + "loss": 0.2505, + "step": 587 + }, + { + "epoch": 0.11900425015179114, + "grad_norm": 0.40233033895492554, + "learning_rate": 0.0001982973099683902, + "loss": 0.2571, + "step": 588 + }, + { + "epoch": 0.1192066383323214, + "grad_norm": 0.355094850063324, + "learning_rate": 0.00019829146044683432, + "loss": 0.2356, + "step": 589 + }, + { + "epoch": 0.11940902651285165, + "grad_norm": 0.7586387991905212, + "learning_rate": 0.0001982856009811798, + "loss": 0.3068, + "step": 590 + }, + { + "epoch": 0.1196114146933819, + "grad_norm": 0.54310542345047, + "learning_rate": 0.0001982797315720195, + "loss": 0.2868, + "step": 591 + }, + { + "epoch": 0.11981380287391216, + "grad_norm": 0.42550796270370483, + "learning_rate": 0.0001982738522199472, + "loss": 0.3018, + "step": 592 + }, + { + "epoch": 0.12001619105444242, + "grad_norm": 0.39309918880462646, + "learning_rate": 0.0001982679629255577, + "loss": 0.2296, + "step": 593 + }, + { + "epoch": 0.12021857923497267, + "grad_norm": 0.3419604003429413, + "learning_rate": 0.00019826206368944683, + "loss": 0.2438, + "step": 594 + }, + { + "epoch": 0.12042096741550294, + "grad_norm": 0.6050029397010803, + "learning_rate": 0.0001982561545122114, + "loss": 0.2923, + "step": 595 + }, + { + "epoch": 0.1206233555960332, + "grad_norm": 0.3203630745410919, + "learning_rate": 0.00019825023539444926, + "loss": 0.2406, + "step": 596 + }, + { + "epoch": 0.12082574377656345, + "grad_norm": 0.5872333645820618, + "learning_rate": 0.00019824430633675922, + "loss": 0.2742, + "step": 597 + }, + { + "epoch": 0.12102813195709371, + "grad_norm": 0.3505767285823822, + "learning_rate": 0.00019823836733974114, + "loss": 0.2484, + "step": 598 + }, + { + "epoch": 0.12123052013762396, + "grad_norm": 0.395964115858078, + "learning_rate": 0.0001982324184039958, + "loss": 0.2487, + "step": 599 + }, + { + "epoch": 0.12143290831815422, + "grad_norm": 0.4352482259273529, + "learning_rate": 0.00019822645953012518, + "loss": 0.2336, + "step": 600 + }, + { + "epoch": 0.12143290831815422, + "eval_loss": 0.3115096390247345, + "eval_runtime": 0.7395, + "eval_samples_per_second": 6.761, + "eval_steps_per_second": 1.352, + "step": 600 + }, + { + "epoch": 0.12163529649868447, + "grad_norm": 0.47118180990219116, + "learning_rate": 0.00019822049071873204, + "loss": 0.3171, + "step": 601 + }, + { + "epoch": 0.12183768467921473, + "grad_norm": 0.3833599090576172, + "learning_rate": 0.00019821451197042026, + "loss": 0.2849, + "step": 602 + }, + { + "epoch": 0.122040072859745, + "grad_norm": 0.3530329763889313, + "learning_rate": 0.00019820852328579472, + "loss": 0.2554, + "step": 603 + }, + { + "epoch": 0.12224246104027525, + "grad_norm": 0.42997848987579346, + "learning_rate": 0.0001982025246654613, + "loss": 0.2597, + "step": 604 + }, + { + "epoch": 0.12244484922080551, + "grad_norm": 0.3684461712837219, + "learning_rate": 0.00019819651611002685, + "loss": 0.2819, + "step": 605 + }, + { + "epoch": 0.12264723740133576, + "grad_norm": 0.4009121060371399, + "learning_rate": 0.00019819049762009926, + "loss": 0.2501, + "step": 606 + }, + { + "epoch": 0.12284962558186602, + "grad_norm": 0.4022650420665741, + "learning_rate": 0.00019818446919628738, + "loss": 0.3116, + "step": 607 + }, + { + "epoch": 0.12305201376239627, + "grad_norm": 0.37116739153862, + "learning_rate": 0.0001981784308392012, + "loss": 0.2623, + "step": 608 + }, + { + "epoch": 0.12325440194292653, + "grad_norm": 0.4383566975593567, + "learning_rate": 0.00019817238254945157, + "loss": 0.2417, + "step": 609 + }, + { + "epoch": 0.12345679012345678, + "grad_norm": 0.4073963165283203, + "learning_rate": 0.00019816632432765035, + "loss": 0.2518, + "step": 610 + }, + { + "epoch": 0.12365917830398705, + "grad_norm": 0.29820218682289124, + "learning_rate": 0.00019816025617441047, + "loss": 0.2828, + "step": 611 + }, + { + "epoch": 0.12386156648451731, + "grad_norm": 0.3478280305862427, + "learning_rate": 0.00019815417809034588, + "loss": 0.264, + "step": 612 + }, + { + "epoch": 0.12406395466504756, + "grad_norm": 0.4512695074081421, + "learning_rate": 0.00019814809007607148, + "loss": 0.2657, + "step": 613 + }, + { + "epoch": 0.12426634284557782, + "grad_norm": 0.3232296407222748, + "learning_rate": 0.00019814199213220317, + "loss": 0.241, + "step": 614 + }, + { + "epoch": 0.12446873102610807, + "grad_norm": 0.4013686776161194, + "learning_rate": 0.00019813588425935787, + "loss": 0.2306, + "step": 615 + }, + { + "epoch": 0.12467111920663833, + "grad_norm": 0.4323452115058899, + "learning_rate": 0.00019812976645815358, + "loss": 0.2774, + "step": 616 + }, + { + "epoch": 0.12487350738716858, + "grad_norm": 0.38302597403526306, + "learning_rate": 0.00019812363872920915, + "loss": 0.2361, + "step": 617 + }, + { + "epoch": 0.12507589556769885, + "grad_norm": 0.31080150604248047, + "learning_rate": 0.00019811750107314455, + "loss": 0.2229, + "step": 618 + }, + { + "epoch": 0.1252782837482291, + "grad_norm": 0.39039042592048645, + "learning_rate": 0.00019811135349058072, + "loss": 0.2586, + "step": 619 + }, + { + "epoch": 0.12548067192875936, + "grad_norm": 0.43987491726875305, + "learning_rate": 0.0001981051959821396, + "loss": 0.2318, + "step": 620 + }, + { + "epoch": 0.12568306010928962, + "grad_norm": 0.6152598857879639, + "learning_rate": 0.0001980990285484442, + "loss": 0.2319, + "step": 621 + }, + { + "epoch": 0.12588544828981987, + "grad_norm": 0.40612903237342834, + "learning_rate": 0.00019809285119011842, + "loss": 0.3145, + "step": 622 + }, + { + "epoch": 0.12608783647035013, + "grad_norm": 0.3982747793197632, + "learning_rate": 0.00019808666390778724, + "loss": 0.2536, + "step": 623 + }, + { + "epoch": 0.12629022465088038, + "grad_norm": 0.28268828988075256, + "learning_rate": 0.0001980804667020766, + "loss": 0.2054, + "step": 624 + }, + { + "epoch": 0.12649261283141064, + "grad_norm": 0.3771643042564392, + "learning_rate": 0.0001980742595736135, + "loss": 0.2429, + "step": 625 + }, + { + "epoch": 0.1266950010119409, + "grad_norm": 0.45596668124198914, + "learning_rate": 0.00019806804252302587, + "loss": 0.2702, + "step": 626 + }, + { + "epoch": 0.12689738919247115, + "grad_norm": 0.34389880299568176, + "learning_rate": 0.00019806181555094275, + "loss": 0.2403, + "step": 627 + }, + { + "epoch": 0.1270997773730014, + "grad_norm": 0.4212355613708496, + "learning_rate": 0.00019805557865799407, + "loss": 0.248, + "step": 628 + }, + { + "epoch": 0.1273021655535317, + "grad_norm": 0.36532604694366455, + "learning_rate": 0.0001980493318448108, + "loss": 0.2886, + "step": 629 + }, + { + "epoch": 0.12750455373406194, + "grad_norm": 0.3870829939842224, + "learning_rate": 0.00019804307511202499, + "loss": 0.2781, + "step": 630 + }, + { + "epoch": 0.1277069419145922, + "grad_norm": 0.2700032889842987, + "learning_rate": 0.00019803680846026958, + "loss": 0.2415, + "step": 631 + }, + { + "epoch": 0.12790933009512245, + "grad_norm": 0.3396844267845154, + "learning_rate": 0.00019803053189017858, + "loss": 0.2563, + "step": 632 + }, + { + "epoch": 0.1281117182756527, + "grad_norm": 0.3511733114719391, + "learning_rate": 0.00019802424540238698, + "loss": 0.2378, + "step": 633 + }, + { + "epoch": 0.12831410645618296, + "grad_norm": 0.3880968689918518, + "learning_rate": 0.0001980179489975308, + "loss": 0.2632, + "step": 634 + }, + { + "epoch": 0.12851649463671322, + "grad_norm": 0.5152776837348938, + "learning_rate": 0.00019801164267624702, + "loss": 0.2662, + "step": 635 + }, + { + "epoch": 0.12871888281724347, + "grad_norm": 0.5050997138023376, + "learning_rate": 0.00019800532643917364, + "loss": 0.2408, + "step": 636 + }, + { + "epoch": 0.12892127099777373, + "grad_norm": 0.38669291138648987, + "learning_rate": 0.00019799900028694974, + "loss": 0.2369, + "step": 637 + }, + { + "epoch": 0.12912365917830398, + "grad_norm": 0.32681190967559814, + "learning_rate": 0.00019799266422021523, + "loss": 0.2198, + "step": 638 + }, + { + "epoch": 0.12932604735883424, + "grad_norm": 0.35121649503707886, + "learning_rate": 0.0001979863182396112, + "loss": 0.2482, + "step": 639 + }, + { + "epoch": 0.1295284355393645, + "grad_norm": 0.3443315923213959, + "learning_rate": 0.00019797996234577968, + "loss": 0.2369, + "step": 640 + }, + { + "epoch": 0.12973082371989475, + "grad_norm": 0.5270746946334839, + "learning_rate": 0.00019797359653936364, + "loss": 0.2847, + "step": 641 + }, + { + "epoch": 0.129933211900425, + "grad_norm": 0.3135738968849182, + "learning_rate": 0.0001979672208210071, + "loss": 0.2449, + "step": 642 + }, + { + "epoch": 0.13013560008095526, + "grad_norm": 0.4503747522830963, + "learning_rate": 0.00019796083519135516, + "loss": 0.2765, + "step": 643 + }, + { + "epoch": 0.13033798826148552, + "grad_norm": 0.4427570700645447, + "learning_rate": 0.00019795443965105376, + "loss": 0.2366, + "step": 644 + }, + { + "epoch": 0.1305403764420158, + "grad_norm": 0.42543524503707886, + "learning_rate": 0.00019794803420075002, + "loss": 0.2786, + "step": 645 + }, + { + "epoch": 0.13074276462254605, + "grad_norm": 0.3327653408050537, + "learning_rate": 0.0001979416188410919, + "loss": 0.229, + "step": 646 + }, + { + "epoch": 0.1309451528030763, + "grad_norm": 0.396659791469574, + "learning_rate": 0.0001979351935727285, + "loss": 0.2764, + "step": 647 + }, + { + "epoch": 0.13114754098360656, + "grad_norm": 0.4171930253505707, + "learning_rate": 0.00019792875839630986, + "loss": 0.2593, + "step": 648 + }, + { + "epoch": 0.13134992916413682, + "grad_norm": 0.3459969162940979, + "learning_rate": 0.00019792231331248696, + "loss": 0.2561, + "step": 649 + }, + { + "epoch": 0.13155231734466707, + "grad_norm": 0.3193213641643524, + "learning_rate": 0.0001979158583219119, + "loss": 0.2353, + "step": 650 + }, + { + "epoch": 0.13155231734466707, + "eval_loss": 0.2984254062175751, + "eval_runtime": 0.7375, + "eval_samples_per_second": 6.779, + "eval_steps_per_second": 1.356, + "step": 650 + }, + { + "epoch": 0.13175470552519733, + "grad_norm": 0.35675105452537537, + "learning_rate": 0.00019790939342523772, + "loss": 0.2582, + "step": 651 + }, + { + "epoch": 0.13195709370572759, + "grad_norm": 0.4250843822956085, + "learning_rate": 0.00019790291862311845, + "loss": 0.2315, + "step": 652 + }, + { + "epoch": 0.13215948188625784, + "grad_norm": 0.35945165157318115, + "learning_rate": 0.00019789643391620917, + "loss": 0.2649, + "step": 653 + }, + { + "epoch": 0.1323618700667881, + "grad_norm": 0.47871312499046326, + "learning_rate": 0.0001978899393051659, + "loss": 0.2292, + "step": 654 + }, + { + "epoch": 0.13256425824731835, + "grad_norm": 0.3194306492805481, + "learning_rate": 0.00019788343479064575, + "loss": 0.2355, + "step": 655 + }, + { + "epoch": 0.1327666464278486, + "grad_norm": 0.4799971878528595, + "learning_rate": 0.00019787692037330674, + "loss": 0.2406, + "step": 656 + }, + { + "epoch": 0.13296903460837886, + "grad_norm": 0.3283880352973938, + "learning_rate": 0.00019787039605380791, + "loss": 0.2594, + "step": 657 + }, + { + "epoch": 0.13317142278890912, + "grad_norm": 0.8230167031288147, + "learning_rate": 0.00019786386183280938, + "loss": 0.2599, + "step": 658 + }, + { + "epoch": 0.13337381096943937, + "grad_norm": 0.4739612936973572, + "learning_rate": 0.0001978573177109722, + "loss": 0.2229, + "step": 659 + }, + { + "epoch": 0.13357619914996965, + "grad_norm": 0.3426133692264557, + "learning_rate": 0.00019785076368895838, + "loss": 0.2493, + "step": 660 + }, + { + "epoch": 0.1337785873304999, + "grad_norm": 0.3582127094268799, + "learning_rate": 0.00019784419976743106, + "loss": 0.2569, + "step": 661 + }, + { + "epoch": 0.13398097551103016, + "grad_norm": 0.4506543278694153, + "learning_rate": 0.00019783762594705425, + "loss": 0.2642, + "step": 662 + }, + { + "epoch": 0.13418336369156042, + "grad_norm": 0.3988342881202698, + "learning_rate": 0.00019783104222849304, + "loss": 0.2934, + "step": 663 + }, + { + "epoch": 0.13438575187209068, + "grad_norm": 0.5457602739334106, + "learning_rate": 0.0001978244486124135, + "loss": 0.2505, + "step": 664 + }, + { + "epoch": 0.13458814005262093, + "grad_norm": 0.35253384709358215, + "learning_rate": 0.00019781784509948275, + "loss": 0.2468, + "step": 665 + }, + { + "epoch": 0.13479052823315119, + "grad_norm": 0.41637489199638367, + "learning_rate": 0.00019781123169036882, + "loss": 0.246, + "step": 666 + }, + { + "epoch": 0.13499291641368144, + "grad_norm": 0.3531467616558075, + "learning_rate": 0.00019780460838574076, + "loss": 0.2765, + "step": 667 + }, + { + "epoch": 0.1351953045942117, + "grad_norm": 0.3421364724636078, + "learning_rate": 0.00019779797518626867, + "loss": 0.2719, + "step": 668 + }, + { + "epoch": 0.13539769277474195, + "grad_norm": 0.42127346992492676, + "learning_rate": 0.00019779133209262367, + "loss": 0.2805, + "step": 669 + }, + { + "epoch": 0.1356000809552722, + "grad_norm": 0.3892177641391754, + "learning_rate": 0.00019778467910547777, + "loss": 0.2316, + "step": 670 + }, + { + "epoch": 0.13580246913580246, + "grad_norm": 0.383327841758728, + "learning_rate": 0.00019777801622550408, + "loss": 0.2751, + "step": 671 + }, + { + "epoch": 0.13600485731633272, + "grad_norm": 0.3991505205631256, + "learning_rate": 0.0001977713434533767, + "loss": 0.27, + "step": 672 + }, + { + "epoch": 0.13620724549686297, + "grad_norm": 0.44599419832229614, + "learning_rate": 0.0001977646607897707, + "loss": 0.2381, + "step": 673 + }, + { + "epoch": 0.13640963367739323, + "grad_norm": 0.3119847774505615, + "learning_rate": 0.00019775796823536212, + "loss": 0.2381, + "step": 674 + }, + { + "epoch": 0.1366120218579235, + "grad_norm": 0.4660552144050598, + "learning_rate": 0.0001977512657908281, + "loss": 0.26, + "step": 675 + }, + { + "epoch": 0.13681441003845377, + "grad_norm": 0.3962903916835785, + "learning_rate": 0.0001977445534568467, + "loss": 0.2909, + "step": 676 + }, + { + "epoch": 0.13701679821898402, + "grad_norm": 0.34353816509246826, + "learning_rate": 0.00019773783123409698, + "loss": 0.2648, + "step": 677 + }, + { + "epoch": 0.13721918639951428, + "grad_norm": 0.31514930725097656, + "learning_rate": 0.0001977310991232591, + "loss": 0.2701, + "step": 678 + }, + { + "epoch": 0.13742157458004453, + "grad_norm": 0.4266648292541504, + "learning_rate": 0.00019772435712501406, + "loss": 0.3107, + "step": 679 + }, + { + "epoch": 0.1376239627605748, + "grad_norm": 0.5088281631469727, + "learning_rate": 0.00019771760524004396, + "loss": 0.3094, + "step": 680 + }, + { + "epoch": 0.13782635094110504, + "grad_norm": 0.358623743057251, + "learning_rate": 0.00019771084346903193, + "loss": 0.2786, + "step": 681 + }, + { + "epoch": 0.1380287391216353, + "grad_norm": 0.5978558659553528, + "learning_rate": 0.000197704071812662, + "loss": 0.2459, + "step": 682 + }, + { + "epoch": 0.13823112730216555, + "grad_norm": 0.3651149272918701, + "learning_rate": 0.00019769729027161928, + "loss": 0.2472, + "step": 683 + }, + { + "epoch": 0.1384335154826958, + "grad_norm": 0.3216381371021271, + "learning_rate": 0.0001976904988465899, + "loss": 0.2599, + "step": 684 + }, + { + "epoch": 0.13863590366322606, + "grad_norm": 0.35924655199050903, + "learning_rate": 0.00019768369753826086, + "loss": 0.2816, + "step": 685 + }, + { + "epoch": 0.13883829184375632, + "grad_norm": 0.3398737609386444, + "learning_rate": 0.00019767688634732026, + "loss": 0.2536, + "step": 686 + }, + { + "epoch": 0.13904068002428657, + "grad_norm": 0.3374193012714386, + "learning_rate": 0.00019767006527445728, + "loss": 0.2369, + "step": 687 + }, + { + "epoch": 0.13924306820481683, + "grad_norm": 0.3065876364707947, + "learning_rate": 0.00019766323432036188, + "loss": 0.2748, + "step": 688 + }, + { + "epoch": 0.13944545638534708, + "grad_norm": 0.48267465829849243, + "learning_rate": 0.00019765639348572525, + "loss": 0.2428, + "step": 689 + }, + { + "epoch": 0.13964784456587737, + "grad_norm": 0.33604946732521057, + "learning_rate": 0.0001976495427712394, + "loss": 0.3006, + "step": 690 + }, + { + "epoch": 0.13985023274640762, + "grad_norm": 0.31897780299186707, + "learning_rate": 0.00019764268217759744, + "loss": 0.2808, + "step": 691 + }, + { + "epoch": 0.14005262092693788, + "grad_norm": 0.3280028700828552, + "learning_rate": 0.00019763581170549342, + "loss": 0.2503, + "step": 692 + }, + { + "epoch": 0.14025500910746813, + "grad_norm": 0.2858153283596039, + "learning_rate": 0.0001976289313556225, + "loss": 0.2548, + "step": 693 + }, + { + "epoch": 0.1404573972879984, + "grad_norm": 0.2821749746799469, + "learning_rate": 0.00019762204112868067, + "loss": 0.2294, + "step": 694 + }, + { + "epoch": 0.14065978546852864, + "grad_norm": 0.33796799182891846, + "learning_rate": 0.0001976151410253651, + "loss": 0.2496, + "step": 695 + }, + { + "epoch": 0.1408621736490589, + "grad_norm": 0.44861510396003723, + "learning_rate": 0.0001976082310463738, + "loss": 0.255, + "step": 696 + }, + { + "epoch": 0.14106456182958915, + "grad_norm": 0.3637007176876068, + "learning_rate": 0.00019760131119240585, + "loss": 0.2256, + "step": 697 + }, + { + "epoch": 0.1412669500101194, + "grad_norm": 0.5837531089782715, + "learning_rate": 0.00019759438146416138, + "loss": 0.2494, + "step": 698 + }, + { + "epoch": 0.14146933819064966, + "grad_norm": 0.36459869146347046, + "learning_rate": 0.00019758744186234147, + "loss": 0.3114, + "step": 699 + }, + { + "epoch": 0.14167172637117992, + "grad_norm": 0.2742053270339966, + "learning_rate": 0.00019758049238764814, + "loss": 0.2047, + "step": 700 + }, + { + "epoch": 0.14167172637117992, + "eval_loss": 0.2933715283870697, + "eval_runtime": 0.7397, + "eval_samples_per_second": 6.759, + "eval_steps_per_second": 1.352, + "step": 700 + }, + { + "epoch": 0.14187411455171017, + "grad_norm": 0.42038586735725403, + "learning_rate": 0.00019757353304078446, + "loss": 0.2474, + "step": 701 + }, + { + "epoch": 0.14207650273224043, + "grad_norm": 0.5313589572906494, + "learning_rate": 0.00019756656382245456, + "loss": 0.2622, + "step": 702 + }, + { + "epoch": 0.14227889091277068, + "grad_norm": 0.3278883695602417, + "learning_rate": 0.0001975595847333635, + "loss": 0.2398, + "step": 703 + }, + { + "epoch": 0.14248127909330094, + "grad_norm": 0.39546725153923035, + "learning_rate": 0.00019755259577421732, + "loss": 0.2423, + "step": 704 + }, + { + "epoch": 0.14268366727383122, + "grad_norm": 0.33064937591552734, + "learning_rate": 0.0001975455969457231, + "loss": 0.2816, + "step": 705 + }, + { + "epoch": 0.14288605545436148, + "grad_norm": 0.2638942003250122, + "learning_rate": 0.00019753858824858895, + "loss": 0.2267, + "step": 706 + }, + { + "epoch": 0.14308844363489173, + "grad_norm": 0.408956378698349, + "learning_rate": 0.00019753156968352385, + "loss": 0.2375, + "step": 707 + }, + { + "epoch": 0.143290831815422, + "grad_norm": 0.4136710464954376, + "learning_rate": 0.00019752454125123795, + "loss": 0.276, + "step": 708 + }, + { + "epoch": 0.14349321999595224, + "grad_norm": 0.27548322081565857, + "learning_rate": 0.00019751750295244228, + "loss": 0.2297, + "step": 709 + }, + { + "epoch": 0.1436956081764825, + "grad_norm": 0.31546905636787415, + "learning_rate": 0.0001975104547878489, + "loss": 0.2758, + "step": 710 + }, + { + "epoch": 0.14389799635701275, + "grad_norm": 0.2947577238082886, + "learning_rate": 0.00019750339675817086, + "loss": 0.2268, + "step": 711 + }, + { + "epoch": 0.144100384537543, + "grad_norm": 0.4339549243450165, + "learning_rate": 0.00019749632886412223, + "loss": 0.2353, + "step": 712 + }, + { + "epoch": 0.14430277271807326, + "grad_norm": 0.2924511432647705, + "learning_rate": 0.00019748925110641807, + "loss": 0.2333, + "step": 713 + }, + { + "epoch": 0.14450516089860352, + "grad_norm": 0.31660690903663635, + "learning_rate": 0.00019748216348577442, + "loss": 0.2287, + "step": 714 + }, + { + "epoch": 0.14470754907913377, + "grad_norm": 0.3289544880390167, + "learning_rate": 0.00019747506600290834, + "loss": 0.2926, + "step": 715 + }, + { + "epoch": 0.14490993725966403, + "grad_norm": 0.3553188443183899, + "learning_rate": 0.0001974679586585379, + "loss": 0.2954, + "step": 716 + }, + { + "epoch": 0.14511232544019428, + "grad_norm": 0.47059759497642517, + "learning_rate": 0.0001974608414533821, + "loss": 0.2394, + "step": 717 + }, + { + "epoch": 0.14531471362072454, + "grad_norm": 0.3676668703556061, + "learning_rate": 0.000197453714388161, + "loss": 0.234, + "step": 718 + }, + { + "epoch": 0.1455171018012548, + "grad_norm": 0.31340935826301575, + "learning_rate": 0.00019744657746359562, + "loss": 0.2305, + "step": 719 + }, + { + "epoch": 0.14571948998178508, + "grad_norm": 0.3455619215965271, + "learning_rate": 0.00019743943068040808, + "loss": 0.2689, + "step": 720 + }, + { + "epoch": 0.14592187816231533, + "grad_norm": 0.4046059250831604, + "learning_rate": 0.00019743227403932134, + "loss": 0.2544, + "step": 721 + }, + { + "epoch": 0.1461242663428456, + "grad_norm": 0.46637916564941406, + "learning_rate": 0.00019742510754105946, + "loss": 0.2285, + "step": 722 + }, + { + "epoch": 0.14632665452337584, + "grad_norm": 0.3575298488140106, + "learning_rate": 0.00019741793118634748, + "loss": 0.2497, + "step": 723 + }, + { + "epoch": 0.1465290427039061, + "grad_norm": 0.38678669929504395, + "learning_rate": 0.0001974107449759114, + "loss": 0.2577, + "step": 724 + }, + { + "epoch": 0.14673143088443635, + "grad_norm": 0.28437402844429016, + "learning_rate": 0.00019740354891047826, + "loss": 0.2304, + "step": 725 + }, + { + "epoch": 0.1469338190649666, + "grad_norm": 0.31594318151474, + "learning_rate": 0.00019739634299077613, + "loss": 0.2755, + "step": 726 + }, + { + "epoch": 0.14713620724549686, + "grad_norm": 0.2776789963245392, + "learning_rate": 0.00019738912721753394, + "loss": 0.2377, + "step": 727 + }, + { + "epoch": 0.14733859542602712, + "grad_norm": 0.30711445212364197, + "learning_rate": 0.00019738190159148178, + "loss": 0.2254, + "step": 728 + }, + { + "epoch": 0.14754098360655737, + "grad_norm": 0.3012600839138031, + "learning_rate": 0.0001973746661133506, + "loss": 0.2529, + "step": 729 + }, + { + "epoch": 0.14774337178708763, + "grad_norm": 0.33163461089134216, + "learning_rate": 0.00019736742078387245, + "loss": 0.258, + "step": 730 + }, + { + "epoch": 0.14794575996761788, + "grad_norm": 0.26741090416908264, + "learning_rate": 0.00019736016560378036, + "loss": 0.2405, + "step": 731 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 0.4081975519657135, + "learning_rate": 0.00019735290057380827, + "loss": 0.2999, + "step": 732 + }, + { + "epoch": 0.1483505363286784, + "grad_norm": 0.3062702715396881, + "learning_rate": 0.00019734562569469124, + "loss": 0.2618, + "step": 733 + }, + { + "epoch": 0.14855292450920865, + "grad_norm": 0.28259265422821045, + "learning_rate": 0.0001973383409671652, + "loss": 0.2309, + "step": 734 + }, + { + "epoch": 0.1487553126897389, + "grad_norm": 0.25090116262435913, + "learning_rate": 0.00019733104639196722, + "loss": 0.248, + "step": 735 + }, + { + "epoch": 0.1489577008702692, + "grad_norm": 0.3727152347564697, + "learning_rate": 0.00019732374196983522, + "loss": 0.2266, + "step": 736 + }, + { + "epoch": 0.14916008905079944, + "grad_norm": 0.4587722718715668, + "learning_rate": 0.00019731642770150822, + "loss": 0.2854, + "step": 737 + }, + { + "epoch": 0.1493624772313297, + "grad_norm": 0.5172100067138672, + "learning_rate": 0.0001973091035877262, + "loss": 0.2573, + "step": 738 + }, + { + "epoch": 0.14956486541185995, + "grad_norm": 0.3253283202648163, + "learning_rate": 0.00019730176962923015, + "loss": 0.2639, + "step": 739 + }, + { + "epoch": 0.1497672535923902, + "grad_norm": 0.4492495357990265, + "learning_rate": 0.00019729442582676202, + "loss": 0.2732, + "step": 740 + }, + { + "epoch": 0.14996964177292046, + "grad_norm": 0.34648340940475464, + "learning_rate": 0.00019728707218106477, + "loss": 0.2595, + "step": 741 + }, + { + "epoch": 0.15017202995345072, + "grad_norm": 0.3878565728664398, + "learning_rate": 0.00019727970869288237, + "loss": 0.3247, + "step": 742 + }, + { + "epoch": 0.15037441813398097, + "grad_norm": 0.2799864113330841, + "learning_rate": 0.0001972723353629598, + "loss": 0.2582, + "step": 743 + }, + { + "epoch": 0.15057680631451123, + "grad_norm": 0.3522235155105591, + "learning_rate": 0.000197264952192043, + "loss": 0.2187, + "step": 744 + }, + { + "epoch": 0.15077919449504149, + "grad_norm": 0.3626643717288971, + "learning_rate": 0.00019725755918087893, + "loss": 0.2197, + "step": 745 + }, + { + "epoch": 0.15098158267557174, + "grad_norm": 0.304533988237381, + "learning_rate": 0.00019725015633021553, + "loss": 0.2334, + "step": 746 + }, + { + "epoch": 0.151183970856102, + "grad_norm": 0.4297725260257721, + "learning_rate": 0.00019724274364080175, + "loss": 0.2646, + "step": 747 + }, + { + "epoch": 0.15138635903663225, + "grad_norm": 0.4226777255535126, + "learning_rate": 0.00019723532111338754, + "loss": 0.2325, + "step": 748 + }, + { + "epoch": 0.1515887472171625, + "grad_norm": 0.3075390160083771, + "learning_rate": 0.00019722788874872377, + "loss": 0.2664, + "step": 749 + }, + { + "epoch": 0.15179113539769276, + "grad_norm": 0.3131004273891449, + "learning_rate": 0.00019722044654756248, + "loss": 0.2607, + "step": 750 + }, + { + "epoch": 0.15179113539769276, + "eval_loss": 0.28550732135772705, + "eval_runtime": 0.7363, + "eval_samples_per_second": 6.791, + "eval_steps_per_second": 1.358, + "step": 750 + }, + { + "epoch": 0.15199352357822304, + "grad_norm": 0.34832969307899475, + "learning_rate": 0.0001972129945106565, + "loss": 0.2407, + "step": 751 + }, + { + "epoch": 0.1521959117587533, + "grad_norm": 0.33916226029396057, + "learning_rate": 0.00019720553263875978, + "loss": 0.2806, + "step": 752 + }, + { + "epoch": 0.15239829993928355, + "grad_norm": 0.40339797735214233, + "learning_rate": 0.00019719806093262726, + "loss": 0.3036, + "step": 753 + }, + { + "epoch": 0.1526006881198138, + "grad_norm": 0.259329617023468, + "learning_rate": 0.00019719057939301477, + "loss": 0.25, + "step": 754 + }, + { + "epoch": 0.15280307630034407, + "grad_norm": 0.351357638835907, + "learning_rate": 0.0001971830880206793, + "loss": 0.2487, + "step": 755 + }, + { + "epoch": 0.15300546448087432, + "grad_norm": 0.41735726594924927, + "learning_rate": 0.0001971755868163787, + "loss": 0.2562, + "step": 756 + }, + { + "epoch": 0.15320785266140458, + "grad_norm": 0.3808186948299408, + "learning_rate": 0.0001971680757808719, + "loss": 0.3059, + "step": 757 + }, + { + "epoch": 0.15341024084193483, + "grad_norm": 0.2662958800792694, + "learning_rate": 0.00019716055491491875, + "loss": 0.2254, + "step": 758 + }, + { + "epoch": 0.15361262902246509, + "grad_norm": 0.30748385190963745, + "learning_rate": 0.00019715302421928013, + "loss": 0.2595, + "step": 759 + }, + { + "epoch": 0.15381501720299534, + "grad_norm": 0.3423989415168762, + "learning_rate": 0.00019714548369471796, + "loss": 0.2642, + "step": 760 + }, + { + "epoch": 0.1540174053835256, + "grad_norm": 0.35559195280075073, + "learning_rate": 0.00019713793334199511, + "loss": 0.2546, + "step": 761 + }, + { + "epoch": 0.15421979356405585, + "grad_norm": 0.29996034502983093, + "learning_rate": 0.00019713037316187537, + "loss": 0.237, + "step": 762 + }, + { + "epoch": 0.1544221817445861, + "grad_norm": 0.2678242325782776, + "learning_rate": 0.0001971228031551237, + "loss": 0.2591, + "step": 763 + }, + { + "epoch": 0.15462456992511636, + "grad_norm": 0.3217318058013916, + "learning_rate": 0.00019711522332250585, + "loss": 0.2598, + "step": 764 + }, + { + "epoch": 0.15482695810564662, + "grad_norm": 0.31618547439575195, + "learning_rate": 0.00019710763366478874, + "loss": 0.2455, + "step": 765 + }, + { + "epoch": 0.1550293462861769, + "grad_norm": 0.28240564465522766, + "learning_rate": 0.0001971000341827402, + "loss": 0.2189, + "step": 766 + }, + { + "epoch": 0.15523173446670716, + "grad_norm": 0.3480214774608612, + "learning_rate": 0.0001970924248771291, + "loss": 0.2573, + "step": 767 + }, + { + "epoch": 0.1554341226472374, + "grad_norm": 0.3173188865184784, + "learning_rate": 0.00019708480574872516, + "loss": 0.259, + "step": 768 + }, + { + "epoch": 0.15563651082776767, + "grad_norm": 0.4622735381126404, + "learning_rate": 0.00019707717679829935, + "loss": 0.2777, + "step": 769 + }, + { + "epoch": 0.15583889900829792, + "grad_norm": 0.583462655544281, + "learning_rate": 0.00019706953802662333, + "loss": 0.2666, + "step": 770 + }, + { + "epoch": 0.15604128718882818, + "grad_norm": 0.371926873922348, + "learning_rate": 0.00019706188943447006, + "loss": 0.2508, + "step": 771 + }, + { + "epoch": 0.15624367536935843, + "grad_norm": 0.2995915710926056, + "learning_rate": 0.00019705423102261326, + "loss": 0.2464, + "step": 772 + }, + { + "epoch": 0.1564460635498887, + "grad_norm": 0.35817331075668335, + "learning_rate": 0.0001970465627918277, + "loss": 0.2718, + "step": 773 + }, + { + "epoch": 0.15664845173041894, + "grad_norm": 0.32463064789772034, + "learning_rate": 0.00019703888474288924, + "loss": 0.2746, + "step": 774 + }, + { + "epoch": 0.1568508399109492, + "grad_norm": 0.3261243402957916, + "learning_rate": 0.00019703119687657466, + "loss": 0.2844, + "step": 775 + }, + { + "epoch": 0.15705322809147945, + "grad_norm": 0.39310258626937866, + "learning_rate": 0.0001970234991936617, + "loss": 0.2369, + "step": 776 + }, + { + "epoch": 0.1572556162720097, + "grad_norm": 0.22917023301124573, + "learning_rate": 0.00019701579169492916, + "loss": 0.2154, + "step": 777 + }, + { + "epoch": 0.15745800445253996, + "grad_norm": 0.22985297441482544, + "learning_rate": 0.0001970080743811568, + "loss": 0.2075, + "step": 778 + }, + { + "epoch": 0.15766039263307022, + "grad_norm": 0.3643643260002136, + "learning_rate": 0.0001970003472531253, + "loss": 0.2588, + "step": 779 + }, + { + "epoch": 0.15786278081360047, + "grad_norm": 0.3869781494140625, + "learning_rate": 0.00019699261031161657, + "loss": 0.2537, + "step": 780 + }, + { + "epoch": 0.15806516899413076, + "grad_norm": 0.4232870638370514, + "learning_rate": 0.0001969848635574132, + "loss": 0.2638, + "step": 781 + }, + { + "epoch": 0.158267557174661, + "grad_norm": 0.3287450969219208, + "learning_rate": 0.000196977106991299, + "loss": 0.2503, + "step": 782 + }, + { + "epoch": 0.15846994535519127, + "grad_norm": 0.34829604625701904, + "learning_rate": 0.0001969693406140587, + "loss": 0.2564, + "step": 783 + }, + { + "epoch": 0.15867233353572152, + "grad_norm": 0.31483200192451477, + "learning_rate": 0.00019696156442647797, + "loss": 0.2465, + "step": 784 + }, + { + "epoch": 0.15887472171625178, + "grad_norm": 0.41794779896736145, + "learning_rate": 0.0001969537784293436, + "loss": 0.2569, + "step": 785 + }, + { + "epoch": 0.15907710989678203, + "grad_norm": 0.3754033148288727, + "learning_rate": 0.00019694598262344322, + "loss": 0.2626, + "step": 786 + }, + { + "epoch": 0.1592794980773123, + "grad_norm": 0.39740636944770813, + "learning_rate": 0.00019693817700956555, + "loss": 0.2436, + "step": 787 + }, + { + "epoch": 0.15948188625784254, + "grad_norm": 0.34521111845970154, + "learning_rate": 0.00019693036158850033, + "loss": 0.2442, + "step": 788 + }, + { + "epoch": 0.1596842744383728, + "grad_norm": 0.5534895658493042, + "learning_rate": 0.0001969225363610382, + "loss": 0.2119, + "step": 789 + }, + { + "epoch": 0.15988666261890305, + "grad_norm": 0.42546847462654114, + "learning_rate": 0.00019691470132797081, + "loss": 0.2589, + "step": 790 + }, + { + "epoch": 0.1600890507994333, + "grad_norm": 0.31263595819473267, + "learning_rate": 0.00019690685649009087, + "loss": 0.237, + "step": 791 + }, + { + "epoch": 0.16029143897996356, + "grad_norm": 0.43939104676246643, + "learning_rate": 0.00019689900184819204, + "loss": 0.2212, + "step": 792 + }, + { + "epoch": 0.16049382716049382, + "grad_norm": 0.3826169967651367, + "learning_rate": 0.0001968911374030689, + "loss": 0.2643, + "step": 793 + }, + { + "epoch": 0.16069621534102407, + "grad_norm": 0.32133644819259644, + "learning_rate": 0.0001968832631555172, + "loss": 0.2412, + "step": 794 + }, + { + "epoch": 0.16089860352155433, + "grad_norm": 0.31465601921081543, + "learning_rate": 0.00019687537910633349, + "loss": 0.2176, + "step": 795 + }, + { + "epoch": 0.1611009917020846, + "grad_norm": 0.26472586393356323, + "learning_rate": 0.00019686748525631545, + "loss": 0.2301, + "step": 796 + }, + { + "epoch": 0.16130337988261487, + "grad_norm": 0.33633241057395935, + "learning_rate": 0.00019685958160626163, + "loss": 0.271, + "step": 797 + }, + { + "epoch": 0.16150576806314512, + "grad_norm": 0.30255284905433655, + "learning_rate": 0.0001968516681569717, + "loss": 0.23, + "step": 798 + }, + { + "epoch": 0.16170815624367538, + "grad_norm": 0.3203800618648529, + "learning_rate": 0.00019684374490924625, + "loss": 0.2417, + "step": 799 + }, + { + "epoch": 0.16191054442420563, + "grad_norm": 0.32507458329200745, + "learning_rate": 0.00019683581186388685, + "loss": 0.2584, + "step": 800 + }, + { + "epoch": 0.16191054442420563, + "eval_loss": 0.2888225317001343, + "eval_runtime": 0.7379, + "eval_samples_per_second": 6.776, + "eval_steps_per_second": 1.355, + "step": 800 + }, + { + "epoch": 0.1621129326047359, + "grad_norm": 0.3781580924987793, + "learning_rate": 0.00019682786902169608, + "loss": 0.2743, + "step": 801 + }, + { + "epoch": 0.16231532078526614, + "grad_norm": 0.32408541440963745, + "learning_rate": 0.00019681991638347755, + "loss": 0.2528, + "step": 802 + }, + { + "epoch": 0.1625177089657964, + "grad_norm": 0.30190667510032654, + "learning_rate": 0.00019681195395003577, + "loss": 0.2434, + "step": 803 + }, + { + "epoch": 0.16272009714632665, + "grad_norm": 0.34094536304473877, + "learning_rate": 0.00019680398172217635, + "loss": 0.2479, + "step": 804 + }, + { + "epoch": 0.1629224853268569, + "grad_norm": 0.45574498176574707, + "learning_rate": 0.00019679599970070578, + "loss": 0.2652, + "step": 805 + }, + { + "epoch": 0.16312487350738716, + "grad_norm": 0.2918074429035187, + "learning_rate": 0.00019678800788643167, + "loss": 0.2581, + "step": 806 + }, + { + "epoch": 0.16332726168791742, + "grad_norm": 0.42664429545402527, + "learning_rate": 0.00019678000628016248, + "loss": 0.2459, + "step": 807 + }, + { + "epoch": 0.16352964986844767, + "grad_norm": 0.32714366912841797, + "learning_rate": 0.00019677199488270778, + "loss": 0.2543, + "step": 808 + }, + { + "epoch": 0.16373203804897793, + "grad_norm": 0.4104800522327423, + "learning_rate": 0.00019676397369487804, + "loss": 0.2388, + "step": 809 + }, + { + "epoch": 0.16393442622950818, + "grad_norm": 0.4657924473285675, + "learning_rate": 0.0001967559427174848, + "loss": 0.2687, + "step": 810 + }, + { + "epoch": 0.16413681441003847, + "grad_norm": 0.39114388823509216, + "learning_rate": 0.00019674790195134048, + "loss": 0.2364, + "step": 811 + }, + { + "epoch": 0.16433920259056872, + "grad_norm": 0.40501561760902405, + "learning_rate": 0.00019673985139725863, + "loss": 0.2602, + "step": 812 + }, + { + "epoch": 0.16454159077109898, + "grad_norm": 0.2732917368412018, + "learning_rate": 0.00019673179105605368, + "loss": 0.2295, + "step": 813 + }, + { + "epoch": 0.16474397895162923, + "grad_norm": 0.5099993944168091, + "learning_rate": 0.00019672372092854111, + "loss": 0.282, + "step": 814 + }, + { + "epoch": 0.1649463671321595, + "grad_norm": 0.31335383653640747, + "learning_rate": 0.0001967156410155374, + "loss": 0.2502, + "step": 815 + }, + { + "epoch": 0.16514875531268974, + "grad_norm": 0.2994532585144043, + "learning_rate": 0.00019670755131785992, + "loss": 0.2277, + "step": 816 + }, + { + "epoch": 0.16535114349322, + "grad_norm": 0.425386905670166, + "learning_rate": 0.00019669945183632716, + "loss": 0.257, + "step": 817 + }, + { + "epoch": 0.16555353167375025, + "grad_norm": 0.27471521496772766, + "learning_rate": 0.0001966913425717585, + "loss": 0.249, + "step": 818 + }, + { + "epoch": 0.1657559198542805, + "grad_norm": 0.39158228039741516, + "learning_rate": 0.0001966832235249744, + "loss": 0.2765, + "step": 819 + }, + { + "epoch": 0.16595830803481076, + "grad_norm": 0.3115609884262085, + "learning_rate": 0.0001966750946967962, + "loss": 0.2527, + "step": 820 + }, + { + "epoch": 0.16616069621534102, + "grad_norm": 0.3412054777145386, + "learning_rate": 0.00019666695608804632, + "loss": 0.2534, + "step": 821 + }, + { + "epoch": 0.16636308439587127, + "grad_norm": 0.2714287042617798, + "learning_rate": 0.00019665880769954814, + "loss": 0.227, + "step": 822 + }, + { + "epoch": 0.16656547257640153, + "grad_norm": 0.33095890283584595, + "learning_rate": 0.00019665064953212604, + "loss": 0.2628, + "step": 823 + }, + { + "epoch": 0.16676786075693179, + "grad_norm": 0.4704902470111847, + "learning_rate": 0.00019664248158660533, + "loss": 0.2982, + "step": 824 + }, + { + "epoch": 0.16697024893746204, + "grad_norm": 0.32132309675216675, + "learning_rate": 0.00019663430386381242, + "loss": 0.2643, + "step": 825 + }, + { + "epoch": 0.1671726371179923, + "grad_norm": 0.3183353543281555, + "learning_rate": 0.00019662611636457462, + "loss": 0.2679, + "step": 826 + }, + { + "epoch": 0.16737502529852258, + "grad_norm": 0.3988979756832123, + "learning_rate": 0.00019661791908972024, + "loss": 0.2103, + "step": 827 + }, + { + "epoch": 0.16757741347905283, + "grad_norm": 0.3745168149471283, + "learning_rate": 0.00019660971204007863, + "loss": 0.253, + "step": 828 + }, + { + "epoch": 0.1677798016595831, + "grad_norm": 0.33534538745880127, + "learning_rate": 0.00019660149521648004, + "loss": 0.253, + "step": 829 + }, + { + "epoch": 0.16798218984011334, + "grad_norm": 0.3371214270591736, + "learning_rate": 0.0001965932686197558, + "loss": 0.2287, + "step": 830 + }, + { + "epoch": 0.1681845780206436, + "grad_norm": 0.3766498863697052, + "learning_rate": 0.00019658503225073817, + "loss": 0.2868, + "step": 831 + }, + { + "epoch": 0.16838696620117385, + "grad_norm": 0.3650453984737396, + "learning_rate": 0.0001965767861102605, + "loss": 0.2496, + "step": 832 + }, + { + "epoch": 0.1685893543817041, + "grad_norm": 0.3437459170818329, + "learning_rate": 0.0001965685301991569, + "loss": 0.2374, + "step": 833 + }, + { + "epoch": 0.16879174256223436, + "grad_norm": 0.5330494046211243, + "learning_rate": 0.00019656026451826274, + "loss": 0.3107, + "step": 834 + }, + { + "epoch": 0.16899413074276462, + "grad_norm": 0.28489741683006287, + "learning_rate": 0.0001965519890684142, + "loss": 0.24, + "step": 835 + }, + { + "epoch": 0.16919651892329488, + "grad_norm": 0.6033898591995239, + "learning_rate": 0.00019654370385044852, + "loss": 0.2826, + "step": 836 + }, + { + "epoch": 0.16939890710382513, + "grad_norm": 0.42504727840423584, + "learning_rate": 0.00019653540886520386, + "loss": 0.2215, + "step": 837 + }, + { + "epoch": 0.16960129528435539, + "grad_norm": 0.3004084825515747, + "learning_rate": 0.00019652710411351953, + "loss": 0.282, + "step": 838 + }, + { + "epoch": 0.16980368346488564, + "grad_norm": 0.3893960416316986, + "learning_rate": 0.0001965187895962356, + "loss": 0.3009, + "step": 839 + }, + { + "epoch": 0.1700060716454159, + "grad_norm": 0.40491530299186707, + "learning_rate": 0.00019651046531419332, + "loss": 0.2341, + "step": 840 + }, + { + "epoch": 0.17020845982594615, + "grad_norm": 0.37225958704948425, + "learning_rate": 0.00019650213126823487, + "loss": 0.2727, + "step": 841 + }, + { + "epoch": 0.17041084800647643, + "grad_norm": 0.34467262029647827, + "learning_rate": 0.00019649378745920332, + "loss": 0.2944, + "step": 842 + }, + { + "epoch": 0.1706132361870067, + "grad_norm": 0.3478735387325287, + "learning_rate": 0.00019648543388794284, + "loss": 0.2406, + "step": 843 + }, + { + "epoch": 0.17081562436753694, + "grad_norm": 0.24952255189418793, + "learning_rate": 0.0001964770705552986, + "loss": 0.2476, + "step": 844 + }, + { + "epoch": 0.1710180125480672, + "grad_norm": 0.34091097116470337, + "learning_rate": 0.0001964686974621167, + "loss": 0.2735, + "step": 845 + }, + { + "epoch": 0.17122040072859745, + "grad_norm": 0.4286462068557739, + "learning_rate": 0.0001964603146092442, + "loss": 0.2825, + "step": 846 + }, + { + "epoch": 0.1714227889091277, + "grad_norm": 0.32420212030410767, + "learning_rate": 0.0001964519219975292, + "loss": 0.2661, + "step": 847 + }, + { + "epoch": 0.17162517708965797, + "grad_norm": 0.3544958233833313, + "learning_rate": 0.0001964435196278208, + "loss": 0.2623, + "step": 848 + }, + { + "epoch": 0.17182756527018822, + "grad_norm": 0.3037719428539276, + "learning_rate": 0.00019643510750096908, + "loss": 0.2334, + "step": 849 + }, + { + "epoch": 0.17202995345071848, + "grad_norm": 0.3185819685459137, + "learning_rate": 0.00019642668561782505, + "loss": 0.2727, + "step": 850 + }, + { + "epoch": 0.17202995345071848, + "eval_loss": 0.28905409574508667, + "eval_runtime": 0.7363, + "eval_samples_per_second": 6.79, + "eval_steps_per_second": 1.358, + "step": 850 + }, + { + "epoch": 0.17223234163124873, + "grad_norm": 0.3105728328227997, + "learning_rate": 0.00019641825397924076, + "loss": 0.244, + "step": 851 + }, + { + "epoch": 0.172434729811779, + "grad_norm": 0.31616318225860596, + "learning_rate": 0.00019640981258606924, + "loss": 0.2598, + "step": 852 + }, + { + "epoch": 0.17263711799230924, + "grad_norm": 0.27749040722846985, + "learning_rate": 0.00019640136143916448, + "loss": 0.2225, + "step": 853 + }, + { + "epoch": 0.1728395061728395, + "grad_norm": 0.37248048186302185, + "learning_rate": 0.00019639290053938152, + "loss": 0.252, + "step": 854 + }, + { + "epoch": 0.17304189435336975, + "grad_norm": 0.3649056553840637, + "learning_rate": 0.00019638442988757632, + "loss": 0.2886, + "step": 855 + }, + { + "epoch": 0.1732442825339, + "grad_norm": 0.30918097496032715, + "learning_rate": 0.00019637594948460584, + "loss": 0.2494, + "step": 856 + }, + { + "epoch": 0.1734466707144303, + "grad_norm": 0.30996596813201904, + "learning_rate": 0.00019636745933132807, + "loss": 0.2318, + "step": 857 + }, + { + "epoch": 0.17364905889496055, + "grad_norm": 0.4232423007488251, + "learning_rate": 0.00019635895942860193, + "loss": 0.2736, + "step": 858 + }, + { + "epoch": 0.1738514470754908, + "grad_norm": 0.43543338775634766, + "learning_rate": 0.00019635044977728736, + "loss": 0.266, + "step": 859 + }, + { + "epoch": 0.17405383525602106, + "grad_norm": 0.35472288727760315, + "learning_rate": 0.00019634193037824528, + "loss": 0.2276, + "step": 860 + }, + { + "epoch": 0.1742562234365513, + "grad_norm": 0.33700042963027954, + "learning_rate": 0.0001963334012323376, + "loss": 0.2907, + "step": 861 + }, + { + "epoch": 0.17445861161708157, + "grad_norm": 0.2621822953224182, + "learning_rate": 0.00019632486234042715, + "loss": 0.2244, + "step": 862 + }, + { + "epoch": 0.17466099979761182, + "grad_norm": 0.27254778146743774, + "learning_rate": 0.00019631631370337787, + "loss": 0.2414, + "step": 863 + }, + { + "epoch": 0.17486338797814208, + "grad_norm": 0.3431929349899292, + "learning_rate": 0.00019630775532205466, + "loss": 0.2544, + "step": 864 + }, + { + "epoch": 0.17506577615867233, + "grad_norm": 0.3418872654438019, + "learning_rate": 0.00019629918719732325, + "loss": 0.2592, + "step": 865 + }, + { + "epoch": 0.1752681643392026, + "grad_norm": 0.3220979571342468, + "learning_rate": 0.00019629060933005056, + "loss": 0.2882, + "step": 866 + }, + { + "epoch": 0.17547055251973284, + "grad_norm": 0.4210681617259979, + "learning_rate": 0.0001962820217211044, + "loss": 0.279, + "step": 867 + }, + { + "epoch": 0.1756729407002631, + "grad_norm": 0.2708446681499481, + "learning_rate": 0.00019627342437135355, + "loss": 0.244, + "step": 868 + }, + { + "epoch": 0.17587532888079335, + "grad_norm": 0.35413023829460144, + "learning_rate": 0.00019626481728166777, + "loss": 0.2727, + "step": 869 + }, + { + "epoch": 0.1760777170613236, + "grad_norm": 0.3829691708087921, + "learning_rate": 0.0001962562004529179, + "loss": 0.2697, + "step": 870 + }, + { + "epoch": 0.17628010524185386, + "grad_norm": 0.39556726813316345, + "learning_rate": 0.00019624757388597567, + "loss": 0.2492, + "step": 871 + }, + { + "epoch": 0.17648249342238415, + "grad_norm": 0.46628960967063904, + "learning_rate": 0.00019623893758171385, + "loss": 0.288, + "step": 872 + }, + { + "epoch": 0.1766848816029144, + "grad_norm": 0.3176499009132385, + "learning_rate": 0.0001962302915410061, + "loss": 0.24, + "step": 873 + }, + { + "epoch": 0.17688726978344466, + "grad_norm": 0.4139035642147064, + "learning_rate": 0.00019622163576472724, + "loss": 0.2517, + "step": 874 + }, + { + "epoch": 0.1770896579639749, + "grad_norm": 0.29733502864837646, + "learning_rate": 0.00019621297025375288, + "loss": 0.2606, + "step": 875 + }, + { + "epoch": 0.17729204614450517, + "grad_norm": 0.5197150111198425, + "learning_rate": 0.00019620429500895976, + "loss": 0.2508, + "step": 876 + }, + { + "epoch": 0.17749443432503542, + "grad_norm": 0.28445857763290405, + "learning_rate": 0.00019619561003122554, + "loss": 0.2149, + "step": 877 + }, + { + "epoch": 0.17769682250556568, + "grad_norm": 0.31572219729423523, + "learning_rate": 0.00019618691532142884, + "loss": 0.2569, + "step": 878 + }, + { + "epoch": 0.17789921068609593, + "grad_norm": 0.26687559485435486, + "learning_rate": 0.00019617821088044934, + "loss": 0.2379, + "step": 879 + }, + { + "epoch": 0.1781015988666262, + "grad_norm": 0.31003233790397644, + "learning_rate": 0.0001961694967091676, + "loss": 0.2384, + "step": 880 + }, + { + "epoch": 0.17830398704715644, + "grad_norm": 0.30486080050468445, + "learning_rate": 0.00019616077280846535, + "loss": 0.2257, + "step": 881 + }, + { + "epoch": 0.1785063752276867, + "grad_norm": 0.32738712430000305, + "learning_rate": 0.00019615203917922508, + "loss": 0.2493, + "step": 882 + }, + { + "epoch": 0.17870876340821695, + "grad_norm": 0.3937731087207794, + "learning_rate": 0.0001961432958223304, + "loss": 0.2577, + "step": 883 + }, + { + "epoch": 0.1789111515887472, + "grad_norm": 0.29360431432724, + "learning_rate": 0.00019613454273866588, + "loss": 0.2184, + "step": 884 + }, + { + "epoch": 0.17911353976927746, + "grad_norm": 0.4143258333206177, + "learning_rate": 0.00019612577992911704, + "loss": 0.2304, + "step": 885 + }, + { + "epoch": 0.17931592794980772, + "grad_norm": 0.34402403235435486, + "learning_rate": 0.00019611700739457044, + "loss": 0.2317, + "step": 886 + }, + { + "epoch": 0.179518316130338, + "grad_norm": 0.3758588433265686, + "learning_rate": 0.00019610822513591356, + "loss": 0.2564, + "step": 887 + }, + { + "epoch": 0.17972070431086826, + "grad_norm": 0.343376100063324, + "learning_rate": 0.0001960994331540349, + "loss": 0.258, + "step": 888 + }, + { + "epoch": 0.1799230924913985, + "grad_norm": 0.3708763122558594, + "learning_rate": 0.000196090631449824, + "loss": 0.2497, + "step": 889 + }, + { + "epoch": 0.18012548067192877, + "grad_norm": 0.3553926646709442, + "learning_rate": 0.00019608182002417123, + "loss": 0.2845, + "step": 890 + }, + { + "epoch": 0.18032786885245902, + "grad_norm": 0.295229434967041, + "learning_rate": 0.0001960729988779681, + "loss": 0.2787, + "step": 891 + }, + { + "epoch": 0.18053025703298928, + "grad_norm": 0.28084808588027954, + "learning_rate": 0.00019606416801210702, + "loss": 0.2342, + "step": 892 + }, + { + "epoch": 0.18073264521351953, + "grad_norm": 0.3650226891040802, + "learning_rate": 0.00019605532742748141, + "loss": 0.298, + "step": 893 + }, + { + "epoch": 0.1809350333940498, + "grad_norm": 0.34866154193878174, + "learning_rate": 0.0001960464771249857, + "loss": 0.237, + "step": 894 + }, + { + "epoch": 0.18113742157458004, + "grad_norm": 0.5545859336853027, + "learning_rate": 0.00019603761710551521, + "loss": 0.2476, + "step": 895 + }, + { + "epoch": 0.1813398097551103, + "grad_norm": 0.36406493186950684, + "learning_rate": 0.00019602874736996632, + "loss": 0.2419, + "step": 896 + }, + { + "epoch": 0.18154219793564055, + "grad_norm": 0.38556376099586487, + "learning_rate": 0.00019601986791923642, + "loss": 0.2541, + "step": 897 + }, + { + "epoch": 0.1817445861161708, + "grad_norm": 0.3872116208076477, + "learning_rate": 0.0001960109787542238, + "loss": 0.2412, + "step": 898 + }, + { + "epoch": 0.18194697429670106, + "grad_norm": 0.3459681570529938, + "learning_rate": 0.00019600207987582777, + "loss": 0.2707, + "step": 899 + }, + { + "epoch": 0.18214936247723132, + "grad_norm": 0.35269078612327576, + "learning_rate": 0.00019599317128494862, + "loss": 0.2372, + "step": 900 + }, + { + "epoch": 0.18214936247723132, + "eval_loss": 0.2881661057472229, + "eval_runtime": 0.7383, + "eval_samples_per_second": 6.772, + "eval_steps_per_second": 1.354, + "step": 900 + }, + { + "epoch": 0.18235175065776157, + "grad_norm": 0.3326655328273773, + "learning_rate": 0.0001959842529824877, + "loss": 0.2698, + "step": 901 + }, + { + "epoch": 0.18255413883829186, + "grad_norm": 0.42814531922340393, + "learning_rate": 0.00019597532496934717, + "loss": 0.2723, + "step": 902 + }, + { + "epoch": 0.1827565270188221, + "grad_norm": 0.4340554177761078, + "learning_rate": 0.00019596638724643032, + "loss": 0.275, + "step": 903 + }, + { + "epoch": 0.18295891519935237, + "grad_norm": 0.2819032371044159, + "learning_rate": 0.0001959574398146414, + "loss": 0.2369, + "step": 904 + }, + { + "epoch": 0.18316130337988262, + "grad_norm": 0.31827113032341003, + "learning_rate": 0.00019594848267488556, + "loss": 0.2772, + "step": 905 + }, + { + "epoch": 0.18336369156041288, + "grad_norm": 0.31110498309135437, + "learning_rate": 0.00019593951582806902, + "loss": 0.226, + "step": 906 + }, + { + "epoch": 0.18356607974094313, + "grad_norm": 0.34133848547935486, + "learning_rate": 0.00019593053927509897, + "loss": 0.1788, + "step": 907 + }, + { + "epoch": 0.1837684679214734, + "grad_norm": 0.30660688877105713, + "learning_rate": 0.00019592155301688356, + "loss": 0.2477, + "step": 908 + }, + { + "epoch": 0.18397085610200364, + "grad_norm": 0.34563958644866943, + "learning_rate": 0.00019591255705433188, + "loss": 0.2614, + "step": 909 + }, + { + "epoch": 0.1841732442825339, + "grad_norm": 0.37129271030426025, + "learning_rate": 0.00019590355138835407, + "loss": 0.2313, + "step": 910 + }, + { + "epoch": 0.18437563246306415, + "grad_norm": 0.3038152754306793, + "learning_rate": 0.00019589453601986123, + "loss": 0.2475, + "step": 911 + }, + { + "epoch": 0.1845780206435944, + "grad_norm": 0.31466180086135864, + "learning_rate": 0.0001958855109497655, + "loss": 0.2383, + "step": 912 + }, + { + "epoch": 0.18478040882412466, + "grad_norm": 0.2817492187023163, + "learning_rate": 0.0001958764761789798, + "loss": 0.2495, + "step": 913 + }, + { + "epoch": 0.18498279700465492, + "grad_norm": 0.5035717487335205, + "learning_rate": 0.00019586743170841832, + "loss": 0.2825, + "step": 914 + }, + { + "epoch": 0.18518518518518517, + "grad_norm": 0.37324804067611694, + "learning_rate": 0.00019585837753899603, + "loss": 0.268, + "step": 915 + }, + { + "epoch": 0.18538757336571543, + "grad_norm": 0.29959744215011597, + "learning_rate": 0.0001958493136716289, + "loss": 0.2341, + "step": 916 + }, + { + "epoch": 0.1855899615462457, + "grad_norm": 0.30992889404296875, + "learning_rate": 0.00019584024010723398, + "loss": 0.2398, + "step": 917 + }, + { + "epoch": 0.18579234972677597, + "grad_norm": 0.41944313049316406, + "learning_rate": 0.00019583115684672917, + "loss": 0.3288, + "step": 918 + }, + { + "epoch": 0.18599473790730622, + "grad_norm": 0.417007714509964, + "learning_rate": 0.00019582206389103348, + "loss": 0.2268, + "step": 919 + }, + { + "epoch": 0.18619712608783648, + "grad_norm": 0.42923831939697266, + "learning_rate": 0.0001958129612410668, + "loss": 0.2854, + "step": 920 + }, + { + "epoch": 0.18639951426836673, + "grad_norm": 0.2931532859802246, + "learning_rate": 0.00019580384889775006, + "loss": 0.23, + "step": 921 + }, + { + "epoch": 0.186601902448897, + "grad_norm": 0.40126919746398926, + "learning_rate": 0.00019579472686200518, + "loss": 0.2682, + "step": 922 + }, + { + "epoch": 0.18680429062942724, + "grad_norm": 0.3147217333316803, + "learning_rate": 0.00019578559513475495, + "loss": 0.2346, + "step": 923 + }, + { + "epoch": 0.1870066788099575, + "grad_norm": 0.32852163910865784, + "learning_rate": 0.00019577645371692332, + "loss": 0.2728, + "step": 924 + }, + { + "epoch": 0.18720906699048775, + "grad_norm": 0.41189318895339966, + "learning_rate": 0.00019576730260943503, + "loss": 0.3268, + "step": 925 + }, + { + "epoch": 0.187411455171018, + "grad_norm": 0.3896682858467102, + "learning_rate": 0.00019575814181321593, + "loss": 0.2162, + "step": 926 + }, + { + "epoch": 0.18761384335154827, + "grad_norm": 0.33515724539756775, + "learning_rate": 0.00019574897132919284, + "loss": 0.2487, + "step": 927 + }, + { + "epoch": 0.18781623153207852, + "grad_norm": 0.310791015625, + "learning_rate": 0.00019573979115829353, + "loss": 0.2704, + "step": 928 + }, + { + "epoch": 0.18801861971260878, + "grad_norm": 0.3307356536388397, + "learning_rate": 0.00019573060130144673, + "loss": 0.2184, + "step": 929 + }, + { + "epoch": 0.18822100789313903, + "grad_norm": 0.35177716612815857, + "learning_rate": 0.0001957214017595822, + "loss": 0.2221, + "step": 930 + }, + { + "epoch": 0.18842339607366929, + "grad_norm": 0.38704296946525574, + "learning_rate": 0.00019571219253363057, + "loss": 0.239, + "step": 931 + }, + { + "epoch": 0.18862578425419954, + "grad_norm": 0.40003615617752075, + "learning_rate": 0.00019570297362452363, + "loss": 0.2707, + "step": 932 + }, + { + "epoch": 0.18882817243472982, + "grad_norm": 0.3730137050151825, + "learning_rate": 0.00019569374503319403, + "loss": 0.2705, + "step": 933 + }, + { + "epoch": 0.18903056061526008, + "grad_norm": 0.3282415270805359, + "learning_rate": 0.0001956845067605754, + "loss": 0.2668, + "step": 934 + }, + { + "epoch": 0.18923294879579033, + "grad_norm": 0.3039020299911499, + "learning_rate": 0.00019567525880760238, + "loss": 0.2294, + "step": 935 + }, + { + "epoch": 0.1894353369763206, + "grad_norm": 0.40377500653266907, + "learning_rate": 0.00019566600117521058, + "loss": 0.2432, + "step": 936 + }, + { + "epoch": 0.18963772515685084, + "grad_norm": 0.2721993029117584, + "learning_rate": 0.0001956567338643366, + "loss": 0.2113, + "step": 937 + }, + { + "epoch": 0.1898401133373811, + "grad_norm": 0.2860236167907715, + "learning_rate": 0.00019564745687591803, + "loss": 0.2504, + "step": 938 + }, + { + "epoch": 0.19004250151791136, + "grad_norm": 0.29996049404144287, + "learning_rate": 0.00019563817021089338, + "loss": 0.26, + "step": 939 + }, + { + "epoch": 0.1902448896984416, + "grad_norm": 0.2724343538284302, + "learning_rate": 0.00019562887387020216, + "loss": 0.2515, + "step": 940 + }, + { + "epoch": 0.19044727787897187, + "grad_norm": 0.3486071228981018, + "learning_rate": 0.0001956195678547849, + "loss": 0.3149, + "step": 941 + }, + { + "epoch": 0.19064966605950212, + "grad_norm": 0.33210688829421997, + "learning_rate": 0.0001956102521655831, + "loss": 0.2132, + "step": 942 + }, + { + "epoch": 0.19085205424003238, + "grad_norm": 0.32016050815582275, + "learning_rate": 0.0001956009268035392, + "loss": 0.2585, + "step": 943 + }, + { + "epoch": 0.19105444242056263, + "grad_norm": 0.3609424829483032, + "learning_rate": 0.00019559159176959668, + "loss": 0.2609, + "step": 944 + }, + { + "epoch": 0.1912568306010929, + "grad_norm": 0.3491605818271637, + "learning_rate": 0.00019558224706469994, + "loss": 0.236, + "step": 945 + }, + { + "epoch": 0.19145921878162314, + "grad_norm": 0.3373927175998688, + "learning_rate": 0.00019557289268979435, + "loss": 0.2615, + "step": 946 + }, + { + "epoch": 0.1916616069621534, + "grad_norm": 0.40465179085731506, + "learning_rate": 0.00019556352864582633, + "loss": 0.2617, + "step": 947 + }, + { + "epoch": 0.19186399514268368, + "grad_norm": 0.3118032217025757, + "learning_rate": 0.00019555415493374324, + "loss": 0.2469, + "step": 948 + }, + { + "epoch": 0.19206638332321394, + "grad_norm": 0.3560132086277008, + "learning_rate": 0.00019554477155449336, + "loss": 0.2562, + "step": 949 + }, + { + "epoch": 0.1922687715037442, + "grad_norm": 0.35652318596839905, + "learning_rate": 0.000195535378509026, + "loss": 0.2473, + "step": 950 + }, + { + "epoch": 0.1922687715037442, + "eval_loss": 0.29233258962631226, + "eval_runtime": 0.7383, + "eval_samples_per_second": 6.772, + "eval_steps_per_second": 1.354, + "step": 950 + }, + { + "epoch": 0.19247115968427445, + "grad_norm": 0.38569778203964233, + "learning_rate": 0.00019552597579829156, + "loss": 0.2271, + "step": 951 + }, + { + "epoch": 0.1926735478648047, + "grad_norm": 0.3487134277820587, + "learning_rate": 0.00019551656342324118, + "loss": 0.2568, + "step": 952 + }, + { + "epoch": 0.19287593604533496, + "grad_norm": 0.29401594400405884, + "learning_rate": 0.00019550714138482717, + "loss": 0.2609, + "step": 953 + }, + { + "epoch": 0.1930783242258652, + "grad_norm": 0.342074453830719, + "learning_rate": 0.00019549770968400277, + "loss": 0.2566, + "step": 954 + }, + { + "epoch": 0.19328071240639547, + "grad_norm": 0.33882638812065125, + "learning_rate": 0.0001954882683217221, + "loss": 0.2631, + "step": 955 + }, + { + "epoch": 0.19348310058692572, + "grad_norm": 0.37705790996551514, + "learning_rate": 0.0001954788172989404, + "loss": 0.2366, + "step": 956 + }, + { + "epoch": 0.19368548876745598, + "grad_norm": 0.3006690442562103, + "learning_rate": 0.00019546935661661382, + "loss": 0.2495, + "step": 957 + }, + { + "epoch": 0.19388787694798623, + "grad_norm": 0.28892847895622253, + "learning_rate": 0.00019545988627569952, + "loss": 0.2503, + "step": 958 + }, + { + "epoch": 0.1940902651285165, + "grad_norm": 0.34635743498802185, + "learning_rate": 0.0001954504062771555, + "loss": 0.2554, + "step": 959 + }, + { + "epoch": 0.19429265330904674, + "grad_norm": 0.3986789286136627, + "learning_rate": 0.000195440916621941, + "loss": 0.2566, + "step": 960 + }, + { + "epoch": 0.194495041489577, + "grad_norm": 0.4037439227104187, + "learning_rate": 0.00019543141731101596, + "loss": 0.2677, + "step": 961 + }, + { + "epoch": 0.19469742967010725, + "grad_norm": 0.5037823915481567, + "learning_rate": 0.0001954219083453415, + "loss": 0.2491, + "step": 962 + }, + { + "epoch": 0.19489981785063754, + "grad_norm": 0.3077455759048462, + "learning_rate": 0.00019541238972587958, + "loss": 0.2614, + "step": 963 + }, + { + "epoch": 0.1951022060311678, + "grad_norm": 0.3154994249343872, + "learning_rate": 0.0001954028614535932, + "loss": 0.2671, + "step": 964 + }, + { + "epoch": 0.19530459421169805, + "grad_norm": 0.3500082790851593, + "learning_rate": 0.0001953933235294464, + "loss": 0.2431, + "step": 965 + }, + { + "epoch": 0.1955069823922283, + "grad_norm": 0.5176851153373718, + "learning_rate": 0.00019538377595440404, + "loss": 0.2662, + "step": 966 + }, + { + "epoch": 0.19570937057275856, + "grad_norm": 0.4201027452945709, + "learning_rate": 0.0001953742187294321, + "loss": 0.2842, + "step": 967 + }, + { + "epoch": 0.1959117587532888, + "grad_norm": 0.3936191201210022, + "learning_rate": 0.00019536465185549746, + "loss": 0.2361, + "step": 968 + }, + { + "epoch": 0.19611414693381907, + "grad_norm": 0.2671091556549072, + "learning_rate": 0.00019535507533356797, + "loss": 0.2429, + "step": 969 + }, + { + "epoch": 0.19631653511434932, + "grad_norm": 0.3662154972553253, + "learning_rate": 0.00019534548916461252, + "loss": 0.3213, + "step": 970 + }, + { + "epoch": 0.19651892329487958, + "grad_norm": 0.2781408131122589, + "learning_rate": 0.00019533589334960093, + "loss": 0.208, + "step": 971 + }, + { + "epoch": 0.19672131147540983, + "grad_norm": 0.3062308728694916, + "learning_rate": 0.000195326287889504, + "loss": 0.2704, + "step": 972 + }, + { + "epoch": 0.1969236996559401, + "grad_norm": 0.3737871050834656, + "learning_rate": 0.0001953166727852935, + "loss": 0.2686, + "step": 973 + }, + { + "epoch": 0.19712608783647034, + "grad_norm": 0.2863426208496094, + "learning_rate": 0.0001953070480379422, + "loss": 0.2275, + "step": 974 + }, + { + "epoch": 0.1973284760170006, + "grad_norm": 0.3075900673866272, + "learning_rate": 0.0001952974136484238, + "loss": 0.2473, + "step": 975 + }, + { + "epoch": 0.19753086419753085, + "grad_norm": 0.30082938075065613, + "learning_rate": 0.00019528776961771308, + "loss": 0.2504, + "step": 976 + }, + { + "epoch": 0.1977332523780611, + "grad_norm": 0.31082528829574585, + "learning_rate": 0.00019527811594678563, + "loss": 0.2632, + "step": 977 + }, + { + "epoch": 0.1979356405585914, + "grad_norm": 0.2519990801811218, + "learning_rate": 0.00019526845263661817, + "loss": 0.21, + "step": 978 + }, + { + "epoch": 0.19813802873912165, + "grad_norm": 0.304943710565567, + "learning_rate": 0.0001952587796881883, + "loss": 0.2578, + "step": 979 + }, + { + "epoch": 0.1983404169196519, + "grad_norm": 0.4617615044116974, + "learning_rate": 0.00019524909710247465, + "loss": 0.2765, + "step": 980 + }, + { + "epoch": 0.19854280510018216, + "grad_norm": 0.3307913541793823, + "learning_rate": 0.0001952394048804568, + "loss": 0.2358, + "step": 981 + }, + { + "epoch": 0.1987451932807124, + "grad_norm": 0.30370616912841797, + "learning_rate": 0.0001952297030231153, + "loss": 0.2148, + "step": 982 + }, + { + "epoch": 0.19894758146124267, + "grad_norm": 0.5463431477546692, + "learning_rate": 0.00019521999153143167, + "loss": 0.2654, + "step": 983 + }, + { + "epoch": 0.19914996964177292, + "grad_norm": 0.3252580463886261, + "learning_rate": 0.00019521027040638844, + "loss": 0.2239, + "step": 984 + }, + { + "epoch": 0.19935235782230318, + "grad_norm": 0.5134966969490051, + "learning_rate": 0.00019520053964896907, + "loss": 0.2509, + "step": 985 + }, + { + "epoch": 0.19955474600283343, + "grad_norm": 0.26822400093078613, + "learning_rate": 0.00019519079926015804, + "loss": 0.2219, + "step": 986 + }, + { + "epoch": 0.1997571341833637, + "grad_norm": 0.29705125093460083, + "learning_rate": 0.00019518104924094075, + "loss": 0.223, + "step": 987 + }, + { + "epoch": 0.19995952236389394, + "grad_norm": 0.2979499399662018, + "learning_rate": 0.00019517128959230365, + "loss": 0.282, + "step": 988 + }, + { + "epoch": 0.2001619105444242, + "grad_norm": 0.3182627856731415, + "learning_rate": 0.00019516152031523405, + "loss": 0.2596, + "step": 989 + }, + { + "epoch": 0.20036429872495445, + "grad_norm": 0.28033894300460815, + "learning_rate": 0.00019515174141072038, + "loss": 0.2134, + "step": 990 + }, + { + "epoch": 0.2005666869054847, + "grad_norm": 0.2765475809574127, + "learning_rate": 0.00019514195287975188, + "loss": 0.246, + "step": 991 + }, + { + "epoch": 0.20076907508601496, + "grad_norm": 0.34840041399002075, + "learning_rate": 0.00019513215472331894, + "loss": 0.2412, + "step": 992 + }, + { + "epoch": 0.20097146326654525, + "grad_norm": 0.2901442050933838, + "learning_rate": 0.00019512234694241278, + "loss": 0.2201, + "step": 993 + }, + { + "epoch": 0.2011738514470755, + "grad_norm": 0.3278302252292633, + "learning_rate": 0.00019511252953802568, + "loss": 0.258, + "step": 994 + }, + { + "epoch": 0.20137623962760576, + "grad_norm": 0.38416075706481934, + "learning_rate": 0.00019510270251115084, + "loss": 0.2476, + "step": 995 + }, + { + "epoch": 0.201578627808136, + "grad_norm": 0.31708383560180664, + "learning_rate": 0.00019509286586278247, + "loss": 0.2711, + "step": 996 + }, + { + "epoch": 0.20178101598866627, + "grad_norm": 0.33503633737564087, + "learning_rate": 0.0001950830195939157, + "loss": 0.2568, + "step": 997 + }, + { + "epoch": 0.20198340416919652, + "grad_norm": 0.3001486659049988, + "learning_rate": 0.00019507316370554674, + "loss": 0.2575, + "step": 998 + }, + { + "epoch": 0.20218579234972678, + "grad_norm": 0.3202657997608185, + "learning_rate": 0.00019506329819867264, + "loss": 0.2213, + "step": 999 + }, + { + "epoch": 0.20238818053025703, + "grad_norm": 0.3761771023273468, + "learning_rate": 0.00019505342307429152, + "loss": 0.2377, + "step": 1000 + }, + { + "epoch": 0.20238818053025703, + "eval_loss": 0.29387423396110535, + "eval_runtime": 0.7386, + "eval_samples_per_second": 6.77, + "eval_steps_per_second": 1.354, + "step": 1000 + }, + { + "epoch": 0.2025905687107873, + "grad_norm": 0.25448256731033325, + "learning_rate": 0.00019504353833340243, + "loss": 0.2335, + "step": 1001 + }, + { + "epoch": 0.20279295689131754, + "grad_norm": 0.26972317695617676, + "learning_rate": 0.00019503364397700543, + "loss": 0.2325, + "step": 1002 + }, + { + "epoch": 0.2029953450718478, + "grad_norm": 0.3720798194408417, + "learning_rate": 0.00019502374000610151, + "loss": 0.2357, + "step": 1003 + }, + { + "epoch": 0.20319773325237805, + "grad_norm": 0.2979353070259094, + "learning_rate": 0.00019501382642169265, + "loss": 0.268, + "step": 1004 + }, + { + "epoch": 0.2034001214329083, + "grad_norm": 0.33817097544670105, + "learning_rate": 0.00019500390322478177, + "loss": 0.3045, + "step": 1005 + }, + { + "epoch": 0.20360250961343856, + "grad_norm": 0.29102396965026855, + "learning_rate": 0.00019499397041637285, + "loss": 0.2421, + "step": 1006 + }, + { + "epoch": 0.20380489779396882, + "grad_norm": 0.35644692182540894, + "learning_rate": 0.00019498402799747079, + "loss": 0.2856, + "step": 1007 + }, + { + "epoch": 0.2040072859744991, + "grad_norm": 0.2464309185743332, + "learning_rate": 0.0001949740759690814, + "loss": 0.2375, + "step": 1008 + }, + { + "epoch": 0.20420967415502936, + "grad_norm": 0.3096578121185303, + "learning_rate": 0.00019496411433221156, + "loss": 0.2123, + "step": 1009 + }, + { + "epoch": 0.2044120623355596, + "grad_norm": 0.43892258405685425, + "learning_rate": 0.00019495414308786909, + "loss": 0.2298, + "step": 1010 + }, + { + "epoch": 0.20461445051608987, + "grad_norm": 0.37740859389305115, + "learning_rate": 0.00019494416223706274, + "loss": 0.2645, + "step": 1011 + }, + { + "epoch": 0.20481683869662012, + "grad_norm": 0.34194040298461914, + "learning_rate": 0.0001949341717808023, + "loss": 0.2501, + "step": 1012 + }, + { + "epoch": 0.20501922687715038, + "grad_norm": 0.3642720580101013, + "learning_rate": 0.0001949241717200985, + "loss": 0.2856, + "step": 1013 + }, + { + "epoch": 0.20522161505768063, + "grad_norm": 0.34601861238479614, + "learning_rate": 0.00019491416205596305, + "loss": 0.2536, + "step": 1014 + }, + { + "epoch": 0.2054240032382109, + "grad_norm": 0.2937442362308502, + "learning_rate": 0.00019490414278940858, + "loss": 0.2354, + "step": 1015 + }, + { + "epoch": 0.20562639141874114, + "grad_norm": 0.2591923475265503, + "learning_rate": 0.00019489411392144875, + "loss": 0.2362, + "step": 1016 + }, + { + "epoch": 0.2058287795992714, + "grad_norm": 0.32309219241142273, + "learning_rate": 0.00019488407545309824, + "loss": 0.288, + "step": 1017 + }, + { + "epoch": 0.20603116777980165, + "grad_norm": 0.4047209918498993, + "learning_rate": 0.00019487402738537255, + "loss": 0.2606, + "step": 1018 + }, + { + "epoch": 0.2062335559603319, + "grad_norm": 0.35363447666168213, + "learning_rate": 0.00019486396971928827, + "loss": 0.2832, + "step": 1019 + }, + { + "epoch": 0.20643594414086217, + "grad_norm": 0.6062155961990356, + "learning_rate": 0.00019485390245586293, + "loss": 0.2238, + "step": 1020 + }, + { + "epoch": 0.20663833232139242, + "grad_norm": 0.3819604516029358, + "learning_rate": 0.00019484382559611504, + "loss": 0.2462, + "step": 1021 + }, + { + "epoch": 0.20684072050192268, + "grad_norm": 0.3663135766983032, + "learning_rate": 0.00019483373914106405, + "loss": 0.2344, + "step": 1022 + }, + { + "epoch": 0.20704310868245293, + "grad_norm": 0.9232494831085205, + "learning_rate": 0.0001948236430917304, + "loss": 0.2962, + "step": 1023 + }, + { + "epoch": 0.20724549686298321, + "grad_norm": 0.37580475211143494, + "learning_rate": 0.00019481353744913552, + "loss": 0.2449, + "step": 1024 + }, + { + "epoch": 0.20744788504351347, + "grad_norm": 0.2748352587223053, + "learning_rate": 0.0001948034222143018, + "loss": 0.2154, + "step": 1025 + }, + { + "epoch": 0.20765027322404372, + "grad_norm": 0.2947147786617279, + "learning_rate": 0.0001947932973882526, + "loss": 0.2678, + "step": 1026 + }, + { + "epoch": 0.20785266140457398, + "grad_norm": 0.579681932926178, + "learning_rate": 0.00019478316297201218, + "loss": 0.2584, + "step": 1027 + }, + { + "epoch": 0.20805504958510423, + "grad_norm": 0.2945061922073364, + "learning_rate": 0.0001947730189666059, + "loss": 0.2124, + "step": 1028 + }, + { + "epoch": 0.2082574377656345, + "grad_norm": 0.3777732849121094, + "learning_rate": 0.00019476286537306004, + "loss": 0.2362, + "step": 1029 + }, + { + "epoch": 0.20845982594616475, + "grad_norm": 0.3627498745918274, + "learning_rate": 0.00019475270219240174, + "loss": 0.2156, + "step": 1030 + }, + { + "epoch": 0.208662214126695, + "grad_norm": 0.38102152943611145, + "learning_rate": 0.0001947425294256593, + "loss": 0.2703, + "step": 1031 + }, + { + "epoch": 0.20886460230722526, + "grad_norm": 0.380249559879303, + "learning_rate": 0.00019473234707386184, + "loss": 0.2587, + "step": 1032 + }, + { + "epoch": 0.2090669904877555, + "grad_norm": 0.38338467478752136, + "learning_rate": 0.0001947221551380395, + "loss": 0.2455, + "step": 1033 + }, + { + "epoch": 0.20926937866828577, + "grad_norm": 0.5233322381973267, + "learning_rate": 0.00019471195361922346, + "loss": 0.2585, + "step": 1034 + }, + { + "epoch": 0.20947176684881602, + "grad_norm": 0.3479728698730469, + "learning_rate": 0.0001947017425184457, + "loss": 0.237, + "step": 1035 + }, + { + "epoch": 0.20967415502934628, + "grad_norm": 0.3039289116859436, + "learning_rate": 0.00019469152183673936, + "loss": 0.2524, + "step": 1036 + }, + { + "epoch": 0.20987654320987653, + "grad_norm": 0.5088594555854797, + "learning_rate": 0.00019468129157513842, + "loss": 0.2509, + "step": 1037 + }, + { + "epoch": 0.2100789313904068, + "grad_norm": 0.3189416825771332, + "learning_rate": 0.00019467105173467787, + "loss": 0.2619, + "step": 1038 + }, + { + "epoch": 0.21028131957093707, + "grad_norm": 0.5125216245651245, + "learning_rate": 0.00019466080231639367, + "loss": 0.2573, + "step": 1039 + }, + { + "epoch": 0.21048370775146732, + "grad_norm": 0.3192436099052429, + "learning_rate": 0.0001946505433213228, + "loss": 0.2115, + "step": 1040 + }, + { + "epoch": 0.21068609593199758, + "grad_norm": 0.43862384557724, + "learning_rate": 0.00019464027475050305, + "loss": 0.2914, + "step": 1041 + }, + { + "epoch": 0.21088848411252784, + "grad_norm": 0.4017934799194336, + "learning_rate": 0.0001946299966049734, + "loss": 0.2352, + "step": 1042 + }, + { + "epoch": 0.2110908722930581, + "grad_norm": 0.45070984959602356, + "learning_rate": 0.0001946197088857736, + "loss": 0.2795, + "step": 1043 + }, + { + "epoch": 0.21129326047358835, + "grad_norm": 0.45495152473449707, + "learning_rate": 0.0001946094115939445, + "loss": 0.2179, + "step": 1044 + }, + { + "epoch": 0.2114956486541186, + "grad_norm": 0.3081578314304352, + "learning_rate": 0.00019459910473052788, + "loss": 0.2281, + "step": 1045 + }, + { + "epoch": 0.21169803683464886, + "grad_norm": 0.36594170331954956, + "learning_rate": 0.00019458878829656644, + "loss": 0.2252, + "step": 1046 + }, + { + "epoch": 0.2119004250151791, + "grad_norm": 0.3685033321380615, + "learning_rate": 0.0001945784622931039, + "loss": 0.2563, + "step": 1047 + }, + { + "epoch": 0.21210281319570937, + "grad_norm": 0.37463536858558655, + "learning_rate": 0.00019456812672118498, + "loss": 0.2342, + "step": 1048 + }, + { + "epoch": 0.21230520137623962, + "grad_norm": 0.4732096493244171, + "learning_rate": 0.00019455778158185524, + "loss": 0.2471, + "step": 1049 + }, + { + "epoch": 0.21250758955676988, + "grad_norm": 0.3210441470146179, + "learning_rate": 0.0001945474268761614, + "loss": 0.2513, + "step": 1050 + }, + { + "epoch": 0.21250758955676988, + "eval_loss": 0.29186126589775085, + "eval_runtime": 0.7396, + "eval_samples_per_second": 6.761, + "eval_steps_per_second": 1.352, + "step": 1050 + }, + { + "epoch": 0.21270997773730013, + "grad_norm": 0.3155227303504944, + "learning_rate": 0.00019453706260515093, + "loss": 0.2633, + "step": 1051 + }, + { + "epoch": 0.2129123659178304, + "grad_norm": 0.33783820271492004, + "learning_rate": 0.00019452668876987248, + "loss": 0.2694, + "step": 1052 + }, + { + "epoch": 0.21311475409836064, + "grad_norm": 0.2770099639892578, + "learning_rate": 0.00019451630537137548, + "loss": 0.2462, + "step": 1053 + }, + { + "epoch": 0.21331714227889093, + "grad_norm": 0.32644253969192505, + "learning_rate": 0.00019450591241071047, + "loss": 0.2395, + "step": 1054 + }, + { + "epoch": 0.21351953045942118, + "grad_norm": 0.38256141543388367, + "learning_rate": 0.00019449550988892892, + "loss": 0.2407, + "step": 1055 + }, + { + "epoch": 0.21372191863995144, + "grad_norm": 0.30983036756515503, + "learning_rate": 0.00019448509780708312, + "loss": 0.2464, + "step": 1056 + }, + { + "epoch": 0.2139243068204817, + "grad_norm": 0.3679855465888977, + "learning_rate": 0.00019447467616622662, + "loss": 0.2608, + "step": 1057 + }, + { + "epoch": 0.21412669500101195, + "grad_norm": 0.3574540615081787, + "learning_rate": 0.00019446424496741366, + "loss": 0.2257, + "step": 1058 + }, + { + "epoch": 0.2143290831815422, + "grad_norm": 0.38781675696372986, + "learning_rate": 0.0001944538042116996, + "loss": 0.2339, + "step": 1059 + }, + { + "epoch": 0.21453147136207246, + "grad_norm": 0.39075520634651184, + "learning_rate": 0.00019444335390014073, + "loss": 0.269, + "step": 1060 + }, + { + "epoch": 0.2147338595426027, + "grad_norm": 0.4448465406894684, + "learning_rate": 0.0001944328940337943, + "loss": 0.2571, + "step": 1061 + }, + { + "epoch": 0.21493624772313297, + "grad_norm": 0.3397972583770752, + "learning_rate": 0.0001944224246137185, + "loss": 0.2667, + "step": 1062 + }, + { + "epoch": 0.21513863590366322, + "grad_norm": 0.3251417875289917, + "learning_rate": 0.00019441194564097258, + "loss": 0.2203, + "step": 1063 + }, + { + "epoch": 0.21534102408419348, + "grad_norm": 0.3613958954811096, + "learning_rate": 0.00019440145711661664, + "loss": 0.2751, + "step": 1064 + }, + { + "epoch": 0.21554341226472373, + "grad_norm": 0.38893675804138184, + "learning_rate": 0.0001943909590417118, + "loss": 0.236, + "step": 1065 + }, + { + "epoch": 0.215745800445254, + "grad_norm": 0.3045949935913086, + "learning_rate": 0.00019438045141732016, + "loss": 0.2284, + "step": 1066 + }, + { + "epoch": 0.21594818862578424, + "grad_norm": 0.5375556349754333, + "learning_rate": 0.00019436993424450476, + "loss": 0.2692, + "step": 1067 + }, + { + "epoch": 0.2161505768063145, + "grad_norm": 0.3492136299610138, + "learning_rate": 0.00019435940752432967, + "loss": 0.2843, + "step": 1068 + }, + { + "epoch": 0.21635296498684478, + "grad_norm": 0.23821096122264862, + "learning_rate": 0.00019434887125785975, + "loss": 0.1821, + "step": 1069 + }, + { + "epoch": 0.21655535316737504, + "grad_norm": 0.38852131366729736, + "learning_rate": 0.00019433832544616108, + "loss": 0.2703, + "step": 1070 + }, + { + "epoch": 0.2167577413479053, + "grad_norm": 0.30407455563545227, + "learning_rate": 0.00019432777009030053, + "loss": 0.253, + "step": 1071 + }, + { + "epoch": 0.21696012952843555, + "grad_norm": 0.47758540511131287, + "learning_rate": 0.00019431720519134596, + "loss": 0.2616, + "step": 1072 + }, + { + "epoch": 0.2171625177089658, + "grad_norm": 0.3612705171108246, + "learning_rate": 0.0001943066307503662, + "loss": 0.276, + "step": 1073 + }, + { + "epoch": 0.21736490588949606, + "grad_norm": 0.3635235130786896, + "learning_rate": 0.00019429604676843114, + "loss": 0.2374, + "step": 1074 + }, + { + "epoch": 0.2175672940700263, + "grad_norm": 0.3618505895137787, + "learning_rate": 0.00019428545324661148, + "loss": 0.2657, + "step": 1075 + }, + { + "epoch": 0.21776968225055657, + "grad_norm": 0.42967814207077026, + "learning_rate": 0.00019427485018597897, + "loss": 0.269, + "step": 1076 + }, + { + "epoch": 0.21797207043108682, + "grad_norm": 0.3846936523914337, + "learning_rate": 0.00019426423758760634, + "loss": 0.2277, + "step": 1077 + }, + { + "epoch": 0.21817445861161708, + "grad_norm": 0.3141876459121704, + "learning_rate": 0.00019425361545256727, + "loss": 0.2587, + "step": 1078 + }, + { + "epoch": 0.21837684679214733, + "grad_norm": 0.4163576364517212, + "learning_rate": 0.00019424298378193638, + "loss": 0.24, + "step": 1079 + }, + { + "epoch": 0.2185792349726776, + "grad_norm": 0.30708739161491394, + "learning_rate": 0.00019423234257678925, + "loss": 0.2517, + "step": 1080 + }, + { + "epoch": 0.21878162315320784, + "grad_norm": 0.32779768109321594, + "learning_rate": 0.00019422169183820249, + "loss": 0.2395, + "step": 1081 + }, + { + "epoch": 0.2189840113337381, + "grad_norm": 0.35462939739227295, + "learning_rate": 0.00019421103156725363, + "loss": 0.2727, + "step": 1082 + }, + { + "epoch": 0.21918639951426835, + "grad_norm": 0.3319645822048187, + "learning_rate": 0.00019420036176502107, + "loss": 0.2345, + "step": 1083 + }, + { + "epoch": 0.21938878769479864, + "grad_norm": 0.39992696046829224, + "learning_rate": 0.0001941896824325844, + "loss": 0.2932, + "step": 1084 + }, + { + "epoch": 0.2195911758753289, + "grad_norm": 0.4323594570159912, + "learning_rate": 0.00019417899357102397, + "loss": 0.2737, + "step": 1085 + }, + { + "epoch": 0.21979356405585915, + "grad_norm": 0.31720826029777527, + "learning_rate": 0.00019416829518142118, + "loss": 0.2378, + "step": 1086 + }, + { + "epoch": 0.2199959522363894, + "grad_norm": 0.35780414938926697, + "learning_rate": 0.00019415758726485836, + "loss": 0.2661, + "step": 1087 + }, + { + "epoch": 0.22019834041691966, + "grad_norm": 0.3422088325023651, + "learning_rate": 0.00019414686982241884, + "loss": 0.2429, + "step": 1088 + }, + { + "epoch": 0.2204007285974499, + "grad_norm": 0.33238205313682556, + "learning_rate": 0.00019413614285518693, + "loss": 0.2382, + "step": 1089 + }, + { + "epoch": 0.22060311677798017, + "grad_norm": 0.45054638385772705, + "learning_rate": 0.00019412540636424782, + "loss": 0.2712, + "step": 1090 + }, + { + "epoch": 0.22080550495851042, + "grad_norm": 0.299956738948822, + "learning_rate": 0.00019411466035068776, + "loss": 0.241, + "step": 1091 + }, + { + "epoch": 0.22100789313904068, + "grad_norm": 0.31100958585739136, + "learning_rate": 0.0001941039048155939, + "loss": 0.2306, + "step": 1092 + }, + { + "epoch": 0.22121028131957093, + "grad_norm": 0.3154681921005249, + "learning_rate": 0.00019409313976005436, + "loss": 0.247, + "step": 1093 + }, + { + "epoch": 0.2214126695001012, + "grad_norm": 0.25610601902008057, + "learning_rate": 0.00019408236518515825, + "loss": 0.2112, + "step": 1094 + }, + { + "epoch": 0.22161505768063144, + "grad_norm": 0.3014226257801056, + "learning_rate": 0.00019407158109199565, + "loss": 0.2436, + "step": 1095 + }, + { + "epoch": 0.2218174458611617, + "grad_norm": 0.3093024492263794, + "learning_rate": 0.00019406078748165751, + "loss": 0.2345, + "step": 1096 + }, + { + "epoch": 0.22201983404169195, + "grad_norm": 0.3434992730617523, + "learning_rate": 0.00019404998435523592, + "loss": 0.2294, + "step": 1097 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.32315975427627563, + "learning_rate": 0.00019403917171382373, + "loss": 0.2591, + "step": 1098 + }, + { + "epoch": 0.2224246104027525, + "grad_norm": 0.3836386799812317, + "learning_rate": 0.0001940283495585149, + "loss": 0.2431, + "step": 1099 + }, + { + "epoch": 0.22262699858328275, + "grad_norm": 0.27538156509399414, + "learning_rate": 0.00019401751789040428, + "loss": 0.2458, + "step": 1100 + }, + { + "epoch": 0.22262699858328275, + "eval_loss": 0.2982866168022156, + "eval_runtime": 0.7369, + "eval_samples_per_second": 6.785, + "eval_steps_per_second": 1.357, + "step": 1100 + }, + { + "epoch": 0.222829386763813, + "grad_norm": 0.3932587206363678, + "learning_rate": 0.0001940066767105877, + "loss": 0.2303, + "step": 1101 + }, + { + "epoch": 0.22303177494434326, + "grad_norm": 0.3903684616088867, + "learning_rate": 0.000193995826020162, + "loss": 0.2362, + "step": 1102 + }, + { + "epoch": 0.2232341631248735, + "grad_norm": 0.35070565342903137, + "learning_rate": 0.0001939849658202249, + "loss": 0.2352, + "step": 1103 + }, + { + "epoch": 0.22343655130540377, + "grad_norm": 0.29039356112480164, + "learning_rate": 0.00019397409611187513, + "loss": 0.2248, + "step": 1104 + }, + { + "epoch": 0.22363893948593402, + "grad_norm": 0.29755696654319763, + "learning_rate": 0.00019396321689621238, + "loss": 0.2605, + "step": 1105 + }, + { + "epoch": 0.22384132766646428, + "grad_norm": 0.713554322719574, + "learning_rate": 0.0001939523281743373, + "loss": 0.2707, + "step": 1106 + }, + { + "epoch": 0.22404371584699453, + "grad_norm": 0.3808262348175049, + "learning_rate": 0.00019394142994735145, + "loss": 0.2433, + "step": 1107 + }, + { + "epoch": 0.2242461040275248, + "grad_norm": 0.31507617235183716, + "learning_rate": 0.00019393052221635746, + "loss": 0.2235, + "step": 1108 + }, + { + "epoch": 0.22444849220805504, + "grad_norm": 0.31799739599227905, + "learning_rate": 0.00019391960498245882, + "loss": 0.2398, + "step": 1109 + }, + { + "epoch": 0.2246508803885853, + "grad_norm": 0.36539918184280396, + "learning_rate": 0.00019390867824676004, + "loss": 0.2539, + "step": 1110 + }, + { + "epoch": 0.22485326856911556, + "grad_norm": 0.313516229391098, + "learning_rate": 0.00019389774201036657, + "loss": 0.205, + "step": 1111 + }, + { + "epoch": 0.2250556567496458, + "grad_norm": 0.48267585039138794, + "learning_rate": 0.00019388679627438483, + "loss": 0.2819, + "step": 1112 + }, + { + "epoch": 0.22525804493017607, + "grad_norm": 0.25217655301094055, + "learning_rate": 0.00019387584103992218, + "loss": 0.2286, + "step": 1113 + }, + { + "epoch": 0.22546043311070635, + "grad_norm": 0.3520773649215698, + "learning_rate": 0.00019386487630808697, + "loss": 0.2477, + "step": 1114 + }, + { + "epoch": 0.2256628212912366, + "grad_norm": 0.3652741611003876, + "learning_rate": 0.00019385390207998847, + "loss": 0.2646, + "step": 1115 + }, + { + "epoch": 0.22586520947176686, + "grad_norm": 0.45731592178344727, + "learning_rate": 0.00019384291835673696, + "loss": 0.251, + "step": 1116 + }, + { + "epoch": 0.22606759765229711, + "grad_norm": 0.3881213963031769, + "learning_rate": 0.00019383192513944367, + "loss": 0.229, + "step": 1117 + }, + { + "epoch": 0.22626998583282737, + "grad_norm": 0.32269710302352905, + "learning_rate": 0.00019382092242922075, + "loss": 0.2453, + "step": 1118 + }, + { + "epoch": 0.22647237401335762, + "grad_norm": 0.3093521296977997, + "learning_rate": 0.00019380991022718133, + "loss": 0.2428, + "step": 1119 + }, + { + "epoch": 0.22667476219388788, + "grad_norm": 0.3101629912853241, + "learning_rate": 0.00019379888853443954, + "loss": 0.2236, + "step": 1120 + }, + { + "epoch": 0.22687715037441814, + "grad_norm": 0.32502663135528564, + "learning_rate": 0.00019378785735211046, + "loss": 0.2685, + "step": 1121 + }, + { + "epoch": 0.2270795385549484, + "grad_norm": 0.36589106917381287, + "learning_rate": 0.00019377681668131006, + "loss": 0.247, + "step": 1122 + }, + { + "epoch": 0.22728192673547865, + "grad_norm": 0.32985422015190125, + "learning_rate": 0.00019376576652315532, + "loss": 0.2331, + "step": 1123 + }, + { + "epoch": 0.2274843149160089, + "grad_norm": 0.3607713580131531, + "learning_rate": 0.00019375470687876424, + "loss": 0.266, + "step": 1124 + }, + { + "epoch": 0.22768670309653916, + "grad_norm": 0.2738536298274994, + "learning_rate": 0.0001937436377492556, + "loss": 0.2187, + "step": 1125 + }, + { + "epoch": 0.2278890912770694, + "grad_norm": 0.45239001512527466, + "learning_rate": 0.0001937325591357494, + "loss": 0.2882, + "step": 1126 + }, + { + "epoch": 0.22809147945759967, + "grad_norm": 0.34548941254615784, + "learning_rate": 0.00019372147103936636, + "loss": 0.2365, + "step": 1127 + }, + { + "epoch": 0.22829386763812992, + "grad_norm": 0.503173828125, + "learning_rate": 0.00019371037346122832, + "loss": 0.2808, + "step": 1128 + }, + { + "epoch": 0.22849625581866018, + "grad_norm": 0.3561367094516754, + "learning_rate": 0.00019369926640245796, + "loss": 0.2423, + "step": 1129 + }, + { + "epoch": 0.22869864399919046, + "grad_norm": 0.2863787114620209, + "learning_rate": 0.00019368814986417897, + "loss": 0.2339, + "step": 1130 + }, + { + "epoch": 0.22890103217972071, + "grad_norm": 0.3625780940055847, + "learning_rate": 0.00019367702384751607, + "loss": 0.2037, + "step": 1131 + }, + { + "epoch": 0.22910342036025097, + "grad_norm": 0.28424787521362305, + "learning_rate": 0.00019366588835359485, + "loss": 0.2404, + "step": 1132 + }, + { + "epoch": 0.22930580854078123, + "grad_norm": 0.30158454179763794, + "learning_rate": 0.00019365474338354184, + "loss": 0.2466, + "step": 1133 + }, + { + "epoch": 0.22950819672131148, + "grad_norm": 0.3348842263221741, + "learning_rate": 0.0001936435889384846, + "loss": 0.2854, + "step": 1134 + }, + { + "epoch": 0.22971058490184174, + "grad_norm": 0.4381720721721649, + "learning_rate": 0.00019363242501955168, + "loss": 0.2493, + "step": 1135 + }, + { + "epoch": 0.229912973082372, + "grad_norm": 0.40765929222106934, + "learning_rate": 0.00019362125162787242, + "loss": 0.2845, + "step": 1136 + }, + { + "epoch": 0.23011536126290225, + "grad_norm": 0.329726904630661, + "learning_rate": 0.0001936100687645773, + "loss": 0.2795, + "step": 1137 + }, + { + "epoch": 0.2303177494434325, + "grad_norm": 0.2915996015071869, + "learning_rate": 0.00019359887643079766, + "loss": 0.2134, + "step": 1138 + }, + { + "epoch": 0.23052013762396276, + "grad_norm": 0.3605322241783142, + "learning_rate": 0.0001935876746276658, + "loss": 0.2622, + "step": 1139 + }, + { + "epoch": 0.230722525804493, + "grad_norm": 0.4158976674079895, + "learning_rate": 0.00019357646335631503, + "loss": 0.2564, + "step": 1140 + }, + { + "epoch": 0.23092491398502327, + "grad_norm": 0.5059180855751038, + "learning_rate": 0.0001935652426178796, + "loss": 0.2614, + "step": 1141 + }, + { + "epoch": 0.23112730216555352, + "grad_norm": 0.2895376980304718, + "learning_rate": 0.00019355401241349468, + "loss": 0.2556, + "step": 1142 + }, + { + "epoch": 0.23132969034608378, + "grad_norm": 0.7368476390838623, + "learning_rate": 0.00019354277274429645, + "loss": 0.2987, + "step": 1143 + }, + { + "epoch": 0.23153207852661403, + "grad_norm": 0.3806043565273285, + "learning_rate": 0.000193531523611422, + "loss": 0.2894, + "step": 1144 + }, + { + "epoch": 0.23173446670714432, + "grad_norm": 0.3642440140247345, + "learning_rate": 0.0001935202650160094, + "loss": 0.2633, + "step": 1145 + }, + { + "epoch": 0.23193685488767457, + "grad_norm": 0.38270601630210876, + "learning_rate": 0.0001935089969591977, + "loss": 0.2258, + "step": 1146 + }, + { + "epoch": 0.23213924306820483, + "grad_norm": 0.3156524896621704, + "learning_rate": 0.00019349771944212685, + "loss": 0.2552, + "step": 1147 + }, + { + "epoch": 0.23234163124873508, + "grad_norm": 0.3413656949996948, + "learning_rate": 0.0001934864324659378, + "loss": 0.2604, + "step": 1148 + }, + { + "epoch": 0.23254401942926534, + "grad_norm": 0.2893592119216919, + "learning_rate": 0.00019347513603177246, + "loss": 0.2305, + "step": 1149 + }, + { + "epoch": 0.2327464076097956, + "grad_norm": 0.4488675892353058, + "learning_rate": 0.00019346383014077372, + "loss": 0.2278, + "step": 1150 + }, + { + "epoch": 0.2327464076097956, + "eval_loss": 0.284952312707901, + "eval_runtime": 0.7418, + "eval_samples_per_second": 6.741, + "eval_steps_per_second": 1.348, + "step": 1150 + }, + { + "epoch": 0.23294879579032585, + "grad_norm": 0.38737061619758606, + "learning_rate": 0.00019345251479408528, + "loss": 0.2428, + "step": 1151 + }, + { + "epoch": 0.2331511839708561, + "grad_norm": 0.5436646342277527, + "learning_rate": 0.00019344118999285202, + "loss": 0.2633, + "step": 1152 + }, + { + "epoch": 0.23335357215138636, + "grad_norm": 0.3771059215068817, + "learning_rate": 0.00019342985573821963, + "loss": 0.2295, + "step": 1153 + }, + { + "epoch": 0.2335559603319166, + "grad_norm": 0.3182806372642517, + "learning_rate": 0.00019341851203133476, + "loss": 0.2379, + "step": 1154 + }, + { + "epoch": 0.23375834851244687, + "grad_norm": 0.3835807740688324, + "learning_rate": 0.0001934071588733451, + "loss": 0.2702, + "step": 1155 + }, + { + "epoch": 0.23396073669297712, + "grad_norm": 0.29284048080444336, + "learning_rate": 0.0001933957962653992, + "loss": 0.2558, + "step": 1156 + }, + { + "epoch": 0.23416312487350738, + "grad_norm": 0.2697771489620209, + "learning_rate": 0.00019338442420864665, + "loss": 0.2362, + "step": 1157 + }, + { + "epoch": 0.23436551305403763, + "grad_norm": 0.44306814670562744, + "learning_rate": 0.0001933730427042379, + "loss": 0.2421, + "step": 1158 + }, + { + "epoch": 0.2345679012345679, + "grad_norm": 0.2870837152004242, + "learning_rate": 0.00019336165175332445, + "loss": 0.2491, + "step": 1159 + }, + { + "epoch": 0.23477028941509817, + "grad_norm": 0.2829664647579193, + "learning_rate": 0.00019335025135705869, + "loss": 0.2482, + "step": 1160 + }, + { + "epoch": 0.23497267759562843, + "grad_norm": 0.35894840955734253, + "learning_rate": 0.00019333884151659402, + "loss": 0.2268, + "step": 1161 + }, + { + "epoch": 0.23517506577615868, + "grad_norm": 0.2977736294269562, + "learning_rate": 0.0001933274222330848, + "loss": 0.2323, + "step": 1162 + }, + { + "epoch": 0.23537745395668894, + "grad_norm": 0.3158702552318573, + "learning_rate": 0.00019331599350768622, + "loss": 0.217, + "step": 1163 + }, + { + "epoch": 0.2355798421372192, + "grad_norm": 0.252240926027298, + "learning_rate": 0.00019330455534155458, + "loss": 0.2308, + "step": 1164 + }, + { + "epoch": 0.23578223031774945, + "grad_norm": 0.36202460527420044, + "learning_rate": 0.00019329310773584708, + "loss": 0.2779, + "step": 1165 + }, + { + "epoch": 0.2359846184982797, + "grad_norm": 0.6302611827850342, + "learning_rate": 0.00019328165069172184, + "loss": 0.3018, + "step": 1166 + }, + { + "epoch": 0.23618700667880996, + "grad_norm": 0.2542176842689514, + "learning_rate": 0.00019327018421033798, + "loss": 0.2328, + "step": 1167 + }, + { + "epoch": 0.2363893948593402, + "grad_norm": 0.29622212052345276, + "learning_rate": 0.00019325870829285554, + "loss": 0.2291, + "step": 1168 + }, + { + "epoch": 0.23659178303987047, + "grad_norm": 0.27297911047935486, + "learning_rate": 0.00019324722294043558, + "loss": 0.2508, + "step": 1169 + }, + { + "epoch": 0.23679417122040072, + "grad_norm": 0.4703531563282013, + "learning_rate": 0.00019323572815424002, + "loss": 0.3078, + "step": 1170 + }, + { + "epoch": 0.23699655940093098, + "grad_norm": 0.34398144483566284, + "learning_rate": 0.0001932242239354318, + "loss": 0.2776, + "step": 1171 + }, + { + "epoch": 0.23719894758146123, + "grad_norm": 0.27496278285980225, + "learning_rate": 0.0001932127102851748, + "loss": 0.2555, + "step": 1172 + }, + { + "epoch": 0.2374013357619915, + "grad_norm": 0.2514081597328186, + "learning_rate": 0.00019320118720463382, + "loss": 0.261, + "step": 1173 + }, + { + "epoch": 0.23760372394252174, + "grad_norm": 0.37026259303092957, + "learning_rate": 0.0001931896546949747, + "loss": 0.251, + "step": 1174 + }, + { + "epoch": 0.23780611212305203, + "grad_norm": 0.28460580110549927, + "learning_rate": 0.00019317811275736411, + "loss": 0.2342, + "step": 1175 + }, + { + "epoch": 0.23800850030358228, + "grad_norm": 0.43450552225112915, + "learning_rate": 0.0001931665613929698, + "loss": 0.2765, + "step": 1176 + }, + { + "epoch": 0.23821088848411254, + "grad_norm": 0.27132248878479004, + "learning_rate": 0.00019315500060296037, + "loss": 0.2299, + "step": 1177 + }, + { + "epoch": 0.2384132766646428, + "grad_norm": 0.340587317943573, + "learning_rate": 0.00019314343038850546, + "loss": 0.233, + "step": 1178 + }, + { + "epoch": 0.23861566484517305, + "grad_norm": 0.29796266555786133, + "learning_rate": 0.0001931318507507756, + "loss": 0.2512, + "step": 1179 + }, + { + "epoch": 0.2388180530257033, + "grad_norm": 0.303588330745697, + "learning_rate": 0.00019312026169094232, + "loss": 0.2563, + "step": 1180 + }, + { + "epoch": 0.23902044120623356, + "grad_norm": 0.3156834542751312, + "learning_rate": 0.00019310866321017804, + "loss": 0.2848, + "step": 1181 + }, + { + "epoch": 0.2392228293867638, + "grad_norm": 0.3846820294857025, + "learning_rate": 0.00019309705530965623, + "loss": 0.2661, + "step": 1182 + }, + { + "epoch": 0.23942521756729407, + "grad_norm": 0.3869188129901886, + "learning_rate": 0.00019308543799055117, + "loss": 0.2486, + "step": 1183 + }, + { + "epoch": 0.23962760574782432, + "grad_norm": 0.22980014979839325, + "learning_rate": 0.00019307381125403827, + "loss": 0.2238, + "step": 1184 + }, + { + "epoch": 0.23982999392835458, + "grad_norm": 0.43149107694625854, + "learning_rate": 0.00019306217510129374, + "loss": 0.2774, + "step": 1185 + }, + { + "epoch": 0.24003238210888483, + "grad_norm": 0.3303294777870178, + "learning_rate": 0.00019305052953349483, + "loss": 0.2472, + "step": 1186 + }, + { + "epoch": 0.2402347702894151, + "grad_norm": 0.3163663148880005, + "learning_rate": 0.0001930388745518197, + "loss": 0.2756, + "step": 1187 + }, + { + "epoch": 0.24043715846994534, + "grad_norm": 0.26152318716049194, + "learning_rate": 0.0001930272101574475, + "loss": 0.2315, + "step": 1188 + }, + { + "epoch": 0.2406395466504756, + "grad_norm": 0.3256945013999939, + "learning_rate": 0.00019301553635155832, + "loss": 0.2446, + "step": 1189 + }, + { + "epoch": 0.24084193483100588, + "grad_norm": 0.4159514605998993, + "learning_rate": 0.00019300385313533313, + "loss": 0.2739, + "step": 1190 + }, + { + "epoch": 0.24104432301153614, + "grad_norm": 0.37670719623565674, + "learning_rate": 0.00019299216050995397, + "loss": 0.2332, + "step": 1191 + }, + { + "epoch": 0.2412467111920664, + "grad_norm": 0.35532158613204956, + "learning_rate": 0.00019298045847660378, + "loss": 0.2533, + "step": 1192 + }, + { + "epoch": 0.24144909937259665, + "grad_norm": 0.3971513509750366, + "learning_rate": 0.00019296874703646642, + "loss": 0.2385, + "step": 1193 + }, + { + "epoch": 0.2416514875531269, + "grad_norm": 0.32217881083488464, + "learning_rate": 0.00019295702619072675, + "loss": 0.2215, + "step": 1194 + }, + { + "epoch": 0.24185387573365716, + "grad_norm": 0.2856425344944, + "learning_rate": 0.00019294529594057056, + "loss": 0.2388, + "step": 1195 + }, + { + "epoch": 0.24205626391418741, + "grad_norm": 0.34437721967697144, + "learning_rate": 0.0001929335562871846, + "loss": 0.2309, + "step": 1196 + }, + { + "epoch": 0.24225865209471767, + "grad_norm": 0.32238832116127014, + "learning_rate": 0.00019292180723175654, + "loss": 0.2578, + "step": 1197 + }, + { + "epoch": 0.24246104027524792, + "grad_norm": 0.27359238266944885, + "learning_rate": 0.00019291004877547505, + "loss": 0.2259, + "step": 1198 + }, + { + "epoch": 0.24266342845577818, + "grad_norm": 0.2698918282985687, + "learning_rate": 0.0001928982809195297, + "loss": 0.198, + "step": 1199 + }, + { + "epoch": 0.24286581663630843, + "grad_norm": 0.30236271023750305, + "learning_rate": 0.00019288650366511108, + "loss": 0.234, + "step": 1200 + }, + { + "epoch": 0.24286581663630843, + "eval_loss": 0.30198293924331665, + "eval_runtime": 0.74, + "eval_samples_per_second": 6.757, + "eval_steps_per_second": 1.351, + "step": 1200 + }, + { + "epoch": 0.2430682048168387, + "grad_norm": 0.3634655773639679, + "learning_rate": 0.00019287471701341064, + "loss": 0.2727, + "step": 1201 + }, + { + "epoch": 0.24327059299736895, + "grad_norm": 0.29206645488739014, + "learning_rate": 0.00019286292096562087, + "loss": 0.2519, + "step": 1202 + }, + { + "epoch": 0.2434729811778992, + "grad_norm": 0.38446906208992004, + "learning_rate": 0.00019285111552293517, + "loss": 0.2559, + "step": 1203 + }, + { + "epoch": 0.24367536935842946, + "grad_norm": 0.3113996982574463, + "learning_rate": 0.0001928393006865479, + "loss": 0.2475, + "step": 1204 + }, + { + "epoch": 0.24387775753895974, + "grad_norm": 0.34893080592155457, + "learning_rate": 0.00019282747645765427, + "loss": 0.2776, + "step": 1205 + }, + { + "epoch": 0.24408014571949, + "grad_norm": 0.3176359534263611, + "learning_rate": 0.00019281564283745063, + "loss": 0.2287, + "step": 1206 + }, + { + "epoch": 0.24428253390002025, + "grad_norm": 0.3394664525985718, + "learning_rate": 0.00019280379982713417, + "loss": 0.2335, + "step": 1207 + }, + { + "epoch": 0.2444849220805505, + "grad_norm": 0.35441362857818604, + "learning_rate": 0.000192791947427903, + "loss": 0.2258, + "step": 1208 + }, + { + "epoch": 0.24468731026108076, + "grad_norm": 0.35951903462409973, + "learning_rate": 0.00019278008564095628, + "loss": 0.2358, + "step": 1209 + }, + { + "epoch": 0.24488969844161101, + "grad_norm": 0.28018155694007874, + "learning_rate": 0.00019276821446749398, + "loss": 0.2826, + "step": 1210 + }, + { + "epoch": 0.24509208662214127, + "grad_norm": 0.2939336597919464, + "learning_rate": 0.00019275633390871717, + "loss": 0.2652, + "step": 1211 + }, + { + "epoch": 0.24529447480267152, + "grad_norm": 0.27605128288269043, + "learning_rate": 0.0001927444439658278, + "loss": 0.2196, + "step": 1212 + }, + { + "epoch": 0.24549686298320178, + "grad_norm": 0.28433987498283386, + "learning_rate": 0.00019273254464002867, + "loss": 0.2266, + "step": 1213 + }, + { + "epoch": 0.24569925116373204, + "grad_norm": 0.3328288793563843, + "learning_rate": 0.00019272063593252377, + "loss": 0.2314, + "step": 1214 + }, + { + "epoch": 0.2459016393442623, + "grad_norm": 0.3645075261592865, + "learning_rate": 0.0001927087178445178, + "loss": 0.2484, + "step": 1215 + }, + { + "epoch": 0.24610402752479255, + "grad_norm": 0.2907272279262543, + "learning_rate": 0.00019269679037721654, + "loss": 0.2077, + "step": 1216 + }, + { + "epoch": 0.2463064157053228, + "grad_norm": 0.3099174201488495, + "learning_rate": 0.00019268485353182672, + "loss": 0.2581, + "step": 1217 + }, + { + "epoch": 0.24650880388585306, + "grad_norm": 0.308207631111145, + "learning_rate": 0.00019267290730955595, + "loss": 0.2416, + "step": 1218 + }, + { + "epoch": 0.2467111920663833, + "grad_norm": 0.2589235007762909, + "learning_rate": 0.00019266095171161277, + "loss": 0.2285, + "step": 1219 + }, + { + "epoch": 0.24691358024691357, + "grad_norm": 0.4162542223930359, + "learning_rate": 0.00019264898673920683, + "loss": 0.2631, + "step": 1220 + }, + { + "epoch": 0.24711596842744385, + "grad_norm": 0.37113407254219055, + "learning_rate": 0.00019263701239354854, + "loss": 0.2464, + "step": 1221 + }, + { + "epoch": 0.2473183566079741, + "grad_norm": 0.2872184216976166, + "learning_rate": 0.0001926250286758494, + "loss": 0.25, + "step": 1222 + }, + { + "epoch": 0.24752074478850436, + "grad_norm": 0.3551543056964874, + "learning_rate": 0.0001926130355873217, + "loss": 0.2656, + "step": 1223 + }, + { + "epoch": 0.24772313296903462, + "grad_norm": 0.3972734808921814, + "learning_rate": 0.0001926010331291789, + "loss": 0.226, + "step": 1224 + }, + { + "epoch": 0.24792552114956487, + "grad_norm": 0.27993929386138916, + "learning_rate": 0.00019258902130263517, + "loss": 0.2283, + "step": 1225 + }, + { + "epoch": 0.24812790933009513, + "grad_norm": 0.26829513907432556, + "learning_rate": 0.00019257700010890581, + "loss": 0.2235, + "step": 1226 + }, + { + "epoch": 0.24833029751062538, + "grad_norm": 0.3441103398799896, + "learning_rate": 0.00019256496954920697, + "loss": 0.2574, + "step": 1227 + }, + { + "epoch": 0.24853268569115564, + "grad_norm": 0.3490145206451416, + "learning_rate": 0.0001925529296247558, + "loss": 0.303, + "step": 1228 + }, + { + "epoch": 0.2487350738716859, + "grad_norm": 0.33735692501068115, + "learning_rate": 0.00019254088033677036, + "loss": 0.2282, + "step": 1229 + }, + { + "epoch": 0.24893746205221615, + "grad_norm": 0.2959878742694855, + "learning_rate": 0.00019252882168646965, + "loss": 0.2375, + "step": 1230 + }, + { + "epoch": 0.2491398502327464, + "grad_norm": 0.37007415294647217, + "learning_rate": 0.00019251675367507368, + "loss": 0.2432, + "step": 1231 + }, + { + "epoch": 0.24934223841327666, + "grad_norm": 0.3428284525871277, + "learning_rate": 0.00019250467630380332, + "loss": 0.2706, + "step": 1232 + }, + { + "epoch": 0.2495446265938069, + "grad_norm": 0.3415639400482178, + "learning_rate": 0.00019249258957388047, + "loss": 0.2563, + "step": 1233 + }, + { + "epoch": 0.24974701477433717, + "grad_norm": 0.34776055812835693, + "learning_rate": 0.0001924804934865279, + "loss": 0.28, + "step": 1234 + }, + { + "epoch": 0.24994940295486742, + "grad_norm": 0.35156211256980896, + "learning_rate": 0.0001924683880429694, + "loss": 0.2157, + "step": 1235 + }, + { + "epoch": 0.2501517911353977, + "grad_norm": 0.2750054597854614, + "learning_rate": 0.00019245627324442966, + "loss": 0.2524, + "step": 1236 + }, + { + "epoch": 0.25035417931592796, + "grad_norm": 0.27282828092575073, + "learning_rate": 0.0001924441490921343, + "loss": 0.1987, + "step": 1237 + }, + { + "epoch": 0.2505565674964582, + "grad_norm": 0.3300628960132599, + "learning_rate": 0.00019243201558730996, + "loss": 0.2745, + "step": 1238 + }, + { + "epoch": 0.25075895567698847, + "grad_norm": 0.354278028011322, + "learning_rate": 0.00019241987273118416, + "loss": 0.2715, + "step": 1239 + }, + { + "epoch": 0.2509613438575187, + "grad_norm": 0.2568090856075287, + "learning_rate": 0.0001924077205249854, + "loss": 0.2251, + "step": 1240 + }, + { + "epoch": 0.251163732038049, + "grad_norm": 0.3392896354198456, + "learning_rate": 0.00019239555896994308, + "loss": 0.224, + "step": 1241 + }, + { + "epoch": 0.25136612021857924, + "grad_norm": 0.2745809853076935, + "learning_rate": 0.0001923833880672876, + "loss": 0.2355, + "step": 1242 + }, + { + "epoch": 0.2515685083991095, + "grad_norm": 0.4213380813598633, + "learning_rate": 0.00019237120781825028, + "loss": 0.242, + "step": 1243 + }, + { + "epoch": 0.25177089657963975, + "grad_norm": 0.4063540995121002, + "learning_rate": 0.00019235901822406342, + "loss": 0.3008, + "step": 1244 + }, + { + "epoch": 0.25197328476017, + "grad_norm": 0.3177233934402466, + "learning_rate": 0.00019234681928596018, + "loss": 0.2917, + "step": 1245 + }, + { + "epoch": 0.25217567294070026, + "grad_norm": 0.2831498086452484, + "learning_rate": 0.0001923346110051748, + "loss": 0.224, + "step": 1246 + }, + { + "epoch": 0.2523780611212305, + "grad_norm": 0.26334744691848755, + "learning_rate": 0.00019232239338294225, + "loss": 0.2505, + "step": 1247 + }, + { + "epoch": 0.25258044930176077, + "grad_norm": 0.391368567943573, + "learning_rate": 0.0001923101664204987, + "loss": 0.2583, + "step": 1248 + }, + { + "epoch": 0.252782837482291, + "grad_norm": 0.44112735986709595, + "learning_rate": 0.00019229793011908114, + "loss": 0.2568, + "step": 1249 + }, + { + "epoch": 0.2529852256628213, + "grad_norm": 0.2900398373603821, + "learning_rate": 0.00019228568447992746, + "loss": 0.2362, + "step": 1250 + }, + { + "epoch": 0.2529852256628213, + "eval_loss": 0.29207420349121094, + "eval_runtime": 0.7379, + "eval_samples_per_second": 6.776, + "eval_steps_per_second": 1.355, + "step": 1250 + }, + { + "epoch": 0.25318761384335153, + "grad_norm": 0.41520947217941284, + "learning_rate": 0.00019227342950427657, + "loss": 0.2584, + "step": 1251 + }, + { + "epoch": 0.2533900020238818, + "grad_norm": 0.2793034613132477, + "learning_rate": 0.0001922611651933683, + "loss": 0.2594, + "step": 1252 + }, + { + "epoch": 0.25359239020441204, + "grad_norm": 0.3332253396511078, + "learning_rate": 0.00019224889154844342, + "loss": 0.2877, + "step": 1253 + }, + { + "epoch": 0.2537947783849423, + "grad_norm": 0.343176931142807, + "learning_rate": 0.00019223660857074364, + "loss": 0.2364, + "step": 1254 + }, + { + "epoch": 0.25399716656547255, + "grad_norm": 0.3649160861968994, + "learning_rate": 0.00019222431626151164, + "loss": 0.2773, + "step": 1255 + }, + { + "epoch": 0.2541995547460028, + "grad_norm": 0.29223141074180603, + "learning_rate": 0.00019221201462199102, + "loss": 0.2687, + "step": 1256 + }, + { + "epoch": 0.25440194292653306, + "grad_norm": 0.3030182421207428, + "learning_rate": 0.00019219970365342635, + "loss": 0.2394, + "step": 1257 + }, + { + "epoch": 0.2546043311070634, + "grad_norm": 0.3496536910533905, + "learning_rate": 0.00019218738335706305, + "loss": 0.2833, + "step": 1258 + }, + { + "epoch": 0.25480671928759363, + "grad_norm": 0.28787243366241455, + "learning_rate": 0.00019217505373414766, + "loss": 0.2556, + "step": 1259 + }, + { + "epoch": 0.2550091074681239, + "grad_norm": 0.36825138330459595, + "learning_rate": 0.0001921627147859275, + "loss": 0.2762, + "step": 1260 + }, + { + "epoch": 0.25521149564865414, + "grad_norm": 0.2672223150730133, + "learning_rate": 0.0001921503665136509, + "loss": 0.2851, + "step": 1261 + }, + { + "epoch": 0.2554138838291844, + "grad_norm": 0.2490050494670868, + "learning_rate": 0.00019213800891856717, + "loss": 0.2449, + "step": 1262 + }, + { + "epoch": 0.25561627200971465, + "grad_norm": 0.3491511940956116, + "learning_rate": 0.0001921256420019265, + "loss": 0.2671, + "step": 1263 + }, + { + "epoch": 0.2558186601902449, + "grad_norm": 0.31115198135375977, + "learning_rate": 0.00019211326576497998, + "loss": 0.1985, + "step": 1264 + }, + { + "epoch": 0.25602104837077516, + "grad_norm": 0.3278442621231079, + "learning_rate": 0.0001921008802089798, + "loss": 0.2285, + "step": 1265 + }, + { + "epoch": 0.2562234365513054, + "grad_norm": 0.41726815700531006, + "learning_rate": 0.00019208848533517893, + "loss": 0.2733, + "step": 1266 + }, + { + "epoch": 0.25642582473183567, + "grad_norm": 0.29319825768470764, + "learning_rate": 0.00019207608114483145, + "loss": 0.2535, + "step": 1267 + }, + { + "epoch": 0.2566282129123659, + "grad_norm": 0.33001402020454407, + "learning_rate": 0.00019206366763919216, + "loss": 0.2332, + "step": 1268 + }, + { + "epoch": 0.2568306010928962, + "grad_norm": 0.3235301375389099, + "learning_rate": 0.00019205124481951703, + "loss": 0.2793, + "step": 1269 + }, + { + "epoch": 0.25703298927342644, + "grad_norm": 0.3671470880508423, + "learning_rate": 0.0001920388126870628, + "loss": 0.2597, + "step": 1270 + }, + { + "epoch": 0.2572353774539567, + "grad_norm": 0.3819235861301422, + "learning_rate": 0.00019202637124308728, + "loss": 0.2557, + "step": 1271 + }, + { + "epoch": 0.25743776563448695, + "grad_norm": 0.39303115010261536, + "learning_rate": 0.00019201392048884914, + "loss": 0.2645, + "step": 1272 + }, + { + "epoch": 0.2576401538150172, + "grad_norm": 0.2981061041355133, + "learning_rate": 0.000192001460425608, + "loss": 0.2781, + "step": 1273 + }, + { + "epoch": 0.25784254199554746, + "grad_norm": 0.2990420162677765, + "learning_rate": 0.00019198899105462447, + "loss": 0.2646, + "step": 1274 + }, + { + "epoch": 0.2580449301760777, + "grad_norm": 0.39371538162231445, + "learning_rate": 0.00019197651237716005, + "loss": 0.2723, + "step": 1275 + }, + { + "epoch": 0.25824731835660797, + "grad_norm": 0.3517228364944458, + "learning_rate": 0.0001919640243944772, + "loss": 0.2718, + "step": 1276 + }, + { + "epoch": 0.2584497065371382, + "grad_norm": 0.30374854803085327, + "learning_rate": 0.00019195152710783933, + "loss": 0.2358, + "step": 1277 + }, + { + "epoch": 0.2586520947176685, + "grad_norm": 0.3416211009025574, + "learning_rate": 0.0001919390205185108, + "loss": 0.2693, + "step": 1278 + }, + { + "epoch": 0.25885448289819873, + "grad_norm": 0.2955116033554077, + "learning_rate": 0.00019192650462775688, + "loss": 0.2419, + "step": 1279 + }, + { + "epoch": 0.259056871078729, + "grad_norm": 0.30653461813926697, + "learning_rate": 0.00019191397943684377, + "loss": 0.2676, + "step": 1280 + }, + { + "epoch": 0.25925925925925924, + "grad_norm": 0.23594805598258972, + "learning_rate": 0.00019190144494703865, + "loss": 0.2407, + "step": 1281 + }, + { + "epoch": 0.2594616474397895, + "grad_norm": 0.4294579029083252, + "learning_rate": 0.00019188890115960967, + "loss": 0.2699, + "step": 1282 + }, + { + "epoch": 0.25966403562031976, + "grad_norm": 0.362417072057724, + "learning_rate": 0.00019187634807582587, + "loss": 0.2416, + "step": 1283 + }, + { + "epoch": 0.25986642380085, + "grad_norm": 0.3066248595714569, + "learning_rate": 0.00019186378569695716, + "loss": 0.2449, + "step": 1284 + }, + { + "epoch": 0.26006881198138027, + "grad_norm": 0.3117138743400574, + "learning_rate": 0.00019185121402427458, + "loss": 0.2798, + "step": 1285 + }, + { + "epoch": 0.2602712001619105, + "grad_norm": 0.33185073733329773, + "learning_rate": 0.00019183863305904995, + "loss": 0.2752, + "step": 1286 + }, + { + "epoch": 0.2604735883424408, + "grad_norm": 0.36043572425842285, + "learning_rate": 0.00019182604280255604, + "loss": 0.2694, + "step": 1287 + }, + { + "epoch": 0.26067597652297103, + "grad_norm": 0.2410745471715927, + "learning_rate": 0.00019181344325606666, + "loss": 0.1937, + "step": 1288 + }, + { + "epoch": 0.26087836470350134, + "grad_norm": 0.29702746868133545, + "learning_rate": 0.0001918008344208565, + "loss": 0.2393, + "step": 1289 + }, + { + "epoch": 0.2610807528840316, + "grad_norm": 0.2712313234806061, + "learning_rate": 0.00019178821629820117, + "loss": 0.2199, + "step": 1290 + }, + { + "epoch": 0.26128314106456185, + "grad_norm": 0.25044986605644226, + "learning_rate": 0.0001917755888893772, + "loss": 0.2121, + "step": 1291 + }, + { + "epoch": 0.2614855292450921, + "grad_norm": 0.2928940951824188, + "learning_rate": 0.00019176295219566213, + "loss": 0.2684, + "step": 1292 + }, + { + "epoch": 0.26168791742562236, + "grad_norm": 0.2997845709323883, + "learning_rate": 0.00019175030621833446, + "loss": 0.2613, + "step": 1293 + }, + { + "epoch": 0.2618903056061526, + "grad_norm": 0.3501720130443573, + "learning_rate": 0.0001917376509586735, + "loss": 0.2604, + "step": 1294 + }, + { + "epoch": 0.2620926937866829, + "grad_norm": 0.28727906942367554, + "learning_rate": 0.00019172498641795963, + "loss": 0.2018, + "step": 1295 + }, + { + "epoch": 0.26229508196721313, + "grad_norm": 0.2778218984603882, + "learning_rate": 0.00019171231259747405, + "loss": 0.2164, + "step": 1296 + }, + { + "epoch": 0.2624974701477434, + "grad_norm": 0.42163240909576416, + "learning_rate": 0.00019169962949849904, + "loss": 0.2536, + "step": 1297 + }, + { + "epoch": 0.26269985832827364, + "grad_norm": 0.27423128485679626, + "learning_rate": 0.00019168693712231773, + "loss": 0.2335, + "step": 1298 + }, + { + "epoch": 0.2629022465088039, + "grad_norm": 0.3440936505794525, + "learning_rate": 0.00019167423547021418, + "loss": 0.2326, + "step": 1299 + }, + { + "epoch": 0.26310463468933415, + "grad_norm": 0.5512051582336426, + "learning_rate": 0.00019166152454347336, + "loss": 0.2086, + "step": 1300 + }, + { + "epoch": 0.26310463468933415, + "eval_loss": 0.29028138518333435, + "eval_runtime": 0.7404, + "eval_samples_per_second": 6.753, + "eval_steps_per_second": 1.351, + "step": 1300 + }, + { + "epoch": 0.2633070228698644, + "grad_norm": 0.48292115330696106, + "learning_rate": 0.00019164880434338133, + "loss": 0.3138, + "step": 1301 + }, + { + "epoch": 0.26350941105039466, + "grad_norm": 0.4760967791080475, + "learning_rate": 0.00019163607487122494, + "loss": 0.2372, + "step": 1302 + }, + { + "epoch": 0.2637117992309249, + "grad_norm": 0.349447101354599, + "learning_rate": 0.00019162333612829198, + "loss": 0.263, + "step": 1303 + }, + { + "epoch": 0.26391418741145517, + "grad_norm": 0.39979079365730286, + "learning_rate": 0.00019161058811587126, + "loss": 0.2567, + "step": 1304 + }, + { + "epoch": 0.2641165755919854, + "grad_norm": 0.32025107741355896, + "learning_rate": 0.00019159783083525254, + "loss": 0.232, + "step": 1305 + }, + { + "epoch": 0.2643189637725157, + "grad_norm": 0.3130270838737488, + "learning_rate": 0.0001915850642877264, + "loss": 0.2278, + "step": 1306 + }, + { + "epoch": 0.26452135195304594, + "grad_norm": 0.46600160002708435, + "learning_rate": 0.00019157228847458443, + "loss": 0.258, + "step": 1307 + }, + { + "epoch": 0.2647237401335762, + "grad_norm": 0.4704112410545349, + "learning_rate": 0.00019155950339711918, + "loss": 0.2413, + "step": 1308 + }, + { + "epoch": 0.26492612831410645, + "grad_norm": 0.6043704748153687, + "learning_rate": 0.00019154670905662414, + "loss": 0.2561, + "step": 1309 + }, + { + "epoch": 0.2651285164946367, + "grad_norm": 0.3092261254787445, + "learning_rate": 0.00019153390545439362, + "loss": 0.2234, + "step": 1310 + }, + { + "epoch": 0.26533090467516696, + "grad_norm": 0.35605481266975403, + "learning_rate": 0.00019152109259172302, + "loss": 0.2598, + "step": 1311 + }, + { + "epoch": 0.2655332928556972, + "grad_norm": 0.375472754240036, + "learning_rate": 0.00019150827046990858, + "loss": 0.2651, + "step": 1312 + }, + { + "epoch": 0.26573568103622747, + "grad_norm": 0.41777515411376953, + "learning_rate": 0.00019149543909024753, + "loss": 0.2597, + "step": 1313 + }, + { + "epoch": 0.2659380692167577, + "grad_norm": 0.41457483172416687, + "learning_rate": 0.000191482598454038, + "loss": 0.2745, + "step": 1314 + }, + { + "epoch": 0.266140457397288, + "grad_norm": 0.3231428265571594, + "learning_rate": 0.00019146974856257905, + "loss": 0.2431, + "step": 1315 + }, + { + "epoch": 0.26634284557781823, + "grad_norm": 0.3488149046897888, + "learning_rate": 0.00019145688941717075, + "loss": 0.2137, + "step": 1316 + }, + { + "epoch": 0.2665452337583485, + "grad_norm": 0.42718151211738586, + "learning_rate": 0.000191444021019114, + "loss": 0.2431, + "step": 1317 + }, + { + "epoch": 0.26674762193887874, + "grad_norm": 0.27386337518692017, + "learning_rate": 0.00019143114336971073, + "loss": 0.2168, + "step": 1318 + }, + { + "epoch": 0.26695001011940905, + "grad_norm": 0.3161386251449585, + "learning_rate": 0.00019141825647026376, + "loss": 0.2723, + "step": 1319 + }, + { + "epoch": 0.2671523982999393, + "grad_norm": 0.27285152673721313, + "learning_rate": 0.0001914053603220768, + "loss": 0.2013, + "step": 1320 + }, + { + "epoch": 0.26735478648046956, + "grad_norm": 0.321103572845459, + "learning_rate": 0.00019139245492645466, + "loss": 0.2282, + "step": 1321 + }, + { + "epoch": 0.2675571746609998, + "grad_norm": 0.39927050471305847, + "learning_rate": 0.00019137954028470284, + "loss": 0.2403, + "step": 1322 + }, + { + "epoch": 0.2677595628415301, + "grad_norm": 0.4967322051525116, + "learning_rate": 0.00019136661639812798, + "loss": 0.2373, + "step": 1323 + }, + { + "epoch": 0.26796195102206033, + "grad_norm": 0.3009711503982544, + "learning_rate": 0.0001913536832680376, + "loss": 0.2455, + "step": 1324 + }, + { + "epoch": 0.2681643392025906, + "grad_norm": 0.27346163988113403, + "learning_rate": 0.00019134074089574007, + "loss": 0.2134, + "step": 1325 + }, + { + "epoch": 0.26836672738312084, + "grad_norm": 0.24805238842964172, + "learning_rate": 0.00019132778928254485, + "loss": 0.2317, + "step": 1326 + }, + { + "epoch": 0.2685691155636511, + "grad_norm": 0.251960426568985, + "learning_rate": 0.00019131482842976217, + "loss": 0.2462, + "step": 1327 + }, + { + "epoch": 0.26877150374418135, + "grad_norm": 0.34561920166015625, + "learning_rate": 0.0001913018583387033, + "loss": 0.2778, + "step": 1328 + }, + { + "epoch": 0.2689738919247116, + "grad_norm": 0.26779308915138245, + "learning_rate": 0.00019128887901068045, + "loss": 0.2451, + "step": 1329 + }, + { + "epoch": 0.26917628010524186, + "grad_norm": 0.28858324885368347, + "learning_rate": 0.00019127589044700668, + "loss": 0.2433, + "step": 1330 + }, + { + "epoch": 0.2693786682857721, + "grad_norm": 0.3663479685783386, + "learning_rate": 0.00019126289264899607, + "loss": 0.2349, + "step": 1331 + }, + { + "epoch": 0.26958105646630237, + "grad_norm": 0.35032492876052856, + "learning_rate": 0.0001912498856179636, + "loss": 0.241, + "step": 1332 + }, + { + "epoch": 0.2697834446468326, + "grad_norm": 0.26940011978149414, + "learning_rate": 0.00019123686935522516, + "loss": 0.2337, + "step": 1333 + }, + { + "epoch": 0.2699858328273629, + "grad_norm": 0.30461275577545166, + "learning_rate": 0.00019122384386209765, + "loss": 0.2457, + "step": 1334 + }, + { + "epoch": 0.27018822100789314, + "grad_norm": 0.27946141362190247, + "learning_rate": 0.0001912108091398988, + "loss": 0.2095, + "step": 1335 + }, + { + "epoch": 0.2703906091884234, + "grad_norm": 0.3439542353153229, + "learning_rate": 0.00019119776518994734, + "loss": 0.2673, + "step": 1336 + }, + { + "epoch": 0.27059299736895365, + "grad_norm": 0.6692569255828857, + "learning_rate": 0.00019118471201356291, + "loss": 0.2584, + "step": 1337 + }, + { + "epoch": 0.2707953855494839, + "grad_norm": 0.30140796303749084, + "learning_rate": 0.00019117164961206614, + "loss": 0.2286, + "step": 1338 + }, + { + "epoch": 0.27099777373001416, + "grad_norm": 0.3491653800010681, + "learning_rate": 0.0001911585779867785, + "loss": 0.2616, + "step": 1339 + }, + { + "epoch": 0.2712001619105444, + "grad_norm": 0.27138373255729675, + "learning_rate": 0.00019114549713902245, + "loss": 0.2468, + "step": 1340 + }, + { + "epoch": 0.27140255009107467, + "grad_norm": 0.41578251123428345, + "learning_rate": 0.0001911324070701214, + "loss": 0.2548, + "step": 1341 + }, + { + "epoch": 0.2716049382716049, + "grad_norm": 0.28390955924987793, + "learning_rate": 0.0001911193077813996, + "loss": 0.2046, + "step": 1342 + }, + { + "epoch": 0.2718073264521352, + "grad_norm": 0.32496026158332825, + "learning_rate": 0.00019110619927418238, + "loss": 0.2447, + "step": 1343 + }, + { + "epoch": 0.27200971463266543, + "grad_norm": 0.29900750517845154, + "learning_rate": 0.0001910930815497959, + "loss": 0.2665, + "step": 1344 + }, + { + "epoch": 0.2722121028131957, + "grad_norm": 0.441050261259079, + "learning_rate": 0.00019107995460956723, + "loss": 0.265, + "step": 1345 + }, + { + "epoch": 0.27241449099372594, + "grad_norm": 0.3301616609096527, + "learning_rate": 0.00019106681845482444, + "loss": 0.2637, + "step": 1346 + }, + { + "epoch": 0.2726168791742562, + "grad_norm": 0.3114319443702698, + "learning_rate": 0.0001910536730868965, + "loss": 0.2217, + "step": 1347 + }, + { + "epoch": 0.27281926735478645, + "grad_norm": 0.38809117674827576, + "learning_rate": 0.00019104051850711334, + "loss": 0.2534, + "step": 1348 + }, + { + "epoch": 0.27302165553531677, + "grad_norm": 0.321119487285614, + "learning_rate": 0.0001910273547168058, + "loss": 0.2569, + "step": 1349 + }, + { + "epoch": 0.273224043715847, + "grad_norm": 0.2682242691516876, + "learning_rate": 0.00019101418171730568, + "loss": 0.2079, + "step": 1350 + }, + { + "epoch": 0.273224043715847, + "eval_loss": 0.2806507647037506, + "eval_runtime": 0.7397, + "eval_samples_per_second": 6.76, + "eval_steps_per_second": 1.352, + "step": 1350 + }, + { + "epoch": 0.2734264318963773, + "grad_norm": 0.32450467348098755, + "learning_rate": 0.00019100099950994563, + "loss": 0.2446, + "step": 1351 + }, + { + "epoch": 0.27362882007690753, + "grad_norm": 0.3531615138053894, + "learning_rate": 0.00019098780809605933, + "loss": 0.2664, + "step": 1352 + }, + { + "epoch": 0.2738312082574378, + "grad_norm": 0.4027129113674164, + "learning_rate": 0.0001909746074769813, + "loss": 0.2531, + "step": 1353 + }, + { + "epoch": 0.27403359643796804, + "grad_norm": 0.27756667137145996, + "learning_rate": 0.0001909613976540471, + "loss": 0.2147, + "step": 1354 + }, + { + "epoch": 0.2742359846184983, + "grad_norm": 0.3009014427661896, + "learning_rate": 0.00019094817862859312, + "loss": 0.2544, + "step": 1355 + }, + { + "epoch": 0.27443837279902855, + "grad_norm": 0.3375817537307739, + "learning_rate": 0.00019093495040195673, + "loss": 0.2391, + "step": 1356 + }, + { + "epoch": 0.2746407609795588, + "grad_norm": 0.2892801761627197, + "learning_rate": 0.0001909217129754762, + "loss": 0.2123, + "step": 1357 + }, + { + "epoch": 0.27484314916008906, + "grad_norm": 0.3353901207447052, + "learning_rate": 0.00019090846635049085, + "loss": 0.2591, + "step": 1358 + }, + { + "epoch": 0.2750455373406193, + "grad_norm": 0.32220199704170227, + "learning_rate": 0.00019089521052834073, + "loss": 0.2508, + "step": 1359 + }, + { + "epoch": 0.2752479255211496, + "grad_norm": 0.3636913299560547, + "learning_rate": 0.00019088194551036695, + "loss": 0.2253, + "step": 1360 + }, + { + "epoch": 0.2754503137016798, + "grad_norm": 0.32039812207221985, + "learning_rate": 0.00019086867129791154, + "loss": 0.2578, + "step": 1361 + }, + { + "epoch": 0.2756527018822101, + "grad_norm": 0.2603769302368164, + "learning_rate": 0.00019085538789231742, + "loss": 0.242, + "step": 1362 + }, + { + "epoch": 0.27585509006274034, + "grad_norm": 0.34952616691589355, + "learning_rate": 0.00019084209529492854, + "loss": 0.2729, + "step": 1363 + }, + { + "epoch": 0.2760574782432706, + "grad_norm": 0.40078848600387573, + "learning_rate": 0.0001908287935070896, + "loss": 0.2578, + "step": 1364 + }, + { + "epoch": 0.27625986642380085, + "grad_norm": 0.3610617518424988, + "learning_rate": 0.00019081548253014642, + "loss": 0.2614, + "step": 1365 + }, + { + "epoch": 0.2764622546043311, + "grad_norm": 0.3041462004184723, + "learning_rate": 0.00019080216236544567, + "loss": 0.2666, + "step": 1366 + }, + { + "epoch": 0.27666464278486136, + "grad_norm": 0.28576037287712097, + "learning_rate": 0.00019078883301433484, + "loss": 0.232, + "step": 1367 + }, + { + "epoch": 0.2768670309653916, + "grad_norm": 0.40742477774620056, + "learning_rate": 0.00019077549447816256, + "loss": 0.241, + "step": 1368 + }, + { + "epoch": 0.27706941914592187, + "grad_norm": 0.5294364094734192, + "learning_rate": 0.00019076214675827825, + "loss": 0.2099, + "step": 1369 + }, + { + "epoch": 0.2772718073264521, + "grad_norm": 0.32854121923446655, + "learning_rate": 0.00019074878985603227, + "loss": 0.2925, + "step": 1370 + }, + { + "epoch": 0.2774741955069824, + "grad_norm": 0.37988781929016113, + "learning_rate": 0.00019073542377277597, + "loss": 0.2368, + "step": 1371 + }, + { + "epoch": 0.27767658368751263, + "grad_norm": 0.30840012431144714, + "learning_rate": 0.00019072204850986154, + "loss": 0.2437, + "step": 1372 + }, + { + "epoch": 0.2778789718680429, + "grad_norm": 0.3330319821834564, + "learning_rate": 0.0001907086640686422, + "loss": 0.2713, + "step": 1373 + }, + { + "epoch": 0.27808136004857315, + "grad_norm": 0.31399691104888916, + "learning_rate": 0.000190695270450472, + "loss": 0.2816, + "step": 1374 + }, + { + "epoch": 0.2782837482291034, + "grad_norm": 0.3240007162094116, + "learning_rate": 0.000190681867656706, + "loss": 0.2752, + "step": 1375 + }, + { + "epoch": 0.27848613640963366, + "grad_norm": 0.3109317421913147, + "learning_rate": 0.00019066845568870014, + "loss": 0.2277, + "step": 1376 + }, + { + "epoch": 0.2786885245901639, + "grad_norm": 0.34070509672164917, + "learning_rate": 0.00019065503454781133, + "loss": 0.2441, + "step": 1377 + }, + { + "epoch": 0.27889091277069417, + "grad_norm": 0.42925670742988586, + "learning_rate": 0.00019064160423539733, + "loss": 0.2751, + "step": 1378 + }, + { + "epoch": 0.2790933009512244, + "grad_norm": 0.3259209990501404, + "learning_rate": 0.00019062816475281692, + "loss": 0.2704, + "step": 1379 + }, + { + "epoch": 0.27929568913175473, + "grad_norm": 0.36306875944137573, + "learning_rate": 0.00019061471610142976, + "loss": 0.2935, + "step": 1380 + }, + { + "epoch": 0.279498077312285, + "grad_norm": 0.3489514887332916, + "learning_rate": 0.00019060125828259641, + "loss": 0.2218, + "step": 1381 + }, + { + "epoch": 0.27970046549281524, + "grad_norm": 0.27014103531837463, + "learning_rate": 0.00019058779129767843, + "loss": 0.2545, + "step": 1382 + }, + { + "epoch": 0.2799028536733455, + "grad_norm": 0.30571702122688293, + "learning_rate": 0.00019057431514803825, + "loss": 0.2527, + "step": 1383 + }, + { + "epoch": 0.28010524185387575, + "grad_norm": 0.3467295169830322, + "learning_rate": 0.00019056082983503924, + "loss": 0.239, + "step": 1384 + }, + { + "epoch": 0.280307630034406, + "grad_norm": 0.301474392414093, + "learning_rate": 0.00019054733536004575, + "loss": 0.2154, + "step": 1385 + }, + { + "epoch": 0.28051001821493626, + "grad_norm": 0.3737625479698181, + "learning_rate": 0.00019053383172442292, + "loss": 0.2965, + "step": 1386 + }, + { + "epoch": 0.2807124063954665, + "grad_norm": 0.3446069359779358, + "learning_rate": 0.00019052031892953698, + "loss": 0.2356, + "step": 1387 + }, + { + "epoch": 0.2809147945759968, + "grad_norm": 0.3060499429702759, + "learning_rate": 0.000190506796976755, + "loss": 0.2277, + "step": 1388 + }, + { + "epoch": 0.28111718275652703, + "grad_norm": 0.3208872377872467, + "learning_rate": 0.00019049326586744497, + "loss": 0.2256, + "step": 1389 + }, + { + "epoch": 0.2813195709370573, + "grad_norm": 0.5294331312179565, + "learning_rate": 0.00019047972560297583, + "loss": 0.264, + "step": 1390 + }, + { + "epoch": 0.28152195911758754, + "grad_norm": 0.5644270181655884, + "learning_rate": 0.00019046617618471745, + "loss": 0.2347, + "step": 1391 + }, + { + "epoch": 0.2817243472981178, + "grad_norm": 0.31018948554992676, + "learning_rate": 0.0001904526176140406, + "loss": 0.2504, + "step": 1392 + }, + { + "epoch": 0.28192673547864805, + "grad_norm": 0.30512624979019165, + "learning_rate": 0.00019043904989231701, + "loss": 0.2609, + "step": 1393 + }, + { + "epoch": 0.2821291236591783, + "grad_norm": 0.28403523564338684, + "learning_rate": 0.00019042547302091934, + "loss": 0.2432, + "step": 1394 + }, + { + "epoch": 0.28233151183970856, + "grad_norm": 0.31579962372779846, + "learning_rate": 0.00019041188700122112, + "loss": 0.2721, + "step": 1395 + }, + { + "epoch": 0.2825339000202388, + "grad_norm": 0.3145096004009247, + "learning_rate": 0.00019039829183459687, + "loss": 0.2439, + "step": 1396 + }, + { + "epoch": 0.28273628820076907, + "grad_norm": 0.40980634093284607, + "learning_rate": 0.00019038468752242198, + "loss": 0.2439, + "step": 1397 + }, + { + "epoch": 0.2829386763812993, + "grad_norm": 0.47898849844932556, + "learning_rate": 0.0001903710740660728, + "loss": 0.2627, + "step": 1398 + }, + { + "epoch": 0.2831410645618296, + "grad_norm": 0.2893619239330292, + "learning_rate": 0.00019035745146692658, + "loss": 0.2028, + "step": 1399 + }, + { + "epoch": 0.28334345274235984, + "grad_norm": 0.2524210512638092, + "learning_rate": 0.00019034381972636157, + "loss": 0.2382, + "step": 1400 + }, + { + "epoch": 0.28334345274235984, + "eval_loss": 0.28519296646118164, + "eval_runtime": 0.736, + "eval_samples_per_second": 6.794, + "eval_steps_per_second": 1.359, + "step": 1400 + }, + { + "epoch": 0.2835458409228901, + "grad_norm": 0.3225509524345398, + "learning_rate": 0.0001903301788457568, + "loss": 0.2582, + "step": 1401 + }, + { + "epoch": 0.28374822910342035, + "grad_norm": 0.5756785273551941, + "learning_rate": 0.00019031652882649241, + "loss": 0.2918, + "step": 1402 + }, + { + "epoch": 0.2839506172839506, + "grad_norm": 0.35944122076034546, + "learning_rate": 0.00019030286966994928, + "loss": 0.271, + "step": 1403 + }, + { + "epoch": 0.28415300546448086, + "grad_norm": 0.25696319341659546, + "learning_rate": 0.00019028920137750935, + "loss": 0.2162, + "step": 1404 + }, + { + "epoch": 0.2843553936450111, + "grad_norm": 0.41459545493125916, + "learning_rate": 0.00019027552395055542, + "loss": 0.276, + "step": 1405 + }, + { + "epoch": 0.28455778182554137, + "grad_norm": 0.33863842487335205, + "learning_rate": 0.0001902618373904712, + "loss": 0.2558, + "step": 1406 + }, + { + "epoch": 0.2847601700060716, + "grad_norm": 0.3595339059829712, + "learning_rate": 0.0001902481416986414, + "loss": 0.2439, + "step": 1407 + }, + { + "epoch": 0.2849625581866019, + "grad_norm": 0.3099048137664795, + "learning_rate": 0.00019023443687645158, + "loss": 0.2133, + "step": 1408 + }, + { + "epoch": 0.28516494636713213, + "grad_norm": 0.3222504258155823, + "learning_rate": 0.00019022072292528827, + "loss": 0.2519, + "step": 1409 + }, + { + "epoch": 0.28536733454766244, + "grad_norm": 0.2719772458076477, + "learning_rate": 0.00019020699984653887, + "loss": 0.2447, + "step": 1410 + }, + { + "epoch": 0.2855697227281927, + "grad_norm": 0.35438042879104614, + "learning_rate": 0.00019019326764159176, + "loss": 0.2485, + "step": 1411 + }, + { + "epoch": 0.28577211090872295, + "grad_norm": 0.3307201862335205, + "learning_rate": 0.00019017952631183622, + "loss": 0.2474, + "step": 1412 + }, + { + "epoch": 0.2859744990892532, + "grad_norm": 0.32247617840766907, + "learning_rate": 0.00019016577585866245, + "loss": 0.2476, + "step": 1413 + }, + { + "epoch": 0.28617688726978346, + "grad_norm": 0.3016018867492676, + "learning_rate": 0.0001901520162834616, + "loss": 0.258, + "step": 1414 + }, + { + "epoch": 0.2863792754503137, + "grad_norm": 0.32061678171157837, + "learning_rate": 0.00019013824758762565, + "loss": 0.2602, + "step": 1415 + }, + { + "epoch": 0.286581663630844, + "grad_norm": 0.3942313492298126, + "learning_rate": 0.00019012446977254767, + "loss": 0.2585, + "step": 1416 + }, + { + "epoch": 0.28678405181137423, + "grad_norm": 0.5417084097862244, + "learning_rate": 0.00019011068283962147, + "loss": 0.2643, + "step": 1417 + }, + { + "epoch": 0.2869864399919045, + "grad_norm": 0.5987349152565002, + "learning_rate": 0.0001900968867902419, + "loss": 0.2477, + "step": 1418 + }, + { + "epoch": 0.28718882817243474, + "grad_norm": 0.30098897218704224, + "learning_rate": 0.00019008308162580474, + "loss": 0.278, + "step": 1419 + }, + { + "epoch": 0.287391216352965, + "grad_norm": 0.27245810627937317, + "learning_rate": 0.0001900692673477066, + "loss": 0.2326, + "step": 1420 + }, + { + "epoch": 0.28759360453349525, + "grad_norm": 0.2615533173084259, + "learning_rate": 0.00019005544395734502, + "loss": 0.211, + "step": 1421 + }, + { + "epoch": 0.2877959927140255, + "grad_norm": 0.3715825378894806, + "learning_rate": 0.00019004161145611863, + "loss": 0.2878, + "step": 1422 + }, + { + "epoch": 0.28799838089455576, + "grad_norm": 0.35649946331977844, + "learning_rate": 0.00019002776984542675, + "loss": 0.2386, + "step": 1423 + }, + { + "epoch": 0.288200769075086, + "grad_norm": 0.3113993704319, + "learning_rate": 0.0001900139191266698, + "loss": 0.2379, + "step": 1424 + }, + { + "epoch": 0.28840315725561627, + "grad_norm": 0.3101493716239929, + "learning_rate": 0.00019000005930124898, + "loss": 0.2639, + "step": 1425 + }, + { + "epoch": 0.2886055454361465, + "grad_norm": 0.2791244387626648, + "learning_rate": 0.00018998619037056654, + "loss": 0.2283, + "step": 1426 + }, + { + "epoch": 0.2888079336166768, + "grad_norm": 0.3837342858314514, + "learning_rate": 0.00018997231233602556, + "loss": 0.2673, + "step": 1427 + }, + { + "epoch": 0.28901032179720704, + "grad_norm": 0.39756709337234497, + "learning_rate": 0.00018995842519903012, + "loss": 0.2601, + "step": 1428 + }, + { + "epoch": 0.2892127099777373, + "grad_norm": 0.25341853499412537, + "learning_rate": 0.0001899445289609851, + "loss": 0.1997, + "step": 1429 + }, + { + "epoch": 0.28941509815826755, + "grad_norm": 0.2866188585758209, + "learning_rate": 0.00018993062362329641, + "loss": 0.2308, + "step": 1430 + }, + { + "epoch": 0.2896174863387978, + "grad_norm": 0.34560084342956543, + "learning_rate": 0.0001899167091873709, + "loss": 0.2422, + "step": 1431 + }, + { + "epoch": 0.28981987451932806, + "grad_norm": 0.3137929439544678, + "learning_rate": 0.00018990278565461622, + "loss": 0.2295, + "step": 1432 + }, + { + "epoch": 0.2900222626998583, + "grad_norm": 0.35084256529808044, + "learning_rate": 0.00018988885302644102, + "loss": 0.303, + "step": 1433 + }, + { + "epoch": 0.29022465088038857, + "grad_norm": 0.36562684178352356, + "learning_rate": 0.00018987491130425488, + "loss": 0.2403, + "step": 1434 + }, + { + "epoch": 0.2904270390609188, + "grad_norm": 0.2591904103755951, + "learning_rate": 0.00018986096048946824, + "loss": 0.2006, + "step": 1435 + }, + { + "epoch": 0.2906294272414491, + "grad_norm": 0.4177074432373047, + "learning_rate": 0.00018984700058349252, + "loss": 0.2805, + "step": 1436 + }, + { + "epoch": 0.29083181542197933, + "grad_norm": 0.5934389233589172, + "learning_rate": 0.00018983303158774003, + "loss": 0.2531, + "step": 1437 + }, + { + "epoch": 0.2910342036025096, + "grad_norm": 0.3575098216533661, + "learning_rate": 0.00018981905350362404, + "loss": 0.2335, + "step": 1438 + }, + { + "epoch": 0.29123659178303984, + "grad_norm": 0.38351020216941833, + "learning_rate": 0.00018980506633255864, + "loss": 0.2444, + "step": 1439 + }, + { + "epoch": 0.29143897996357016, + "grad_norm": 0.35039186477661133, + "learning_rate": 0.00018979107007595895, + "loss": 0.2598, + "step": 1440 + }, + { + "epoch": 0.2916413681441004, + "grad_norm": 0.297146737575531, + "learning_rate": 0.000189777064735241, + "loss": 0.2332, + "step": 1441 + }, + { + "epoch": 0.29184375632463067, + "grad_norm": 0.39954692125320435, + "learning_rate": 0.0001897630503118216, + "loss": 0.249, + "step": 1442 + }, + { + "epoch": 0.2920461445051609, + "grad_norm": 0.549950122833252, + "learning_rate": 0.0001897490268071187, + "loss": 0.2489, + "step": 1443 + }, + { + "epoch": 0.2922485326856912, + "grad_norm": 0.3395889103412628, + "learning_rate": 0.00018973499422255094, + "loss": 0.2877, + "step": 1444 + }, + { + "epoch": 0.29245092086622143, + "grad_norm": 0.26225653290748596, + "learning_rate": 0.00018972095255953808, + "loss": 0.2412, + "step": 1445 + }, + { + "epoch": 0.2926533090467517, + "grad_norm": 0.4954787790775299, + "learning_rate": 0.00018970690181950066, + "loss": 0.2555, + "step": 1446 + }, + { + "epoch": 0.29285569722728194, + "grad_norm": 0.3388492166996002, + "learning_rate": 0.0001896928420038602, + "loss": 0.2505, + "step": 1447 + }, + { + "epoch": 0.2930580854078122, + "grad_norm": 0.5826833844184875, + "learning_rate": 0.00018967877311403913, + "loss": 0.2618, + "step": 1448 + }, + { + "epoch": 0.29326047358834245, + "grad_norm": 0.4034349024295807, + "learning_rate": 0.00018966469515146076, + "loss": 0.2711, + "step": 1449 + }, + { + "epoch": 0.2934628617688727, + "grad_norm": 0.468124121427536, + "learning_rate": 0.00018965060811754937, + "loss": 0.268, + "step": 1450 + }, + { + "epoch": 0.2934628617688727, + "eval_loss": 0.29375138878822327, + "eval_runtime": 0.738, + "eval_samples_per_second": 6.775, + "eval_steps_per_second": 1.355, + "step": 1450 + }, + { + "epoch": 0.29366524994940296, + "grad_norm": 0.4334132671356201, + "learning_rate": 0.00018963651201373019, + "loss": 0.2469, + "step": 1451 + }, + { + "epoch": 0.2938676381299332, + "grad_norm": 0.3310418426990509, + "learning_rate": 0.00018962240684142922, + "loss": 0.2579, + "step": 1452 + }, + { + "epoch": 0.2940700263104635, + "grad_norm": 0.3076673150062561, + "learning_rate": 0.00018960829260207356, + "loss": 0.2619, + "step": 1453 + }, + { + "epoch": 0.29427241449099373, + "grad_norm": 0.3559109568595886, + "learning_rate": 0.0001895941692970911, + "loss": 0.2869, + "step": 1454 + }, + { + "epoch": 0.294474802671524, + "grad_norm": 0.5222853422164917, + "learning_rate": 0.00018958003692791066, + "loss": 0.2916, + "step": 1455 + }, + { + "epoch": 0.29467719085205424, + "grad_norm": 0.3845933675765991, + "learning_rate": 0.00018956589549596207, + "loss": 0.2368, + "step": 1456 + }, + { + "epoch": 0.2948795790325845, + "grad_norm": 0.3979237973690033, + "learning_rate": 0.00018955174500267594, + "loss": 0.3314, + "step": 1457 + }, + { + "epoch": 0.29508196721311475, + "grad_norm": 0.35677894949913025, + "learning_rate": 0.00018953758544948393, + "loss": 0.252, + "step": 1458 + }, + { + "epoch": 0.295284355393645, + "grad_norm": 0.2729237377643585, + "learning_rate": 0.00018952341683781856, + "loss": 0.2204, + "step": 1459 + }, + { + "epoch": 0.29548674357417526, + "grad_norm": 0.2631382942199707, + "learning_rate": 0.0001895092391691132, + "loss": 0.2401, + "step": 1460 + }, + { + "epoch": 0.2956891317547055, + "grad_norm": 0.33820971846580505, + "learning_rate": 0.00018949505244480225, + "loss": 0.2214, + "step": 1461 + }, + { + "epoch": 0.29589151993523577, + "grad_norm": 0.3907553255558014, + "learning_rate": 0.00018948085666632092, + "loss": 0.2483, + "step": 1462 + }, + { + "epoch": 0.296093908115766, + "grad_norm": 0.6589711904525757, + "learning_rate": 0.00018946665183510546, + "loss": 0.2595, + "step": 1463 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 0.3188416659832001, + "learning_rate": 0.00018945243795259292, + "loss": 0.2417, + "step": 1464 + }, + { + "epoch": 0.29649868447682654, + "grad_norm": 0.37853628396987915, + "learning_rate": 0.0001894382150202213, + "loss": 0.2763, + "step": 1465 + }, + { + "epoch": 0.2967010726573568, + "grad_norm": 0.3539217710494995, + "learning_rate": 0.00018942398303942957, + "loss": 0.2592, + "step": 1466 + }, + { + "epoch": 0.29690346083788705, + "grad_norm": 0.4117416441440582, + "learning_rate": 0.00018940974201165755, + "loss": 0.2304, + "step": 1467 + }, + { + "epoch": 0.2971058490184173, + "grad_norm": 0.3291616141796112, + "learning_rate": 0.00018939549193834601, + "loss": 0.2513, + "step": 1468 + }, + { + "epoch": 0.29730823719894756, + "grad_norm": 0.3089660704135895, + "learning_rate": 0.00018938123282093657, + "loss": 0.2, + "step": 1469 + }, + { + "epoch": 0.2975106253794778, + "grad_norm": 0.33169400691986084, + "learning_rate": 0.0001893669646608719, + "loss": 0.2808, + "step": 1470 + }, + { + "epoch": 0.2977130135600081, + "grad_norm": 0.26605546474456787, + "learning_rate": 0.00018935268745959543, + "loss": 0.2435, + "step": 1471 + }, + { + "epoch": 0.2979154017405384, + "grad_norm": 0.31251972913742065, + "learning_rate": 0.00018933840121855165, + "loss": 0.2528, + "step": 1472 + }, + { + "epoch": 0.29811778992106863, + "grad_norm": 0.309332937002182, + "learning_rate": 0.00018932410593918583, + "loss": 0.2659, + "step": 1473 + }, + { + "epoch": 0.2983201781015989, + "grad_norm": 0.29334747791290283, + "learning_rate": 0.00018930980162294424, + "loss": 0.2319, + "step": 1474 + }, + { + "epoch": 0.29852256628212914, + "grad_norm": 0.3506908416748047, + "learning_rate": 0.00018929548827127402, + "loss": 0.2367, + "step": 1475 + }, + { + "epoch": 0.2987249544626594, + "grad_norm": 0.2785925567150116, + "learning_rate": 0.00018928116588562332, + "loss": 0.2221, + "step": 1476 + }, + { + "epoch": 0.29892734264318965, + "grad_norm": 0.33908000588417053, + "learning_rate": 0.00018926683446744103, + "loss": 0.2646, + "step": 1477 + }, + { + "epoch": 0.2991297308237199, + "grad_norm": 0.33341488242149353, + "learning_rate": 0.0001892524940181771, + "loss": 0.268, + "step": 1478 + }, + { + "epoch": 0.29933211900425016, + "grad_norm": 0.31938436627388, + "learning_rate": 0.00018923814453928234, + "loss": 0.2757, + "step": 1479 + }, + { + "epoch": 0.2995345071847804, + "grad_norm": 0.31264033913612366, + "learning_rate": 0.0001892237860322085, + "loss": 0.225, + "step": 1480 + }, + { + "epoch": 0.2997368953653107, + "grad_norm": 0.43807071447372437, + "learning_rate": 0.00018920941849840815, + "loss": 0.2412, + "step": 1481 + }, + { + "epoch": 0.29993928354584093, + "grad_norm": 0.32840967178344727, + "learning_rate": 0.00018919504193933495, + "loss": 0.2412, + "step": 1482 + }, + { + "epoch": 0.3001416717263712, + "grad_norm": 0.3693227171897888, + "learning_rate": 0.00018918065635644332, + "loss": 0.2632, + "step": 1483 + }, + { + "epoch": 0.30034405990690144, + "grad_norm": 0.43262529373168945, + "learning_rate": 0.00018916626175118862, + "loss": 0.2599, + "step": 1484 + }, + { + "epoch": 0.3005464480874317, + "grad_norm": 0.39893639087677, + "learning_rate": 0.00018915185812502715, + "loss": 0.2373, + "step": 1485 + }, + { + "epoch": 0.30074883626796195, + "grad_norm": 0.30610501766204834, + "learning_rate": 0.0001891374454794162, + "loss": 0.2421, + "step": 1486 + }, + { + "epoch": 0.3009512244484922, + "grad_norm": 0.3462240993976593, + "learning_rate": 0.00018912302381581374, + "loss": 0.2514, + "step": 1487 + }, + { + "epoch": 0.30115361262902246, + "grad_norm": 0.37143474817276, + "learning_rate": 0.00018910859313567895, + "loss": 0.2465, + "step": 1488 + }, + { + "epoch": 0.3013560008095527, + "grad_norm": 0.3381304442882538, + "learning_rate": 0.0001890941534404717, + "loss": 0.2338, + "step": 1489 + }, + { + "epoch": 0.30155838899008297, + "grad_norm": 0.3870564103126526, + "learning_rate": 0.00018907970473165287, + "loss": 0.293, + "step": 1490 + }, + { + "epoch": 0.3017607771706132, + "grad_norm": 0.44966599345207214, + "learning_rate": 0.00018906524701068418, + "loss": 0.2567, + "step": 1491 + }, + { + "epoch": 0.3019631653511435, + "grad_norm": 0.39895737171173096, + "learning_rate": 0.00018905078027902836, + "loss": 0.2578, + "step": 1492 + }, + { + "epoch": 0.30216555353167374, + "grad_norm": 0.46490851044654846, + "learning_rate": 0.000189036304538149, + "loss": 0.2288, + "step": 1493 + }, + { + "epoch": 0.302367941712204, + "grad_norm": 0.3627021312713623, + "learning_rate": 0.0001890218197895106, + "loss": 0.2695, + "step": 1494 + }, + { + "epoch": 0.30257032989273425, + "grad_norm": 0.29395443201065063, + "learning_rate": 0.00018900732603457855, + "loss": 0.2135, + "step": 1495 + }, + { + "epoch": 0.3027727180732645, + "grad_norm": 0.3824223577976227, + "learning_rate": 0.00018899282327481922, + "loss": 0.2511, + "step": 1496 + }, + { + "epoch": 0.30297510625379476, + "grad_norm": 0.31674912571907043, + "learning_rate": 0.00018897831151169984, + "loss": 0.2589, + "step": 1497 + }, + { + "epoch": 0.303177494434325, + "grad_norm": 0.34949105978012085, + "learning_rate": 0.00018896379074668848, + "loss": 0.2262, + "step": 1498 + }, + { + "epoch": 0.30337988261485527, + "grad_norm": 0.5163675546646118, + "learning_rate": 0.0001889492609812543, + "loss": 0.2443, + "step": 1499 + }, + { + "epoch": 0.3035822707953855, + "grad_norm": 0.29069405794143677, + "learning_rate": 0.00018893472221686723, + "loss": 0.2487, + "step": 1500 + }, + { + "epoch": 0.3035822707953855, + "eval_loss": 0.2916322648525238, + "eval_runtime": 0.7412, + "eval_samples_per_second": 6.746, + "eval_steps_per_second": 1.349, + "step": 1500 + }, + { + "epoch": 0.30378465897591583, + "grad_norm": 0.2935831546783447, + "learning_rate": 0.0001889201744549981, + "loss": 0.2856, + "step": 1501 + }, + { + "epoch": 0.3039870471564461, + "grad_norm": 0.35391557216644287, + "learning_rate": 0.0001889056176971188, + "loss": 0.284, + "step": 1502 + }, + { + "epoch": 0.30418943533697634, + "grad_norm": 0.3323562741279602, + "learning_rate": 0.000188891051944702, + "loss": 0.247, + "step": 1503 + }, + { + "epoch": 0.3043918235175066, + "grad_norm": 0.3300694227218628, + "learning_rate": 0.0001888764771992212, + "loss": 0.2302, + "step": 1504 + }, + { + "epoch": 0.30459421169803685, + "grad_norm": 0.3524026572704315, + "learning_rate": 0.00018886189346215107, + "loss": 0.234, + "step": 1505 + }, + { + "epoch": 0.3047965998785671, + "grad_norm": 0.4068450331687927, + "learning_rate": 0.00018884730073496698, + "loss": 0.2716, + "step": 1506 + }, + { + "epoch": 0.30499898805909736, + "grad_norm": 0.38134053349494934, + "learning_rate": 0.00018883269901914522, + "loss": 0.2431, + "step": 1507 + }, + { + "epoch": 0.3052013762396276, + "grad_norm": 0.3229370713233948, + "learning_rate": 0.00018881808831616313, + "loss": 0.2367, + "step": 1508 + }, + { + "epoch": 0.3054037644201579, + "grad_norm": 0.2964808940887451, + "learning_rate": 0.0001888034686274988, + "loss": 0.2336, + "step": 1509 + }, + { + "epoch": 0.30560615260068813, + "grad_norm": 0.3803769648075104, + "learning_rate": 0.00018878883995463133, + "loss": 0.2276, + "step": 1510 + }, + { + "epoch": 0.3058085407812184, + "grad_norm": 0.31368228793144226, + "learning_rate": 0.00018877420229904067, + "loss": 0.2578, + "step": 1511 + }, + { + "epoch": 0.30601092896174864, + "grad_norm": 0.30417075753211975, + "learning_rate": 0.00018875955566220772, + "loss": 0.251, + "step": 1512 + }, + { + "epoch": 0.3062133171422789, + "grad_norm": 0.3475414216518402, + "learning_rate": 0.00018874490004561426, + "loss": 0.2544, + "step": 1513 + }, + { + "epoch": 0.30641570532280915, + "grad_norm": 0.41898611187934875, + "learning_rate": 0.000188730235450743, + "loss": 0.2621, + "step": 1514 + }, + { + "epoch": 0.3066180935033394, + "grad_norm": 0.3886573910713196, + "learning_rate": 0.0001887155618790776, + "loss": 0.2168, + "step": 1515 + }, + { + "epoch": 0.30682048168386966, + "grad_norm": 0.4358440339565277, + "learning_rate": 0.0001887008793321025, + "loss": 0.2754, + "step": 1516 + }, + { + "epoch": 0.3070228698643999, + "grad_norm": 0.32611727714538574, + "learning_rate": 0.0001886861878113032, + "loss": 0.2672, + "step": 1517 + }, + { + "epoch": 0.30722525804493017, + "grad_norm": 0.3701517581939697, + "learning_rate": 0.00018867148731816592, + "loss": 0.2416, + "step": 1518 + }, + { + "epoch": 0.3074276462254604, + "grad_norm": 0.2883046567440033, + "learning_rate": 0.00018865677785417798, + "loss": 0.268, + "step": 1519 + }, + { + "epoch": 0.3076300344059907, + "grad_norm": 0.4058697819709778, + "learning_rate": 0.00018864205942082757, + "loss": 0.2511, + "step": 1520 + }, + { + "epoch": 0.30783242258652094, + "grad_norm": 0.286826491355896, + "learning_rate": 0.00018862733201960365, + "loss": 0.232, + "step": 1521 + }, + { + "epoch": 0.3080348107670512, + "grad_norm": 0.45446181297302246, + "learning_rate": 0.00018861259565199626, + "loss": 0.2903, + "step": 1522 + }, + { + "epoch": 0.30823719894758145, + "grad_norm": 0.24897044897079468, + "learning_rate": 0.00018859785031949625, + "loss": 0.2304, + "step": 1523 + }, + { + "epoch": 0.3084395871281117, + "grad_norm": 0.3912540078163147, + "learning_rate": 0.00018858309602359539, + "loss": 0.2663, + "step": 1524 + }, + { + "epoch": 0.30864197530864196, + "grad_norm": 0.3550991415977478, + "learning_rate": 0.00018856833276578635, + "loss": 0.2224, + "step": 1525 + }, + { + "epoch": 0.3088443634891722, + "grad_norm": 0.323539137840271, + "learning_rate": 0.00018855356054756273, + "loss": 0.2861, + "step": 1526 + }, + { + "epoch": 0.30904675166970247, + "grad_norm": 0.32665151357650757, + "learning_rate": 0.00018853877937041906, + "loss": 0.2436, + "step": 1527 + }, + { + "epoch": 0.3092491398502327, + "grad_norm": 0.373546838760376, + "learning_rate": 0.00018852398923585072, + "loss": 0.2673, + "step": 1528 + }, + { + "epoch": 0.309451528030763, + "grad_norm": 0.36496198177337646, + "learning_rate": 0.000188509190145354, + "loss": 0.2105, + "step": 1529 + }, + { + "epoch": 0.30965391621129323, + "grad_norm": 0.34335947036743164, + "learning_rate": 0.00018849438210042613, + "loss": 0.2774, + "step": 1530 + }, + { + "epoch": 0.30985630439182354, + "grad_norm": 0.29991385340690613, + "learning_rate": 0.00018847956510256527, + "loss": 0.2592, + "step": 1531 + }, + { + "epoch": 0.3100586925723538, + "grad_norm": 0.36050549149513245, + "learning_rate": 0.00018846473915327041, + "loss": 0.2497, + "step": 1532 + }, + { + "epoch": 0.31026108075288406, + "grad_norm": 0.3393295109272003, + "learning_rate": 0.00018844990425404148, + "loss": 0.2647, + "step": 1533 + }, + { + "epoch": 0.3104634689334143, + "grad_norm": 0.39216938614845276, + "learning_rate": 0.00018843506040637934, + "loss": 0.2557, + "step": 1534 + }, + { + "epoch": 0.31066585711394457, + "grad_norm": 0.2916669249534607, + "learning_rate": 0.00018842020761178574, + "loss": 0.2188, + "step": 1535 + }, + { + "epoch": 0.3108682452944748, + "grad_norm": 0.35340821743011475, + "learning_rate": 0.0001884053458717633, + "loss": 0.2276, + "step": 1536 + }, + { + "epoch": 0.3110706334750051, + "grad_norm": 0.2852288782596588, + "learning_rate": 0.00018839047518781561, + "loss": 0.2531, + "step": 1537 + }, + { + "epoch": 0.31127302165553533, + "grad_norm": 0.4118358790874481, + "learning_rate": 0.0001883755955614471, + "loss": 0.2508, + "step": 1538 + }, + { + "epoch": 0.3114754098360656, + "grad_norm": 0.3326147198677063, + "learning_rate": 0.00018836070699416313, + "loss": 0.2347, + "step": 1539 + }, + { + "epoch": 0.31167779801659584, + "grad_norm": 0.42047885060310364, + "learning_rate": 0.00018834580948746997, + "loss": 0.2716, + "step": 1540 + }, + { + "epoch": 0.3118801861971261, + "grad_norm": 0.2908095121383667, + "learning_rate": 0.00018833090304287486, + "loss": 0.25, + "step": 1541 + }, + { + "epoch": 0.31208257437765635, + "grad_norm": 0.36547958850860596, + "learning_rate": 0.0001883159876618858, + "loss": 0.22, + "step": 1542 + }, + { + "epoch": 0.3122849625581866, + "grad_norm": 0.4909718632698059, + "learning_rate": 0.0001883010633460118, + "loss": 0.2657, + "step": 1543 + }, + { + "epoch": 0.31248735073871686, + "grad_norm": 0.34576284885406494, + "learning_rate": 0.00018828613009676276, + "loss": 0.2642, + "step": 1544 + }, + { + "epoch": 0.3126897389192471, + "grad_norm": 0.3441828191280365, + "learning_rate": 0.0001882711879156494, + "loss": 0.2393, + "step": 1545 + }, + { + "epoch": 0.3128921270997774, + "grad_norm": 0.24068088829517365, + "learning_rate": 0.00018825623680418353, + "loss": 0.2012, + "step": 1546 + }, + { + "epoch": 0.31309451528030763, + "grad_norm": 0.24355578422546387, + "learning_rate": 0.00018824127676387765, + "loss": 0.2314, + "step": 1547 + }, + { + "epoch": 0.3132969034608379, + "grad_norm": 0.3701528310775757, + "learning_rate": 0.00018822630779624528, + "loss": 0.2473, + "step": 1548 + }, + { + "epoch": 0.31349929164136814, + "grad_norm": 0.3159331679344177, + "learning_rate": 0.00018821132990280086, + "loss": 0.2086, + "step": 1549 + }, + { + "epoch": 0.3137016798218984, + "grad_norm": 0.5335456728935242, + "learning_rate": 0.00018819634308505964, + "loss": 0.3292, + "step": 1550 + }, + { + "epoch": 0.3137016798218984, + "eval_loss": 0.29419490694999695, + "eval_runtime": 0.738, + "eval_samples_per_second": 6.775, + "eval_steps_per_second": 1.355, + "step": 1550 + }, + { + "epoch": 0.31390406800242865, + "grad_norm": 0.3889347314834595, + "learning_rate": 0.0001881813473445379, + "loss": 0.2466, + "step": 1551 + }, + { + "epoch": 0.3141064561829589, + "grad_norm": 0.3013499081134796, + "learning_rate": 0.00018816634268275267, + "loss": 0.2538, + "step": 1552 + }, + { + "epoch": 0.31430884436348916, + "grad_norm": 0.25156041979789734, + "learning_rate": 0.00018815132910122206, + "loss": 0.2323, + "step": 1553 + }, + { + "epoch": 0.3145112325440194, + "grad_norm": 0.3922179341316223, + "learning_rate": 0.00018813630660146488, + "loss": 0.2784, + "step": 1554 + }, + { + "epoch": 0.31471362072454967, + "grad_norm": 0.3269888162612915, + "learning_rate": 0.00018812127518500106, + "loss": 0.2898, + "step": 1555 + }, + { + "epoch": 0.3149160089050799, + "grad_norm": 0.3011750280857086, + "learning_rate": 0.00018810623485335118, + "loss": 0.2831, + "step": 1556 + }, + { + "epoch": 0.3151183970856102, + "grad_norm": 0.2626027762889862, + "learning_rate": 0.00018809118560803704, + "loss": 0.222, + "step": 1557 + }, + { + "epoch": 0.31532078526614044, + "grad_norm": 0.32995250821113586, + "learning_rate": 0.000188076127450581, + "loss": 0.2607, + "step": 1558 + }, + { + "epoch": 0.3155231734466707, + "grad_norm": 0.30816736817359924, + "learning_rate": 0.00018806106038250659, + "loss": 0.2375, + "step": 1559 + }, + { + "epoch": 0.31572556162720095, + "grad_norm": 0.3002629280090332, + "learning_rate": 0.00018804598440533808, + "loss": 0.2601, + "step": 1560 + }, + { + "epoch": 0.3159279498077312, + "grad_norm": 0.2814362347126007, + "learning_rate": 0.00018803089952060075, + "loss": 0.2373, + "step": 1561 + }, + { + "epoch": 0.3161303379882615, + "grad_norm": 0.28549739718437195, + "learning_rate": 0.00018801580572982068, + "loss": 0.2239, + "step": 1562 + }, + { + "epoch": 0.31633272616879177, + "grad_norm": 0.3680552542209625, + "learning_rate": 0.00018800070303452495, + "loss": 0.2394, + "step": 1563 + }, + { + "epoch": 0.316535114349322, + "grad_norm": 0.39102286100387573, + "learning_rate": 0.00018798559143624145, + "loss": 0.2526, + "step": 1564 + }, + { + "epoch": 0.3167375025298523, + "grad_norm": 0.26771053671836853, + "learning_rate": 0.00018797047093649903, + "loss": 0.2856, + "step": 1565 + }, + { + "epoch": 0.31693989071038253, + "grad_norm": 0.27007582783699036, + "learning_rate": 0.00018795534153682745, + "loss": 0.2416, + "step": 1566 + }, + { + "epoch": 0.3171422788909128, + "grad_norm": 0.32626280188560486, + "learning_rate": 0.0001879402032387573, + "loss": 0.2517, + "step": 1567 + }, + { + "epoch": 0.31734466707144304, + "grad_norm": 0.3350610136985779, + "learning_rate": 0.00018792505604382014, + "loss": 0.264, + "step": 1568 + }, + { + "epoch": 0.3175470552519733, + "grad_norm": 0.3124147653579712, + "learning_rate": 0.00018790989995354836, + "loss": 0.2744, + "step": 1569 + }, + { + "epoch": 0.31774944343250355, + "grad_norm": 0.34561726450920105, + "learning_rate": 0.0001878947349694754, + "loss": 0.2469, + "step": 1570 + }, + { + "epoch": 0.3179518316130338, + "grad_norm": 1.0323286056518555, + "learning_rate": 0.00018787956109313537, + "loss": 0.2873, + "step": 1571 + }, + { + "epoch": 0.31815421979356406, + "grad_norm": 0.34411680698394775, + "learning_rate": 0.00018786437832606347, + "loss": 0.2407, + "step": 1572 + }, + { + "epoch": 0.3183566079740943, + "grad_norm": 0.281716912984848, + "learning_rate": 0.00018784918666979575, + "loss": 0.2429, + "step": 1573 + }, + { + "epoch": 0.3185589961546246, + "grad_norm": 0.29189565777778625, + "learning_rate": 0.00018783398612586908, + "loss": 0.2099, + "step": 1574 + }, + { + "epoch": 0.31876138433515483, + "grad_norm": 0.41523846983909607, + "learning_rate": 0.00018781877669582132, + "loss": 0.2404, + "step": 1575 + }, + { + "epoch": 0.3189637725156851, + "grad_norm": 0.34226346015930176, + "learning_rate": 0.00018780355838119122, + "loss": 0.2631, + "step": 1576 + }, + { + "epoch": 0.31916616069621534, + "grad_norm": 0.40481576323509216, + "learning_rate": 0.0001877883311835184, + "loss": 0.2535, + "step": 1577 + }, + { + "epoch": 0.3193685488767456, + "grad_norm": 0.42217254638671875, + "learning_rate": 0.00018777309510434337, + "loss": 0.2723, + "step": 1578 + }, + { + "epoch": 0.31957093705727585, + "grad_norm": 0.3216110169887543, + "learning_rate": 0.00018775785014520758, + "loss": 0.2415, + "step": 1579 + }, + { + "epoch": 0.3197733252378061, + "grad_norm": 0.32365474104881287, + "learning_rate": 0.00018774259630765334, + "loss": 0.2317, + "step": 1580 + }, + { + "epoch": 0.31997571341833636, + "grad_norm": 0.32175514101982117, + "learning_rate": 0.00018772733359322387, + "loss": 0.2323, + "step": 1581 + }, + { + "epoch": 0.3201781015988666, + "grad_norm": 0.32359227538108826, + "learning_rate": 0.00018771206200346333, + "loss": 0.2867, + "step": 1582 + }, + { + "epoch": 0.32038048977939687, + "grad_norm": 0.4200432598590851, + "learning_rate": 0.00018769678153991669, + "loss": 0.2628, + "step": 1583 + }, + { + "epoch": 0.3205828779599271, + "grad_norm": 0.35818588733673096, + "learning_rate": 0.0001876814922041299, + "loss": 0.2187, + "step": 1584 + }, + { + "epoch": 0.3207852661404574, + "grad_norm": 0.3140599727630615, + "learning_rate": 0.00018766619399764972, + "loss": 0.2113, + "step": 1585 + }, + { + "epoch": 0.32098765432098764, + "grad_norm": 0.38742703199386597, + "learning_rate": 0.00018765088692202392, + "loss": 0.2689, + "step": 1586 + }, + { + "epoch": 0.3211900425015179, + "grad_norm": 0.3630368411540985, + "learning_rate": 0.00018763557097880112, + "loss": 0.2831, + "step": 1587 + }, + { + "epoch": 0.32139243068204815, + "grad_norm": 0.3636877238750458, + "learning_rate": 0.00018762024616953075, + "loss": 0.2455, + "step": 1588 + }, + { + "epoch": 0.3215948188625784, + "grad_norm": 0.292085736989975, + "learning_rate": 0.00018760491249576332, + "loss": 0.2404, + "step": 1589 + }, + { + "epoch": 0.32179720704310866, + "grad_norm": 0.33262398838996887, + "learning_rate": 0.00018758956995905, + "loss": 0.2235, + "step": 1590 + }, + { + "epoch": 0.3219995952236389, + "grad_norm": 0.4547630548477173, + "learning_rate": 0.00018757421856094314, + "loss": 0.2362, + "step": 1591 + }, + { + "epoch": 0.3222019834041692, + "grad_norm": 0.2708628475666046, + "learning_rate": 0.00018755885830299568, + "loss": 0.2371, + "step": 1592 + }, + { + "epoch": 0.3224043715846995, + "grad_norm": 0.3246055543422699, + "learning_rate": 0.00018754348918676174, + "loss": 0.2462, + "step": 1593 + }, + { + "epoch": 0.32260675976522973, + "grad_norm": 0.26867252588272095, + "learning_rate": 0.0001875281112137961, + "loss": 0.2465, + "step": 1594 + }, + { + "epoch": 0.32280914794576, + "grad_norm": 0.4057390093803406, + "learning_rate": 0.00018751272438565463, + "loss": 0.2211, + "step": 1595 + }, + { + "epoch": 0.32301153612629024, + "grad_norm": 0.3229082226753235, + "learning_rate": 0.00018749732870389392, + "loss": 0.269, + "step": 1596 + }, + { + "epoch": 0.3232139243068205, + "grad_norm": 0.23535144329071045, + "learning_rate": 0.00018748192417007164, + "loss": 0.2086, + "step": 1597 + }, + { + "epoch": 0.32341631248735075, + "grad_norm": 0.30807140469551086, + "learning_rate": 0.00018746651078574618, + "loss": 0.2339, + "step": 1598 + }, + { + "epoch": 0.323618700667881, + "grad_norm": 0.48715728521347046, + "learning_rate": 0.00018745108855247695, + "loss": 0.2279, + "step": 1599 + }, + { + "epoch": 0.32382108884841126, + "grad_norm": 0.3199789822101593, + "learning_rate": 0.00018743565747182417, + "loss": 0.2642, + "step": 1600 + }, + { + "epoch": 0.32382108884841126, + "eval_loss": 0.2936403155326843, + "eval_runtime": 0.7383, + "eval_samples_per_second": 6.772, + "eval_steps_per_second": 1.354, + "step": 1600 + }, + { + "epoch": 0.3240234770289415, + "grad_norm": 0.27700263261795044, + "learning_rate": 0.00018742021754534905, + "loss": 0.2183, + "step": 1601 + }, + { + "epoch": 0.3242258652094718, + "grad_norm": 0.32673951983451843, + "learning_rate": 0.00018740476877461356, + "loss": 0.2448, + "step": 1602 + }, + { + "epoch": 0.32442825339000203, + "grad_norm": 0.2943725287914276, + "learning_rate": 0.00018738931116118074, + "loss": 0.2474, + "step": 1603 + }, + { + "epoch": 0.3246306415705323, + "grad_norm": 0.3454664647579193, + "learning_rate": 0.00018737384470661437, + "loss": 0.2397, + "step": 1604 + }, + { + "epoch": 0.32483302975106254, + "grad_norm": 0.31171998381614685, + "learning_rate": 0.0001873583694124792, + "loss": 0.2565, + "step": 1605 + }, + { + "epoch": 0.3250354179315928, + "grad_norm": 0.2882143259048462, + "learning_rate": 0.00018734288528034085, + "loss": 0.2684, + "step": 1606 + }, + { + "epoch": 0.32523780611212305, + "grad_norm": 0.3082469403743744, + "learning_rate": 0.00018732739231176585, + "loss": 0.2119, + "step": 1607 + }, + { + "epoch": 0.3254401942926533, + "grad_norm": 0.3238343298435211, + "learning_rate": 0.00018731189050832158, + "loss": 0.214, + "step": 1608 + }, + { + "epoch": 0.32564258247318356, + "grad_norm": 0.27819204330444336, + "learning_rate": 0.00018729637987157643, + "loss": 0.2417, + "step": 1609 + }, + { + "epoch": 0.3258449706537138, + "grad_norm": 0.29245132207870483, + "learning_rate": 0.0001872808604030995, + "loss": 0.2403, + "step": 1610 + }, + { + "epoch": 0.32604735883424407, + "grad_norm": 0.3408953547477722, + "learning_rate": 0.000187265332104461, + "loss": 0.2475, + "step": 1611 + }, + { + "epoch": 0.3262497470147743, + "grad_norm": 0.32225704193115234, + "learning_rate": 0.00018724979497723184, + "loss": 0.2317, + "step": 1612 + }, + { + "epoch": 0.3264521351953046, + "grad_norm": 0.3903481066226959, + "learning_rate": 0.00018723424902298392, + "loss": 0.2989, + "step": 1613 + }, + { + "epoch": 0.32665452337583484, + "grad_norm": 0.3457467555999756, + "learning_rate": 0.00018721869424329003, + "loss": 0.266, + "step": 1614 + }, + { + "epoch": 0.3268569115563651, + "grad_norm": 0.29828596115112305, + "learning_rate": 0.0001872031306397238, + "loss": 0.2567, + "step": 1615 + }, + { + "epoch": 0.32705929973689535, + "grad_norm": 0.44933828711509705, + "learning_rate": 0.00018718755821385988, + "loss": 0.2429, + "step": 1616 + }, + { + "epoch": 0.3272616879174256, + "grad_norm": 0.3514581322669983, + "learning_rate": 0.00018717197696727366, + "loss": 0.2696, + "step": 1617 + }, + { + "epoch": 0.32746407609795586, + "grad_norm": 0.2809898853302002, + "learning_rate": 0.00018715638690154144, + "loss": 0.2069, + "step": 1618 + }, + { + "epoch": 0.3276664642784861, + "grad_norm": 0.36968308687210083, + "learning_rate": 0.00018714078801824059, + "loss": 0.2317, + "step": 1619 + }, + { + "epoch": 0.32786885245901637, + "grad_norm": 0.41571712493896484, + "learning_rate": 0.0001871251803189491, + "loss": 0.2883, + "step": 1620 + }, + { + "epoch": 0.3280712406395466, + "grad_norm": 0.38158875703811646, + "learning_rate": 0.00018710956380524606, + "loss": 0.2533, + "step": 1621 + }, + { + "epoch": 0.32827362882007693, + "grad_norm": 0.24912859499454498, + "learning_rate": 0.00018709393847871143, + "loss": 0.2285, + "step": 1622 + }, + { + "epoch": 0.3284760170006072, + "grad_norm": 0.3398696184158325, + "learning_rate": 0.00018707830434092597, + "loss": 0.2558, + "step": 1623 + }, + { + "epoch": 0.32867840518113745, + "grad_norm": 0.3501795530319214, + "learning_rate": 0.00018706266139347134, + "loss": 0.2625, + "step": 1624 + }, + { + "epoch": 0.3288807933616677, + "grad_norm": 0.27285391092300415, + "learning_rate": 0.0001870470096379302, + "loss": 0.2384, + "step": 1625 + }, + { + "epoch": 0.32908318154219796, + "grad_norm": 0.31238171458244324, + "learning_rate": 0.00018703134907588597, + "loss": 0.2522, + "step": 1626 + }, + { + "epoch": 0.3292855697227282, + "grad_norm": 0.3718354105949402, + "learning_rate": 0.00018701567970892308, + "loss": 0.2297, + "step": 1627 + }, + { + "epoch": 0.32948795790325847, + "grad_norm": 0.34207820892333984, + "learning_rate": 0.00018700000153862675, + "loss": 0.3124, + "step": 1628 + }, + { + "epoch": 0.3296903460837887, + "grad_norm": 0.2899322211742401, + "learning_rate": 0.00018698431456658313, + "loss": 0.2466, + "step": 1629 + }, + { + "epoch": 0.329892734264319, + "grad_norm": 0.326610267162323, + "learning_rate": 0.00018696861879437932, + "loss": 0.2371, + "step": 1630 + }, + { + "epoch": 0.33009512244484923, + "grad_norm": 0.36713942885398865, + "learning_rate": 0.00018695291422360317, + "loss": 0.2841, + "step": 1631 + }, + { + "epoch": 0.3302975106253795, + "grad_norm": 0.3396678864955902, + "learning_rate": 0.00018693720085584357, + "loss": 0.2434, + "step": 1632 + }, + { + "epoch": 0.33049989880590974, + "grad_norm": 0.32362422347068787, + "learning_rate": 0.0001869214786926902, + "loss": 0.2558, + "step": 1633 + }, + { + "epoch": 0.33070228698644, + "grad_norm": 0.2856462895870209, + "learning_rate": 0.00018690574773573367, + "loss": 0.2628, + "step": 1634 + }, + { + "epoch": 0.33090467516697025, + "grad_norm": 0.36801108717918396, + "learning_rate": 0.00018689000798656545, + "loss": 0.2754, + "step": 1635 + }, + { + "epoch": 0.3311070633475005, + "grad_norm": 0.35658085346221924, + "learning_rate": 0.000186874259446778, + "loss": 0.2912, + "step": 1636 + }, + { + "epoch": 0.33130945152803076, + "grad_norm": 0.3088845908641815, + "learning_rate": 0.0001868585021179645, + "loss": 0.2615, + "step": 1637 + }, + { + "epoch": 0.331511839708561, + "grad_norm": 0.3361416757106781, + "learning_rate": 0.00018684273600171918, + "loss": 0.2523, + "step": 1638 + }, + { + "epoch": 0.3317142278890913, + "grad_norm": 0.4012823700904846, + "learning_rate": 0.00018682696109963704, + "loss": 0.263, + "step": 1639 + }, + { + "epoch": 0.33191661606962153, + "grad_norm": 0.30794715881347656, + "learning_rate": 0.00018681117741331407, + "loss": 0.2548, + "step": 1640 + }, + { + "epoch": 0.3321190042501518, + "grad_norm": 0.539105236530304, + "learning_rate": 0.00018679538494434703, + "loss": 0.2455, + "step": 1641 + }, + { + "epoch": 0.33232139243068204, + "grad_norm": 0.2805461287498474, + "learning_rate": 0.0001867795836943337, + "loss": 0.2171, + "step": 1642 + }, + { + "epoch": 0.3325237806112123, + "grad_norm": 0.28716808557510376, + "learning_rate": 0.00018676377366487265, + "loss": 0.261, + "step": 1643 + }, + { + "epoch": 0.33272616879174255, + "grad_norm": 0.23502685129642487, + "learning_rate": 0.00018674795485756337, + "loss": 0.2327, + "step": 1644 + }, + { + "epoch": 0.3329285569722728, + "grad_norm": 0.37822720408439636, + "learning_rate": 0.0001867321272740063, + "loss": 0.2482, + "step": 1645 + }, + { + "epoch": 0.33313094515280306, + "grad_norm": 0.2823399305343628, + "learning_rate": 0.00018671629091580262, + "loss": 0.251, + "step": 1646 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.34142354130744934, + "learning_rate": 0.00018670044578455455, + "loss": 0.2245, + "step": 1647 + }, + { + "epoch": 0.33353572151386357, + "grad_norm": 0.2987143099308014, + "learning_rate": 0.0001866845918818651, + "loss": 0.2458, + "step": 1648 + }, + { + "epoch": 0.3337381096943938, + "grad_norm": 0.4302227199077606, + "learning_rate": 0.00018666872920933823, + "loss": 0.2637, + "step": 1649 + }, + { + "epoch": 0.3339404978749241, + "grad_norm": 0.8706358671188354, + "learning_rate": 0.0001866528577685787, + "loss": 0.2845, + "step": 1650 + }, + { + "epoch": 0.3339404978749241, + "eval_loss": 0.27961453795433044, + "eval_runtime": 0.7359, + "eval_samples_per_second": 6.794, + "eval_steps_per_second": 1.359, + "step": 1650 + }, + { + "epoch": 0.33414288605545434, + "grad_norm": 0.28847336769104004, + "learning_rate": 0.00018663697756119232, + "loss": 0.2259, + "step": 1651 + }, + { + "epoch": 0.3343452742359846, + "grad_norm": 0.3432278335094452, + "learning_rate": 0.00018662108858878557, + "loss": 0.2526, + "step": 1652 + }, + { + "epoch": 0.3345476624165149, + "grad_norm": 0.4454701244831085, + "learning_rate": 0.000186605190852966, + "loss": 0.2525, + "step": 1653 + }, + { + "epoch": 0.33475005059704516, + "grad_norm": 0.498799204826355, + "learning_rate": 0.00018658928435534198, + "loss": 0.2523, + "step": 1654 + }, + { + "epoch": 0.3349524387775754, + "grad_norm": 0.3040686547756195, + "learning_rate": 0.00018657336909752272, + "loss": 0.2291, + "step": 1655 + }, + { + "epoch": 0.33515482695810567, + "grad_norm": 0.2661318778991699, + "learning_rate": 0.00018655744508111837, + "loss": 0.1961, + "step": 1656 + }, + { + "epoch": 0.3353572151386359, + "grad_norm": 0.34656617045402527, + "learning_rate": 0.00018654151230774, + "loss": 0.2629, + "step": 1657 + }, + { + "epoch": 0.3355596033191662, + "grad_norm": 0.35558953881263733, + "learning_rate": 0.00018652557077899947, + "loss": 0.2895, + "step": 1658 + }, + { + "epoch": 0.33576199149969643, + "grad_norm": 0.2632910907268524, + "learning_rate": 0.00018650962049650955, + "loss": 0.2367, + "step": 1659 + }, + { + "epoch": 0.3359643796802267, + "grad_norm": 0.3103640675544739, + "learning_rate": 0.000186493661461884, + "loss": 0.2895, + "step": 1660 + }, + { + "epoch": 0.33616676786075694, + "grad_norm": 0.31030529737472534, + "learning_rate": 0.00018647769367673733, + "loss": 0.2499, + "step": 1661 + }, + { + "epoch": 0.3363691560412872, + "grad_norm": 0.2659652531147003, + "learning_rate": 0.00018646171714268504, + "loss": 0.2279, + "step": 1662 + }, + { + "epoch": 0.33657154422181745, + "grad_norm": 0.3482886254787445, + "learning_rate": 0.00018644573186134343, + "loss": 0.2635, + "step": 1663 + }, + { + "epoch": 0.3367739324023477, + "grad_norm": 0.39144811034202576, + "learning_rate": 0.00018642973783432974, + "loss": 0.2736, + "step": 1664 + }, + { + "epoch": 0.33697632058287796, + "grad_norm": 0.31855225563049316, + "learning_rate": 0.00018641373506326207, + "loss": 0.2553, + "step": 1665 + }, + { + "epoch": 0.3371787087634082, + "grad_norm": 0.30389824509620667, + "learning_rate": 0.0001863977235497594, + "loss": 0.2157, + "step": 1666 + }, + { + "epoch": 0.3373810969439385, + "grad_norm": 0.3267851769924164, + "learning_rate": 0.00018638170329544164, + "loss": 0.2581, + "step": 1667 + }, + { + "epoch": 0.33758348512446873, + "grad_norm": 0.3200203478336334, + "learning_rate": 0.00018636567430192953, + "loss": 0.2484, + "step": 1668 + }, + { + "epoch": 0.337785873304999, + "grad_norm": 0.2819439768791199, + "learning_rate": 0.00018634963657084472, + "loss": 0.2496, + "step": 1669 + }, + { + "epoch": 0.33798826148552924, + "grad_norm": 0.3534572720527649, + "learning_rate": 0.00018633359010380974, + "loss": 0.2551, + "step": 1670 + }, + { + "epoch": 0.3381906496660595, + "grad_norm": 0.3555509150028229, + "learning_rate": 0.000186317534902448, + "loss": 0.2806, + "step": 1671 + }, + { + "epoch": 0.33839303784658975, + "grad_norm": 0.46702417731285095, + "learning_rate": 0.00018630147096838378, + "loss": 0.2207, + "step": 1672 + }, + { + "epoch": 0.33859542602712, + "grad_norm": 0.31666767597198486, + "learning_rate": 0.00018628539830324229, + "loss": 0.2332, + "step": 1673 + }, + { + "epoch": 0.33879781420765026, + "grad_norm": 0.3063281178474426, + "learning_rate": 0.0001862693169086496, + "loss": 0.2301, + "step": 1674 + }, + { + "epoch": 0.3390002023881805, + "grad_norm": 0.23433008790016174, + "learning_rate": 0.0001862532267862326, + "loss": 0.2128, + "step": 1675 + }, + { + "epoch": 0.33920259056871077, + "grad_norm": 0.3376384377479553, + "learning_rate": 0.0001862371279376192, + "loss": 0.2373, + "step": 1676 + }, + { + "epoch": 0.339404978749241, + "grad_norm": 0.41629114747047424, + "learning_rate": 0.00018622102036443806, + "loss": 0.2496, + "step": 1677 + }, + { + "epoch": 0.3396073669297713, + "grad_norm": 0.40785738825798035, + "learning_rate": 0.00018620490406831875, + "loss": 0.244, + "step": 1678 + }, + { + "epoch": 0.33980975511030154, + "grad_norm": 0.3256091773509979, + "learning_rate": 0.00018618877905089183, + "loss": 0.2055, + "step": 1679 + }, + { + "epoch": 0.3400121432908318, + "grad_norm": 0.3316114544868469, + "learning_rate": 0.0001861726453137886, + "loss": 0.207, + "step": 1680 + }, + { + "epoch": 0.34021453147136205, + "grad_norm": 1.1633223295211792, + "learning_rate": 0.00018615650285864132, + "loss": 0.2441, + "step": 1681 + }, + { + "epoch": 0.3404169196518923, + "grad_norm": 0.3376322090625763, + "learning_rate": 0.0001861403516870831, + "loss": 0.2722, + "step": 1682 + }, + { + "epoch": 0.3406193078324226, + "grad_norm": 0.33407843112945557, + "learning_rate": 0.00018612419180074797, + "loss": 0.2484, + "step": 1683 + }, + { + "epoch": 0.34082169601295287, + "grad_norm": 0.28343215584754944, + "learning_rate": 0.0001861080232012708, + "loss": 0.2318, + "step": 1684 + }, + { + "epoch": 0.3410240841934831, + "grad_norm": 0.3230278193950653, + "learning_rate": 0.00018609184589028733, + "loss": 0.2357, + "step": 1685 + }, + { + "epoch": 0.3412264723740134, + "grad_norm": 0.31829163432121277, + "learning_rate": 0.0001860756598694343, + "loss": 0.2468, + "step": 1686 + }, + { + "epoch": 0.34142886055454363, + "grad_norm": 0.3306484520435333, + "learning_rate": 0.00018605946514034915, + "loss": 0.2483, + "step": 1687 + }, + { + "epoch": 0.3416312487350739, + "grad_norm": 0.2926234006881714, + "learning_rate": 0.00018604326170467035, + "loss": 0.2282, + "step": 1688 + }, + { + "epoch": 0.34183363691560414, + "grad_norm": 0.6779906749725342, + "learning_rate": 0.00018602704956403716, + "loss": 0.2533, + "step": 1689 + }, + { + "epoch": 0.3420360250961344, + "grad_norm": 0.3214509189128876, + "learning_rate": 0.00018601082872008977, + "loss": 0.2078, + "step": 1690 + }, + { + "epoch": 0.34223841327666465, + "grad_norm": 0.2985462248325348, + "learning_rate": 0.00018599459917446924, + "loss": 0.2648, + "step": 1691 + }, + { + "epoch": 0.3424408014571949, + "grad_norm": 0.503711462020874, + "learning_rate": 0.0001859783609288175, + "loss": 0.2725, + "step": 1692 + }, + { + "epoch": 0.34264318963772517, + "grad_norm": 0.3204715847969055, + "learning_rate": 0.0001859621139847773, + "loss": 0.2021, + "step": 1693 + }, + { + "epoch": 0.3428455778182554, + "grad_norm": 0.28608301281929016, + "learning_rate": 0.0001859458583439925, + "loss": 0.2391, + "step": 1694 + }, + { + "epoch": 0.3430479659987857, + "grad_norm": 0.3452533483505249, + "learning_rate": 0.0001859295940081075, + "loss": 0.2597, + "step": 1695 + }, + { + "epoch": 0.34325035417931593, + "grad_norm": 0.3648858368396759, + "learning_rate": 0.00018591332097876782, + "loss": 0.2276, + "step": 1696 + }, + { + "epoch": 0.3434527423598462, + "grad_norm": 0.2962428331375122, + "learning_rate": 0.00018589703925761986, + "loss": 0.2382, + "step": 1697 + }, + { + "epoch": 0.34365513054037644, + "grad_norm": 0.33181461691856384, + "learning_rate": 0.00018588074884631076, + "loss": 0.2436, + "step": 1698 + }, + { + "epoch": 0.3438575187209067, + "grad_norm": 0.32770097255706787, + "learning_rate": 0.00018586444974648858, + "loss": 0.241, + "step": 1699 + }, + { + "epoch": 0.34405990690143695, + "grad_norm": 0.23778030276298523, + "learning_rate": 0.00018584814195980238, + "loss": 0.2011, + "step": 1700 + }, + { + "epoch": 0.34405990690143695, + "eval_loss": 0.290479451417923, + "eval_runtime": 0.7406, + "eval_samples_per_second": 6.752, + "eval_steps_per_second": 1.35, + "step": 1700 + }, + { + "epoch": 0.3442622950819672, + "grad_norm": 0.40289005637168884, + "learning_rate": 0.00018583182548790196, + "loss": 0.2699, + "step": 1701 + }, + { + "epoch": 0.34446468326249746, + "grad_norm": 0.46312880516052246, + "learning_rate": 0.00018581550033243806, + "loss": 0.2294, + "step": 1702 + }, + { + "epoch": 0.3446670714430277, + "grad_norm": 0.35846683382987976, + "learning_rate": 0.00018579916649506229, + "loss": 0.2481, + "step": 1703 + }, + { + "epoch": 0.344869459623558, + "grad_norm": 0.5839173197746277, + "learning_rate": 0.00018578282397742712, + "loss": 0.2502, + "step": 1704 + }, + { + "epoch": 0.3450718478040882, + "grad_norm": 0.3601457476615906, + "learning_rate": 0.00018576647278118594, + "loss": 0.2289, + "step": 1705 + }, + { + "epoch": 0.3452742359846185, + "grad_norm": 0.32447540760040283, + "learning_rate": 0.000185750112907993, + "loss": 0.2528, + "step": 1706 + }, + { + "epoch": 0.34547662416514874, + "grad_norm": 0.3305363059043884, + "learning_rate": 0.0001857337443595034, + "loss": 0.302, + "step": 1707 + }, + { + "epoch": 0.345679012345679, + "grad_norm": 0.27065616846084595, + "learning_rate": 0.00018571736713737314, + "loss": 0.2285, + "step": 1708 + }, + { + "epoch": 0.34588140052620925, + "grad_norm": 0.32808080315589905, + "learning_rate": 0.00018570098124325908, + "loss": 0.2727, + "step": 1709 + }, + { + "epoch": 0.3460837887067395, + "grad_norm": 0.29432907700538635, + "learning_rate": 0.00018568458667881895, + "loss": 0.2145, + "step": 1710 + }, + { + "epoch": 0.34628617688726976, + "grad_norm": 0.3078475296497345, + "learning_rate": 0.00018566818344571147, + "loss": 0.2661, + "step": 1711 + }, + { + "epoch": 0.3464885650678, + "grad_norm": 0.3330211341381073, + "learning_rate": 0.0001856517715455961, + "loss": 0.2525, + "step": 1712 + }, + { + "epoch": 0.3466909532483303, + "grad_norm": 0.3153764605522156, + "learning_rate": 0.0001856353509801332, + "loss": 0.2511, + "step": 1713 + }, + { + "epoch": 0.3468933414288606, + "grad_norm": 0.3954264521598816, + "learning_rate": 0.00018561892175098405, + "loss": 0.2523, + "step": 1714 + }, + { + "epoch": 0.34709572960939084, + "grad_norm": 0.2736445367336273, + "learning_rate": 0.0001856024838598108, + "loss": 0.2146, + "step": 1715 + }, + { + "epoch": 0.3472981177899211, + "grad_norm": 0.3401740789413452, + "learning_rate": 0.0001855860373082764, + "loss": 0.2577, + "step": 1716 + }, + { + "epoch": 0.34750050597045135, + "grad_norm": 0.2959592044353485, + "learning_rate": 0.00018556958209804485, + "loss": 0.2301, + "step": 1717 + }, + { + "epoch": 0.3477028941509816, + "grad_norm": 0.28072482347488403, + "learning_rate": 0.00018555311823078083, + "loss": 0.2558, + "step": 1718 + }, + { + "epoch": 0.34790528233151186, + "grad_norm": 0.3037206530570984, + "learning_rate": 0.00018553664570815001, + "loss": 0.264, + "step": 1719 + }, + { + "epoch": 0.3481076705120421, + "grad_norm": 0.27355295419692993, + "learning_rate": 0.0001855201645318189, + "loss": 0.2362, + "step": 1720 + }, + { + "epoch": 0.34831005869257237, + "grad_norm": 0.35578373074531555, + "learning_rate": 0.0001855036747034549, + "loss": 0.2477, + "step": 1721 + }, + { + "epoch": 0.3485124468731026, + "grad_norm": 0.4559978246688843, + "learning_rate": 0.00018548717622472627, + "loss": 0.2813, + "step": 1722 + }, + { + "epoch": 0.3487148350536329, + "grad_norm": 0.42498183250427246, + "learning_rate": 0.00018547066909730214, + "loss": 0.2262, + "step": 1723 + }, + { + "epoch": 0.34891722323416313, + "grad_norm": 0.5447008013725281, + "learning_rate": 0.00018545415332285256, + "loss": 0.2371, + "step": 1724 + }, + { + "epoch": 0.3491196114146934, + "grad_norm": 0.3454398214817047, + "learning_rate": 0.00018543762890304842, + "loss": 0.2485, + "step": 1725 + }, + { + "epoch": 0.34932199959522364, + "grad_norm": 0.3223482072353363, + "learning_rate": 0.00018542109583956148, + "loss": 0.2096, + "step": 1726 + }, + { + "epoch": 0.3495243877757539, + "grad_norm": 0.31731921434402466, + "learning_rate": 0.00018540455413406433, + "loss": 0.2593, + "step": 1727 + }, + { + "epoch": 0.34972677595628415, + "grad_norm": 0.33122220635414124, + "learning_rate": 0.00018538800378823056, + "loss": 0.2643, + "step": 1728 + }, + { + "epoch": 0.3499291641368144, + "grad_norm": 0.2749335467815399, + "learning_rate": 0.00018537144480373455, + "loss": 0.2337, + "step": 1729 + }, + { + "epoch": 0.35013155231734466, + "grad_norm": 0.39072299003601074, + "learning_rate": 0.00018535487718225152, + "loss": 0.2268, + "step": 1730 + }, + { + "epoch": 0.3503339404978749, + "grad_norm": 0.32879638671875, + "learning_rate": 0.00018533830092545763, + "loss": 0.2519, + "step": 1731 + }, + { + "epoch": 0.3505363286784052, + "grad_norm": 0.340533584356308, + "learning_rate": 0.00018532171603502992, + "loss": 0.2244, + "step": 1732 + }, + { + "epoch": 0.35073871685893543, + "grad_norm": 0.2613264322280884, + "learning_rate": 0.00018530512251264624, + "loss": 0.246, + "step": 1733 + }, + { + "epoch": 0.3509411050394657, + "grad_norm": 0.33618271350860596, + "learning_rate": 0.00018528852035998536, + "loss": 0.2353, + "step": 1734 + }, + { + "epoch": 0.35114349321999594, + "grad_norm": 0.36929893493652344, + "learning_rate": 0.00018527190957872694, + "loss": 0.272, + "step": 1735 + }, + { + "epoch": 0.3513458814005262, + "grad_norm": 0.3761570453643799, + "learning_rate": 0.00018525529017055143, + "loss": 0.2537, + "step": 1736 + }, + { + "epoch": 0.35154826958105645, + "grad_norm": 0.2979852855205536, + "learning_rate": 0.00018523866213714023, + "loss": 0.2541, + "step": 1737 + }, + { + "epoch": 0.3517506577615867, + "grad_norm": 0.30869945883750916, + "learning_rate": 0.0001852220254801756, + "loss": 0.2288, + "step": 1738 + }, + { + "epoch": 0.35195304594211696, + "grad_norm": 0.2735971510410309, + "learning_rate": 0.00018520538020134065, + "loss": 0.2211, + "step": 1739 + }, + { + "epoch": 0.3521554341226472, + "grad_norm": 0.38951921463012695, + "learning_rate": 0.0001851887263023194, + "loss": 0.2706, + "step": 1740 + }, + { + "epoch": 0.35235782230317747, + "grad_norm": 0.32086440920829773, + "learning_rate": 0.00018517206378479667, + "loss": 0.27, + "step": 1741 + }, + { + "epoch": 0.3525602104837077, + "grad_norm": 0.3548159897327423, + "learning_rate": 0.00018515539265045826, + "loss": 0.2511, + "step": 1742 + }, + { + "epoch": 0.35276259866423804, + "grad_norm": 0.2878701090812683, + "learning_rate": 0.00018513871290099074, + "loss": 0.2381, + "step": 1743 + }, + { + "epoch": 0.3529649868447683, + "grad_norm": 0.3361879587173462, + "learning_rate": 0.00018512202453808158, + "loss": 0.2539, + "step": 1744 + }, + { + "epoch": 0.35316737502529855, + "grad_norm": 0.31610074639320374, + "learning_rate": 0.00018510532756341918, + "loss": 0.2477, + "step": 1745 + }, + { + "epoch": 0.3533697632058288, + "grad_norm": 0.3475480079650879, + "learning_rate": 0.00018508862197869273, + "loss": 0.2321, + "step": 1746 + }, + { + "epoch": 0.35357215138635906, + "grad_norm": 0.3138309121131897, + "learning_rate": 0.0001850719077855923, + "loss": 0.23, + "step": 1747 + }, + { + "epoch": 0.3537745395668893, + "grad_norm": 0.3435781002044678, + "learning_rate": 0.00018505518498580892, + "loss": 0.2569, + "step": 1748 + }, + { + "epoch": 0.35397692774741957, + "grad_norm": 0.3510514497756958, + "learning_rate": 0.00018503845358103438, + "loss": 0.2504, + "step": 1749 + }, + { + "epoch": 0.3541793159279498, + "grad_norm": 0.3832964599132538, + "learning_rate": 0.00018502171357296144, + "loss": 0.2639, + "step": 1750 + }, + { + "epoch": 0.3541793159279498, + "eval_loss": 0.28545942902565, + "eval_runtime": 0.7382, + "eval_samples_per_second": 6.773, + "eval_steps_per_second": 1.355, + "step": 1750 + }, + { + "epoch": 0.3543817041084801, + "grad_norm": 0.932768702507019, + "learning_rate": 0.0001850049649632836, + "loss": 0.2413, + "step": 1751 + }, + { + "epoch": 0.35458409228901033, + "grad_norm": 0.30881059169769287, + "learning_rate": 0.00018498820775369538, + "loss": 0.2755, + "step": 1752 + }, + { + "epoch": 0.3547864804695406, + "grad_norm": 0.35876527428627014, + "learning_rate": 0.00018497144194589207, + "loss": 0.2547, + "step": 1753 + }, + { + "epoch": 0.35498886865007084, + "grad_norm": 0.45719102025032043, + "learning_rate": 0.00018495466754156982, + "loss": 0.2675, + "step": 1754 + }, + { + "epoch": 0.3551912568306011, + "grad_norm": 0.3853405714035034, + "learning_rate": 0.00018493788454242575, + "loss": 0.2257, + "step": 1755 + }, + { + "epoch": 0.35539364501113135, + "grad_norm": 0.4291214346885681, + "learning_rate": 0.00018492109295015777, + "loss": 0.2542, + "step": 1756 + }, + { + "epoch": 0.3555960331916616, + "grad_norm": 0.3692077398300171, + "learning_rate": 0.0001849042927664647, + "loss": 0.2405, + "step": 1757 + }, + { + "epoch": 0.35579842137219186, + "grad_norm": 0.36899515986442566, + "learning_rate": 0.00018488748399304617, + "loss": 0.2731, + "step": 1758 + }, + { + "epoch": 0.3560008095527221, + "grad_norm": 0.28955820202827454, + "learning_rate": 0.00018487066663160269, + "loss": 0.2448, + "step": 1759 + }, + { + "epoch": 0.3562031977332524, + "grad_norm": 0.27147674560546875, + "learning_rate": 0.0001848538406838357, + "loss": 0.2389, + "step": 1760 + }, + { + "epoch": 0.35640558591378263, + "grad_norm": 0.38604146242141724, + "learning_rate": 0.0001848370061514475, + "loss": 0.2672, + "step": 1761 + }, + { + "epoch": 0.3566079740943129, + "grad_norm": 0.3210741877555847, + "learning_rate": 0.0001848201630361412, + "loss": 0.2054, + "step": 1762 + }, + { + "epoch": 0.35681036227484314, + "grad_norm": 0.312847763299942, + "learning_rate": 0.0001848033113396208, + "loss": 0.2201, + "step": 1763 + }, + { + "epoch": 0.3570127504553734, + "grad_norm": 0.38909921050071716, + "learning_rate": 0.00018478645106359117, + "loss": 0.2775, + "step": 1764 + }, + { + "epoch": 0.35721513863590365, + "grad_norm": 0.35675913095474243, + "learning_rate": 0.0001847695822097581, + "loss": 0.2529, + "step": 1765 + }, + { + "epoch": 0.3574175268164339, + "grad_norm": 0.31625452637672424, + "learning_rate": 0.0001847527047798282, + "loss": 0.247, + "step": 1766 + }, + { + "epoch": 0.35761991499696416, + "grad_norm": 0.31915339827537537, + "learning_rate": 0.00018473581877550887, + "loss": 0.2495, + "step": 1767 + }, + { + "epoch": 0.3578223031774944, + "grad_norm": 0.2950085997581482, + "learning_rate": 0.00018471892419850855, + "loss": 0.2239, + "step": 1768 + }, + { + "epoch": 0.35802469135802467, + "grad_norm": 0.35980451107025146, + "learning_rate": 0.00018470202105053644, + "loss": 0.2518, + "step": 1769 + }, + { + "epoch": 0.3582270795385549, + "grad_norm": 0.40969300270080566, + "learning_rate": 0.0001846851093333026, + "loss": 0.2701, + "step": 1770 + }, + { + "epoch": 0.3584294677190852, + "grad_norm": 0.2965695559978485, + "learning_rate": 0.000184668189048518, + "loss": 0.2833, + "step": 1771 + }, + { + "epoch": 0.35863185589961544, + "grad_norm": 0.28182271122932434, + "learning_rate": 0.00018465126019789443, + "loss": 0.2136, + "step": 1772 + }, + { + "epoch": 0.3588342440801457, + "grad_norm": 0.3636777698993683, + "learning_rate": 0.0001846343227831446, + "loss": 0.2936, + "step": 1773 + }, + { + "epoch": 0.359036632260676, + "grad_norm": 0.3793594539165497, + "learning_rate": 0.00018461737680598202, + "loss": 0.2448, + "step": 1774 + }, + { + "epoch": 0.35923902044120626, + "grad_norm": 0.3261548578739166, + "learning_rate": 0.00018460042226812115, + "loss": 0.2335, + "step": 1775 + }, + { + "epoch": 0.3594414086217365, + "grad_norm": 0.2846795916557312, + "learning_rate": 0.00018458345917127727, + "loss": 0.247, + "step": 1776 + }, + { + "epoch": 0.35964379680226677, + "grad_norm": 0.32100164890289307, + "learning_rate": 0.0001845664875171665, + "loss": 0.2537, + "step": 1777 + }, + { + "epoch": 0.359846184982797, + "grad_norm": 0.2889302372932434, + "learning_rate": 0.00018454950730750587, + "loss": 0.2233, + "step": 1778 + }, + { + "epoch": 0.3600485731633273, + "grad_norm": 0.36676448583602905, + "learning_rate": 0.00018453251854401326, + "loss": 0.2355, + "step": 1779 + }, + { + "epoch": 0.36025096134385753, + "grad_norm": 0.3235335052013397, + "learning_rate": 0.00018451552122840742, + "loss": 0.2381, + "step": 1780 + }, + { + "epoch": 0.3604533495243878, + "grad_norm": 0.30230575799942017, + "learning_rate": 0.00018449851536240798, + "loss": 0.2659, + "step": 1781 + }, + { + "epoch": 0.36065573770491804, + "grad_norm": 0.4702809751033783, + "learning_rate": 0.00018448150094773532, + "loss": 0.2251, + "step": 1782 + }, + { + "epoch": 0.3608581258854483, + "grad_norm": 0.2967616021633148, + "learning_rate": 0.00018446447798611088, + "loss": 0.2537, + "step": 1783 + }, + { + "epoch": 0.36106051406597856, + "grad_norm": 0.296118825674057, + "learning_rate": 0.00018444744647925685, + "loss": 0.258, + "step": 1784 + }, + { + "epoch": 0.3612629022465088, + "grad_norm": 0.35841524600982666, + "learning_rate": 0.00018443040642889628, + "loss": 0.2287, + "step": 1785 + }, + { + "epoch": 0.36146529042703907, + "grad_norm": 0.2674766182899475, + "learning_rate": 0.00018441335783675312, + "loss": 0.2381, + "step": 1786 + }, + { + "epoch": 0.3616676786075693, + "grad_norm": 0.305122435092926, + "learning_rate": 0.0001843963007045521, + "loss": 0.2657, + "step": 1787 + }, + { + "epoch": 0.3618700667880996, + "grad_norm": 0.35348016023635864, + "learning_rate": 0.00018437923503401897, + "loss": 0.2608, + "step": 1788 + }, + { + "epoch": 0.36207245496862983, + "grad_norm": 0.3226334750652313, + "learning_rate": 0.00018436216082688022, + "loss": 0.2273, + "step": 1789 + }, + { + "epoch": 0.3622748431491601, + "grad_norm": 0.24315498769283295, + "learning_rate": 0.00018434507808486324, + "loss": 0.2292, + "step": 1790 + }, + { + "epoch": 0.36247723132969034, + "grad_norm": 0.3406563103199005, + "learning_rate": 0.00018432798680969627, + "loss": 0.251, + "step": 1791 + }, + { + "epoch": 0.3626796195102206, + "grad_norm": 0.3662257194519043, + "learning_rate": 0.00018431088700310844, + "loss": 0.2665, + "step": 1792 + }, + { + "epoch": 0.36288200769075085, + "grad_norm": 0.2890268862247467, + "learning_rate": 0.00018429377866682972, + "loss": 0.2508, + "step": 1793 + }, + { + "epoch": 0.3630843958712811, + "grad_norm": 0.2680732011795044, + "learning_rate": 0.000184276661802591, + "loss": 0.2556, + "step": 1794 + }, + { + "epoch": 0.36328678405181136, + "grad_norm": 0.42836645245552063, + "learning_rate": 0.00018425953641212393, + "loss": 0.2372, + "step": 1795 + }, + { + "epoch": 0.3634891722323416, + "grad_norm": 0.4649638533592224, + "learning_rate": 0.00018424240249716108, + "loss": 0.2675, + "step": 1796 + }, + { + "epoch": 0.3636915604128719, + "grad_norm": 0.2570657432079315, + "learning_rate": 0.00018422526005943586, + "loss": 0.2549, + "step": 1797 + }, + { + "epoch": 0.36389394859340213, + "grad_norm": 0.3064950406551361, + "learning_rate": 0.00018420810910068264, + "loss": 0.2694, + "step": 1798 + }, + { + "epoch": 0.3640963367739324, + "grad_norm": 0.28641650080680847, + "learning_rate": 0.0001841909496226365, + "loss": 0.2657, + "step": 1799 + }, + { + "epoch": 0.36429872495446264, + "grad_norm": 0.2810456156730652, + "learning_rate": 0.00018417378162703348, + "loss": 0.2133, + "step": 1800 + }, + { + "epoch": 0.36429872495446264, + "eval_loss": 0.28418654203414917, + "eval_runtime": 0.7405, + "eval_samples_per_second": 6.752, + "eval_steps_per_second": 1.35, + "step": 1800 + }, + { + "epoch": 0.3645011131349929, + "grad_norm": 0.2836998403072357, + "learning_rate": 0.00018415660511561047, + "loss": 0.2408, + "step": 1801 + }, + { + "epoch": 0.36470350131552315, + "grad_norm": 0.26643380522727966, + "learning_rate": 0.00018413942009010522, + "loss": 0.2094, + "step": 1802 + }, + { + "epoch": 0.3649058894960534, + "grad_norm": 0.2961254417896271, + "learning_rate": 0.00018412222655225628, + "loss": 0.2621, + "step": 1803 + }, + { + "epoch": 0.3651082776765837, + "grad_norm": 0.3298720121383667, + "learning_rate": 0.00018410502450380315, + "loss": 0.2449, + "step": 1804 + }, + { + "epoch": 0.36531066585711397, + "grad_norm": 0.34053587913513184, + "learning_rate": 0.00018408781394648615, + "loss": 0.2536, + "step": 1805 + }, + { + "epoch": 0.3655130540376442, + "grad_norm": 0.3451336622238159, + "learning_rate": 0.00018407059488204645, + "loss": 0.2532, + "step": 1806 + }, + { + "epoch": 0.3657154422181745, + "grad_norm": 0.33622947335243225, + "learning_rate": 0.00018405336731222612, + "loss": 0.2062, + "step": 1807 + }, + { + "epoch": 0.36591783039870474, + "grad_norm": 0.30621781945228577, + "learning_rate": 0.00018403613123876803, + "loss": 0.241, + "step": 1808 + }, + { + "epoch": 0.366120218579235, + "grad_norm": 0.32667985558509827, + "learning_rate": 0.000184018886663416, + "loss": 0.2363, + "step": 1809 + }, + { + "epoch": 0.36632260675976525, + "grad_norm": 0.31269723176956177, + "learning_rate": 0.00018400163358791454, + "loss": 0.2572, + "step": 1810 + }, + { + "epoch": 0.3665249949402955, + "grad_norm": 0.31308773159980774, + "learning_rate": 0.00018398437201400927, + "loss": 0.2629, + "step": 1811 + }, + { + "epoch": 0.36672738312082576, + "grad_norm": 0.23163512349128723, + "learning_rate": 0.0001839671019434465, + "loss": 0.1939, + "step": 1812 + }, + { + "epoch": 0.366929771301356, + "grad_norm": 0.2652641832828522, + "learning_rate": 0.00018394982337797337, + "loss": 0.2445, + "step": 1813 + }, + { + "epoch": 0.36713215948188627, + "grad_norm": 0.4329833388328552, + "learning_rate": 0.00018393253631933797, + "loss": 0.2726, + "step": 1814 + }, + { + "epoch": 0.3673345476624165, + "grad_norm": 0.44865334033966064, + "learning_rate": 0.0001839152407692893, + "loss": 0.2265, + "step": 1815 + }, + { + "epoch": 0.3675369358429468, + "grad_norm": 0.26356157660484314, + "learning_rate": 0.000183897936729577, + "loss": 0.214, + "step": 1816 + }, + { + "epoch": 0.36773932402347703, + "grad_norm": 0.2965501844882965, + "learning_rate": 0.00018388062420195188, + "loss": 0.1911, + "step": 1817 + }, + { + "epoch": 0.3679417122040073, + "grad_norm": 0.2967815399169922, + "learning_rate": 0.00018386330318816529, + "loss": 0.2478, + "step": 1818 + }, + { + "epoch": 0.36814410038453754, + "grad_norm": 0.37267395853996277, + "learning_rate": 0.00018384597368996966, + "loss": 0.2583, + "step": 1819 + }, + { + "epoch": 0.3683464885650678, + "grad_norm": 0.36238357424736023, + "learning_rate": 0.00018382863570911822, + "loss": 0.2703, + "step": 1820 + }, + { + "epoch": 0.36854887674559805, + "grad_norm": 0.3597848415374756, + "learning_rate": 0.00018381128924736502, + "loss": 0.277, + "step": 1821 + }, + { + "epoch": 0.3687512649261283, + "grad_norm": 0.37893056869506836, + "learning_rate": 0.00018379393430646498, + "loss": 0.2414, + "step": 1822 + }, + { + "epoch": 0.36895365310665856, + "grad_norm": 0.2763799726963043, + "learning_rate": 0.00018377657088817392, + "loss": 0.2366, + "step": 1823 + }, + { + "epoch": 0.3691560412871888, + "grad_norm": 0.32323357462882996, + "learning_rate": 0.00018375919899424846, + "loss": 0.2512, + "step": 1824 + }, + { + "epoch": 0.3693584294677191, + "grad_norm": 0.2714237868785858, + "learning_rate": 0.00018374181862644613, + "loss": 0.242, + "step": 1825 + }, + { + "epoch": 0.36956081764824933, + "grad_norm": 0.557920515537262, + "learning_rate": 0.00018372442978652532, + "loss": 0.2205, + "step": 1826 + }, + { + "epoch": 0.3697632058287796, + "grad_norm": 0.2829962372779846, + "learning_rate": 0.00018370703247624516, + "loss": 0.2467, + "step": 1827 + }, + { + "epoch": 0.36996559400930984, + "grad_norm": 0.2736676037311554, + "learning_rate": 0.00018368962669736578, + "loss": 0.2482, + "step": 1828 + }, + { + "epoch": 0.3701679821898401, + "grad_norm": 0.3360370397567749, + "learning_rate": 0.00018367221245164816, + "loss": 0.2753, + "step": 1829 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 0.40492188930511475, + "learning_rate": 0.000183654789740854, + "loss": 0.2671, + "step": 1830 + }, + { + "epoch": 0.3705727585509006, + "grad_norm": 0.3024072051048279, + "learning_rate": 0.00018363735856674604, + "loss": 0.2373, + "step": 1831 + }, + { + "epoch": 0.37077514673143086, + "grad_norm": 0.3565376102924347, + "learning_rate": 0.0001836199189310877, + "loss": 0.2321, + "step": 1832 + }, + { + "epoch": 0.3709775349119611, + "grad_norm": 0.4333427846431732, + "learning_rate": 0.00018360247083564342, + "loss": 0.2743, + "step": 1833 + }, + { + "epoch": 0.3711799230924914, + "grad_norm": 0.3357686996459961, + "learning_rate": 0.00018358501428217833, + "loss": 0.2359, + "step": 1834 + }, + { + "epoch": 0.3713823112730217, + "grad_norm": 0.30689799785614014, + "learning_rate": 0.00018356754927245856, + "loss": 0.258, + "step": 1835 + }, + { + "epoch": 0.37158469945355194, + "grad_norm": 0.7203855514526367, + "learning_rate": 0.00018355007580825102, + "loss": 0.2673, + "step": 1836 + }, + { + "epoch": 0.3717870876340822, + "grad_norm": 0.5838192701339722, + "learning_rate": 0.0001835325938913235, + "loss": 0.2674, + "step": 1837 + }, + { + "epoch": 0.37198947581461245, + "grad_norm": 0.2569965124130249, + "learning_rate": 0.0001835151035234446, + "loss": 0.229, + "step": 1838 + }, + { + "epoch": 0.3721918639951427, + "grad_norm": 0.3677656352519989, + "learning_rate": 0.00018349760470638384, + "loss": 0.2847, + "step": 1839 + }, + { + "epoch": 0.37239425217567296, + "grad_norm": 0.32607802748680115, + "learning_rate": 0.00018348009744191158, + "loss": 0.2258, + "step": 1840 + }, + { + "epoch": 0.3725966403562032, + "grad_norm": 0.8226847648620605, + "learning_rate": 0.00018346258173179903, + "loss": 0.2736, + "step": 1841 + }, + { + "epoch": 0.37279902853673347, + "grad_norm": 0.33003148436546326, + "learning_rate": 0.00018344505757781818, + "loss": 0.2587, + "step": 1842 + }, + { + "epoch": 0.3730014167172637, + "grad_norm": 0.6836985945701599, + "learning_rate": 0.000183427524981742, + "loss": 0.2281, + "step": 1843 + }, + { + "epoch": 0.373203804897794, + "grad_norm": 0.4346601963043213, + "learning_rate": 0.00018340998394534425, + "loss": 0.2798, + "step": 1844 + }, + { + "epoch": 0.37340619307832423, + "grad_norm": 0.28121423721313477, + "learning_rate": 0.0001833924344703995, + "loss": 0.2677, + "step": 1845 + }, + { + "epoch": 0.3736085812588545, + "grad_norm": 0.45256420969963074, + "learning_rate": 0.00018337487655868331, + "loss": 0.2561, + "step": 1846 + }, + { + "epoch": 0.37381096943938474, + "grad_norm": 0.5170138478279114, + "learning_rate": 0.00018335731021197193, + "loss": 0.3101, + "step": 1847 + }, + { + "epoch": 0.374013357619915, + "grad_norm": 0.3438572585582733, + "learning_rate": 0.00018333973543204255, + "loss": 0.2484, + "step": 1848 + }, + { + "epoch": 0.37421574580044525, + "grad_norm": 0.6279707551002502, + "learning_rate": 0.00018332215222067322, + "loss": 0.2347, + "step": 1849 + }, + { + "epoch": 0.3744181339809755, + "grad_norm": 0.3407716155052185, + "learning_rate": 0.0001833045605796428, + "loss": 0.2515, + "step": 1850 + }, + { + "epoch": 0.3744181339809755, + "eval_loss": 0.28794625401496887, + "eval_runtime": 0.7392, + "eval_samples_per_second": 6.764, + "eval_steps_per_second": 1.353, + "step": 1850 + }, + { + "epoch": 0.37462052216150576, + "grad_norm": 0.2511065900325775, + "learning_rate": 0.00018328696051073107, + "loss": 0.2173, + "step": 1851 + }, + { + "epoch": 0.374822910342036, + "grad_norm": 0.2791818380355835, + "learning_rate": 0.00018326935201571859, + "loss": 0.2783, + "step": 1852 + }, + { + "epoch": 0.3750252985225663, + "grad_norm": 0.3694707155227661, + "learning_rate": 0.0001832517350963868, + "loss": 0.2367, + "step": 1853 + }, + { + "epoch": 0.37522768670309653, + "grad_norm": 0.4342188537120819, + "learning_rate": 0.000183234109754518, + "loss": 0.2433, + "step": 1854 + }, + { + "epoch": 0.3754300748836268, + "grad_norm": 0.35381773114204407, + "learning_rate": 0.00018321647599189538, + "loss": 0.2492, + "step": 1855 + }, + { + "epoch": 0.37563246306415704, + "grad_norm": 0.3225228488445282, + "learning_rate": 0.00018319883381030287, + "loss": 0.214, + "step": 1856 + }, + { + "epoch": 0.3758348512446873, + "grad_norm": 0.3866134285926819, + "learning_rate": 0.00018318118321152533, + "loss": 0.2737, + "step": 1857 + }, + { + "epoch": 0.37603723942521755, + "grad_norm": 0.28414440155029297, + "learning_rate": 0.00018316352419734853, + "loss": 0.2119, + "step": 1858 + }, + { + "epoch": 0.3762396276057478, + "grad_norm": 0.3543158173561096, + "learning_rate": 0.00018314585676955893, + "loss": 0.2344, + "step": 1859 + }, + { + "epoch": 0.37644201578627806, + "grad_norm": 0.37424933910369873, + "learning_rate": 0.00018312818092994403, + "loss": 0.2577, + "step": 1860 + }, + { + "epoch": 0.3766444039668083, + "grad_norm": 0.48105794191360474, + "learning_rate": 0.00018311049668029197, + "loss": 0.2546, + "step": 1861 + }, + { + "epoch": 0.37684679214733857, + "grad_norm": 0.3142531216144562, + "learning_rate": 0.000183092804022392, + "loss": 0.2466, + "step": 1862 + }, + { + "epoch": 0.3770491803278688, + "grad_norm": 0.3777507245540619, + "learning_rate": 0.00018307510295803396, + "loss": 0.2528, + "step": 1863 + }, + { + "epoch": 0.3772515685083991, + "grad_norm": 0.35614216327667236, + "learning_rate": 0.0001830573934890087, + "loss": 0.2479, + "step": 1864 + }, + { + "epoch": 0.3774539566889294, + "grad_norm": 0.40795329213142395, + "learning_rate": 0.00018303967561710788, + "loss": 0.2684, + "step": 1865 + }, + { + "epoch": 0.37765634486945965, + "grad_norm": 0.3819682002067566, + "learning_rate": 0.000183021949344124, + "loss": 0.2542, + "step": 1866 + }, + { + "epoch": 0.3778587330499899, + "grad_norm": 0.3602333962917328, + "learning_rate": 0.00018300421467185046, + "loss": 0.2412, + "step": 1867 + }, + { + "epoch": 0.37806112123052016, + "grad_norm": 0.41661393642425537, + "learning_rate": 0.0001829864716020814, + "loss": 0.239, + "step": 1868 + }, + { + "epoch": 0.3782635094110504, + "grad_norm": 0.40309882164001465, + "learning_rate": 0.00018296872013661192, + "loss": 0.2956, + "step": 1869 + }, + { + "epoch": 0.37846589759158067, + "grad_norm": 0.3767727017402649, + "learning_rate": 0.0001829509602772379, + "loss": 0.2564, + "step": 1870 + }, + { + "epoch": 0.3786682857721109, + "grad_norm": 0.3340590000152588, + "learning_rate": 0.00018293319202575614, + "loss": 0.2595, + "step": 1871 + }, + { + "epoch": 0.3788706739526412, + "grad_norm": 0.4940049648284912, + "learning_rate": 0.0001829154153839642, + "loss": 0.2399, + "step": 1872 + }, + { + "epoch": 0.37907306213317143, + "grad_norm": 0.2716989815235138, + "learning_rate": 0.00018289763035366055, + "loss": 0.2129, + "step": 1873 + }, + { + "epoch": 0.3792754503137017, + "grad_norm": 0.32507073879241943, + "learning_rate": 0.00018287983693664455, + "loss": 0.2391, + "step": 1874 + }, + { + "epoch": 0.37947783849423194, + "grad_norm": 0.3758697211742401, + "learning_rate": 0.00018286203513471623, + "loss": 0.241, + "step": 1875 + }, + { + "epoch": 0.3796802266747622, + "grad_norm": 0.3964853584766388, + "learning_rate": 0.00018284422494967668, + "loss": 0.2686, + "step": 1876 + }, + { + "epoch": 0.37988261485529246, + "grad_norm": 0.5915670394897461, + "learning_rate": 0.00018282640638332773, + "loss": 0.2551, + "step": 1877 + }, + { + "epoch": 0.3800850030358227, + "grad_norm": 0.3817998468875885, + "learning_rate": 0.00018280857943747206, + "loss": 0.2885, + "step": 1878 + }, + { + "epoch": 0.38028739121635297, + "grad_norm": 0.31637680530548096, + "learning_rate": 0.00018279074411391323, + "loss": 0.2691, + "step": 1879 + }, + { + "epoch": 0.3804897793968832, + "grad_norm": 0.3863159120082855, + "learning_rate": 0.00018277290041445563, + "loss": 0.2769, + "step": 1880 + }, + { + "epoch": 0.3806921675774135, + "grad_norm": 0.28158068656921387, + "learning_rate": 0.0001827550483409045, + "loss": 0.2668, + "step": 1881 + }, + { + "epoch": 0.38089455575794373, + "grad_norm": 0.39321863651275635, + "learning_rate": 0.0001827371878950659, + "loss": 0.2705, + "step": 1882 + }, + { + "epoch": 0.381096943938474, + "grad_norm": 0.3414781987667084, + "learning_rate": 0.00018271931907874677, + "loss": 0.2484, + "step": 1883 + }, + { + "epoch": 0.38129933211900424, + "grad_norm": 0.2941926419734955, + "learning_rate": 0.00018270144189375492, + "loss": 0.229, + "step": 1884 + }, + { + "epoch": 0.3815017202995345, + "grad_norm": 0.30828848481178284, + "learning_rate": 0.00018268355634189893, + "loss": 0.2382, + "step": 1885 + }, + { + "epoch": 0.38170410848006475, + "grad_norm": 0.3570677936077118, + "learning_rate": 0.00018266566242498833, + "loss": 0.2914, + "step": 1886 + }, + { + "epoch": 0.381906496660595, + "grad_norm": 0.33216559886932373, + "learning_rate": 0.0001826477601448334, + "loss": 0.2525, + "step": 1887 + }, + { + "epoch": 0.38210888484112526, + "grad_norm": 0.37334170937538147, + "learning_rate": 0.0001826298495032453, + "loss": 0.2152, + "step": 1888 + }, + { + "epoch": 0.3823112730216555, + "grad_norm": 0.3412468731403351, + "learning_rate": 0.00018261193050203605, + "loss": 0.2453, + "step": 1889 + }, + { + "epoch": 0.3825136612021858, + "grad_norm": 0.255756676197052, + "learning_rate": 0.0001825940031430185, + "loss": 0.2203, + "step": 1890 + }, + { + "epoch": 0.38271604938271603, + "grad_norm": 0.29467645287513733, + "learning_rate": 0.0001825760674280064, + "loss": 0.2491, + "step": 1891 + }, + { + "epoch": 0.3829184375632463, + "grad_norm": 0.4214814305305481, + "learning_rate": 0.00018255812335881425, + "loss": 0.273, + "step": 1892 + }, + { + "epoch": 0.38312082574377654, + "grad_norm": 0.43483030796051025, + "learning_rate": 0.0001825401709372574, + "loss": 0.2466, + "step": 1893 + }, + { + "epoch": 0.3833232139243068, + "grad_norm": 0.31902021169662476, + "learning_rate": 0.0001825222101651522, + "loss": 0.2536, + "step": 1894 + }, + { + "epoch": 0.3835256021048371, + "grad_norm": 0.39684993028640747, + "learning_rate": 0.00018250424104431564, + "loss": 0.2377, + "step": 1895 + }, + { + "epoch": 0.38372799028536736, + "grad_norm": 0.5531018376350403, + "learning_rate": 0.00018248626357656567, + "loss": 0.241, + "step": 1896 + }, + { + "epoch": 0.3839303784658976, + "grad_norm": 0.3912695348262787, + "learning_rate": 0.0001824682777637211, + "loss": 0.2672, + "step": 1897 + }, + { + "epoch": 0.38413276664642787, + "grad_norm": 0.2787422835826874, + "learning_rate": 0.0001824502836076015, + "loss": 0.2097, + "step": 1898 + }, + { + "epoch": 0.3843351548269581, + "grad_norm": 0.28876063227653503, + "learning_rate": 0.00018243228111002732, + "loss": 0.2439, + "step": 1899 + }, + { + "epoch": 0.3845375430074884, + "grad_norm": 0.3721954822540283, + "learning_rate": 0.00018241427027281993, + "loss": 0.2368, + "step": 1900 + }, + { + "epoch": 0.3845375430074884, + "eval_loss": 0.2891212999820709, + "eval_runtime": 0.7385, + "eval_samples_per_second": 6.77, + "eval_steps_per_second": 1.354, + "step": 1900 + }, + { + "epoch": 0.38473993118801864, + "grad_norm": 0.43678271770477295, + "learning_rate": 0.00018239625109780144, + "loss": 0.2922, + "step": 1901 + }, + { + "epoch": 0.3849423193685489, + "grad_norm": 0.34848496317863464, + "learning_rate": 0.0001823782235867948, + "loss": 0.2649, + "step": 1902 + }, + { + "epoch": 0.38514470754907915, + "grad_norm": 0.4435858726501465, + "learning_rate": 0.00018236018774162388, + "loss": 0.2728, + "step": 1903 + }, + { + "epoch": 0.3853470957296094, + "grad_norm": 0.44345182180404663, + "learning_rate": 0.00018234214356411342, + "loss": 0.2716, + "step": 1904 + }, + { + "epoch": 0.38554948391013966, + "grad_norm": 0.38408511877059937, + "learning_rate": 0.00018232409105608884, + "loss": 0.2434, + "step": 1905 + }, + { + "epoch": 0.3857518720906699, + "grad_norm": 0.2690526843070984, + "learning_rate": 0.0001823060302193765, + "loss": 0.2581, + "step": 1906 + }, + { + "epoch": 0.38595426027120017, + "grad_norm": 0.32186561822891235, + "learning_rate": 0.00018228796105580373, + "loss": 0.254, + "step": 1907 + }, + { + "epoch": 0.3861566484517304, + "grad_norm": 0.3032004237174988, + "learning_rate": 0.00018226988356719845, + "loss": 0.2353, + "step": 1908 + }, + { + "epoch": 0.3863590366322607, + "grad_norm": 0.3549427390098572, + "learning_rate": 0.0001822517977553896, + "loss": 0.2407, + "step": 1909 + }, + { + "epoch": 0.38656142481279093, + "grad_norm": 0.3449702858924866, + "learning_rate": 0.00018223370362220696, + "loss": 0.2516, + "step": 1910 + }, + { + "epoch": 0.3867638129933212, + "grad_norm": 0.39067935943603516, + "learning_rate": 0.00018221560116948103, + "loss": 0.2618, + "step": 1911 + }, + { + "epoch": 0.38696620117385144, + "grad_norm": 0.37322261929512024, + "learning_rate": 0.00018219749039904322, + "loss": 0.226, + "step": 1912 + }, + { + "epoch": 0.3871685893543817, + "grad_norm": 0.4211459457874298, + "learning_rate": 0.00018217937131272585, + "loss": 0.2813, + "step": 1913 + }, + { + "epoch": 0.38737097753491195, + "grad_norm": 0.3087145686149597, + "learning_rate": 0.00018216124391236198, + "loss": 0.2307, + "step": 1914 + }, + { + "epoch": 0.3875733657154422, + "grad_norm": 0.3727726936340332, + "learning_rate": 0.00018214310819978556, + "loss": 0.265, + "step": 1915 + }, + { + "epoch": 0.38777575389597246, + "grad_norm": 0.3550623655319214, + "learning_rate": 0.00018212496417683137, + "loss": 0.2585, + "step": 1916 + }, + { + "epoch": 0.3879781420765027, + "grad_norm": 0.30683794617652893, + "learning_rate": 0.000182106811845335, + "loss": 0.2422, + "step": 1917 + }, + { + "epoch": 0.388180530257033, + "grad_norm": 0.4710239768028259, + "learning_rate": 0.000182088651207133, + "loss": 0.2756, + "step": 1918 + }, + { + "epoch": 0.38838291843756323, + "grad_norm": 0.30055004358291626, + "learning_rate": 0.0001820704822640626, + "loss": 0.233, + "step": 1919 + }, + { + "epoch": 0.3885853066180935, + "grad_norm": 0.33683738112449646, + "learning_rate": 0.00018205230501796196, + "loss": 0.2428, + "step": 1920 + }, + { + "epoch": 0.38878769479862374, + "grad_norm": 0.25107863545417786, + "learning_rate": 0.00018203411947067006, + "loss": 0.2183, + "step": 1921 + }, + { + "epoch": 0.388990082979154, + "grad_norm": 0.30623263120651245, + "learning_rate": 0.00018201592562402672, + "loss": 0.2546, + "step": 1922 + }, + { + "epoch": 0.38919247115968425, + "grad_norm": 0.2566131353378296, + "learning_rate": 0.0001819977234798726, + "loss": 0.2203, + "step": 1923 + }, + { + "epoch": 0.3893948593402145, + "grad_norm": 0.3132251501083374, + "learning_rate": 0.00018197951304004922, + "loss": 0.2521, + "step": 1924 + }, + { + "epoch": 0.3895972475207448, + "grad_norm": 0.29413077235221863, + "learning_rate": 0.00018196129430639896, + "loss": 0.2248, + "step": 1925 + }, + { + "epoch": 0.38979963570127507, + "grad_norm": 0.32894453406333923, + "learning_rate": 0.0001819430672807649, + "loss": 0.2401, + "step": 1926 + }, + { + "epoch": 0.3900020238818053, + "grad_norm": 0.26279136538505554, + "learning_rate": 0.0001819248319649911, + "loss": 0.1968, + "step": 1927 + }, + { + "epoch": 0.3902044120623356, + "grad_norm": 0.24191400408744812, + "learning_rate": 0.00018190658836092244, + "loss": 0.2482, + "step": 1928 + }, + { + "epoch": 0.39040680024286584, + "grad_norm": 0.2685995399951935, + "learning_rate": 0.00018188833647040463, + "loss": 0.2259, + "step": 1929 + }, + { + "epoch": 0.3906091884233961, + "grad_norm": 0.4994000196456909, + "learning_rate": 0.00018187007629528416, + "loss": 0.2561, + "step": 1930 + }, + { + "epoch": 0.39081157660392635, + "grad_norm": 0.29093116521835327, + "learning_rate": 0.00018185180783740842, + "loss": 0.2539, + "step": 1931 + }, + { + "epoch": 0.3910139647844566, + "grad_norm": 0.49854952096939087, + "learning_rate": 0.00018183353109862561, + "loss": 0.2879, + "step": 1932 + }, + { + "epoch": 0.39121635296498686, + "grad_norm": 0.3427339494228363, + "learning_rate": 0.00018181524608078484, + "loss": 0.2422, + "step": 1933 + }, + { + "epoch": 0.3914187411455171, + "grad_norm": 0.32090428471565247, + "learning_rate": 0.0001817969527857359, + "loss": 0.2734, + "step": 1934 + }, + { + "epoch": 0.39162112932604737, + "grad_norm": 0.33059003949165344, + "learning_rate": 0.00018177865121532963, + "loss": 0.2627, + "step": 1935 + }, + { + "epoch": 0.3918235175065776, + "grad_norm": 0.3862765431404114, + "learning_rate": 0.00018176034137141746, + "loss": 0.2851, + "step": 1936 + }, + { + "epoch": 0.3920259056871079, + "grad_norm": 0.31288978457450867, + "learning_rate": 0.00018174202325585184, + "loss": 0.234, + "step": 1937 + }, + { + "epoch": 0.39222829386763813, + "grad_norm": 0.2682187855243683, + "learning_rate": 0.00018172369687048608, + "loss": 0.2177, + "step": 1938 + }, + { + "epoch": 0.3924306820481684, + "grad_norm": 0.2934335172176361, + "learning_rate": 0.00018170536221717416, + "loss": 0.223, + "step": 1939 + }, + { + "epoch": 0.39263307022869864, + "grad_norm": 0.5119560360908508, + "learning_rate": 0.00018168701929777102, + "loss": 0.2685, + "step": 1940 + }, + { + "epoch": 0.3928354584092289, + "grad_norm": 0.889604926109314, + "learning_rate": 0.00018166866811413236, + "loss": 0.2363, + "step": 1941 + }, + { + "epoch": 0.39303784658975915, + "grad_norm": 0.3027733862400055, + "learning_rate": 0.00018165030866811486, + "loss": 0.2128, + "step": 1942 + }, + { + "epoch": 0.3932402347702894, + "grad_norm": 0.2894009053707123, + "learning_rate": 0.00018163194096157582, + "loss": 0.2565, + "step": 1943 + }, + { + "epoch": 0.39344262295081966, + "grad_norm": 0.3722357451915741, + "learning_rate": 0.0001816135649963736, + "loss": 0.2587, + "step": 1944 + }, + { + "epoch": 0.3936450111313499, + "grad_norm": 0.4462873339653015, + "learning_rate": 0.00018159518077436718, + "loss": 0.3092, + "step": 1945 + }, + { + "epoch": 0.3938473993118802, + "grad_norm": 0.5751633048057556, + "learning_rate": 0.0001815767882974166, + "loss": 0.2906, + "step": 1946 + }, + { + "epoch": 0.39404978749241043, + "grad_norm": 0.47575053572654724, + "learning_rate": 0.00018155838756738252, + "loss": 0.2553, + "step": 1947 + }, + { + "epoch": 0.3942521756729407, + "grad_norm": 2.4225292205810547, + "learning_rate": 0.00018153997858612656, + "loss": 0.2262, + "step": 1948 + }, + { + "epoch": 0.39445456385347094, + "grad_norm": 0.41146135330200195, + "learning_rate": 0.00018152156135551117, + "loss": 0.2422, + "step": 1949 + }, + { + "epoch": 0.3946569520340012, + "grad_norm": 0.3842962682247162, + "learning_rate": 0.00018150313587739957, + "loss": 0.2319, + "step": 1950 + }, + { + "epoch": 0.3946569520340012, + "eval_loss": 0.29187315702438354, + "eval_runtime": 0.7369, + "eval_samples_per_second": 6.785, + "eval_steps_per_second": 1.357, + "step": 1950 + }, + { + "epoch": 0.39485934021453145, + "grad_norm": 0.3393075168132782, + "learning_rate": 0.00018148470215365595, + "loss": 0.2476, + "step": 1951 + }, + { + "epoch": 0.3950617283950617, + "grad_norm": 0.7554660439491272, + "learning_rate": 0.00018146626018614512, + "loss": 0.2346, + "step": 1952 + }, + { + "epoch": 0.39526411657559196, + "grad_norm": 0.2916422188282013, + "learning_rate": 0.00018144780997673293, + "loss": 0.2481, + "step": 1953 + }, + { + "epoch": 0.3954665047561222, + "grad_norm": 0.4447515308856964, + "learning_rate": 0.00018142935152728592, + "loss": 0.2622, + "step": 1954 + }, + { + "epoch": 0.39566889293665247, + "grad_norm": 0.3644491732120514, + "learning_rate": 0.00018141088483967157, + "loss": 0.2683, + "step": 1955 + }, + { + "epoch": 0.3958712811171828, + "grad_norm": 0.43378445506095886, + "learning_rate": 0.00018139240991575813, + "loss": 0.2068, + "step": 1956 + }, + { + "epoch": 0.39607366929771304, + "grad_norm": 0.3398344814777374, + "learning_rate": 0.00018137392675741468, + "loss": 0.1931, + "step": 1957 + }, + { + "epoch": 0.3962760574782433, + "grad_norm": 0.6017144322395325, + "learning_rate": 0.0001813554353665112, + "loss": 0.2184, + "step": 1958 + }, + { + "epoch": 0.39647844565877355, + "grad_norm": 0.36190178990364075, + "learning_rate": 0.00018133693574491836, + "loss": 0.2764, + "step": 1959 + }, + { + "epoch": 0.3966808338393038, + "grad_norm": 0.33858707547187805, + "learning_rate": 0.00018131842789450786, + "loss": 0.252, + "step": 1960 + }, + { + "epoch": 0.39688322201983406, + "grad_norm": 0.3058442771434784, + "learning_rate": 0.00018129991181715208, + "loss": 0.19, + "step": 1961 + }, + { + "epoch": 0.3970856102003643, + "grad_norm": 0.5234771370887756, + "learning_rate": 0.00018128138751472432, + "loss": 0.2875, + "step": 1962 + }, + { + "epoch": 0.39728799838089457, + "grad_norm": 0.3918607532978058, + "learning_rate": 0.00018126285498909863, + "loss": 0.2565, + "step": 1963 + }, + { + "epoch": 0.3974903865614248, + "grad_norm": 0.4046613872051239, + "learning_rate": 0.00018124431424214996, + "loss": 0.2803, + "step": 1964 + }, + { + "epoch": 0.3976927747419551, + "grad_norm": 0.3878602087497711, + "learning_rate": 0.00018122576527575404, + "loss": 0.2708, + "step": 1965 + }, + { + "epoch": 0.39789516292248533, + "grad_norm": 0.3302357792854309, + "learning_rate": 0.0001812072080917875, + "loss": 0.2407, + "step": 1966 + }, + { + "epoch": 0.3980975511030156, + "grad_norm": 0.38548722863197327, + "learning_rate": 0.00018118864269212775, + "loss": 0.2543, + "step": 1967 + }, + { + "epoch": 0.39829993928354585, + "grad_norm": 0.5989793539047241, + "learning_rate": 0.00018117006907865298, + "loss": 0.2317, + "step": 1968 + }, + { + "epoch": 0.3985023274640761, + "grad_norm": 0.38723450899124146, + "learning_rate": 0.0001811514872532424, + "loss": 0.2418, + "step": 1969 + }, + { + "epoch": 0.39870471564460636, + "grad_norm": 0.303592711687088, + "learning_rate": 0.0001811328972177758, + "loss": 0.2282, + "step": 1970 + }, + { + "epoch": 0.3989071038251366, + "grad_norm": 1.0240917205810547, + "learning_rate": 0.000181114298974134, + "loss": 0.2696, + "step": 1971 + }, + { + "epoch": 0.39910949200566687, + "grad_norm": 0.2980985641479492, + "learning_rate": 0.0001810956925241986, + "loss": 0.2568, + "step": 1972 + }, + { + "epoch": 0.3993118801861971, + "grad_norm": 0.34745925664901733, + "learning_rate": 0.0001810770778698519, + "loss": 0.2424, + "step": 1973 + }, + { + "epoch": 0.3995142683667274, + "grad_norm": 0.4588114321231842, + "learning_rate": 0.0001810584550129772, + "loss": 0.2843, + "step": 1974 + }, + { + "epoch": 0.39971665654725763, + "grad_norm": 0.407896488904953, + "learning_rate": 0.00018103982395545855, + "loss": 0.2606, + "step": 1975 + }, + { + "epoch": 0.3999190447277879, + "grad_norm": 0.41062796115875244, + "learning_rate": 0.00018102118469918085, + "loss": 0.2513, + "step": 1976 + }, + { + "epoch": 0.40012143290831814, + "grad_norm": 0.30710867047309875, + "learning_rate": 0.00018100253724602988, + "loss": 0.2434, + "step": 1977 + }, + { + "epoch": 0.4003238210888484, + "grad_norm": 0.3533878028392792, + "learning_rate": 0.0001809838815978921, + "loss": 0.2552, + "step": 1978 + }, + { + "epoch": 0.40052620926937865, + "grad_norm": 0.442613810300827, + "learning_rate": 0.00018096521775665494, + "loss": 0.2837, + "step": 1979 + }, + { + "epoch": 0.4007285974499089, + "grad_norm": 0.42794176936149597, + "learning_rate": 0.0001809465457242066, + "loss": 0.3078, + "step": 1980 + }, + { + "epoch": 0.40093098563043916, + "grad_norm": 0.41974005103111267, + "learning_rate": 0.00018092786550243613, + "loss": 0.2543, + "step": 1981 + }, + { + "epoch": 0.4011333738109694, + "grad_norm": 0.34940439462661743, + "learning_rate": 0.00018090917709323337, + "loss": 0.2334, + "step": 1982 + }, + { + "epoch": 0.4013357619914997, + "grad_norm": 0.39555102586746216, + "learning_rate": 0.0001808904804984891, + "loss": 0.2469, + "step": 1983 + }, + { + "epoch": 0.40153815017202993, + "grad_norm": 0.4463624656200409, + "learning_rate": 0.00018087177572009475, + "loss": 0.2559, + "step": 1984 + }, + { + "epoch": 0.4017405383525602, + "grad_norm": 0.3513191342353821, + "learning_rate": 0.00018085306275994272, + "loss": 0.2247, + "step": 1985 + }, + { + "epoch": 0.4019429265330905, + "grad_norm": 0.38531386852264404, + "learning_rate": 0.00018083434161992616, + "loss": 0.2478, + "step": 1986 + }, + { + "epoch": 0.40214531471362075, + "grad_norm": 0.34127408266067505, + "learning_rate": 0.00018081561230193913, + "loss": 0.2543, + "step": 1987 + }, + { + "epoch": 0.402347702894151, + "grad_norm": 0.2705632746219635, + "learning_rate": 0.00018079687480787642, + "loss": 0.1989, + "step": 1988 + }, + { + "epoch": 0.40255009107468126, + "grad_norm": 0.3331010043621063, + "learning_rate": 0.00018077812913963373, + "loss": 0.203, + "step": 1989 + }, + { + "epoch": 0.4027524792552115, + "grad_norm": 1.0123331546783447, + "learning_rate": 0.0001807593752991075, + "loss": 0.2963, + "step": 1990 + }, + { + "epoch": 0.40295486743574177, + "grad_norm": 0.46535953879356384, + "learning_rate": 0.00018074061328819508, + "loss": 0.2546, + "step": 1991 + }, + { + "epoch": 0.403157255616272, + "grad_norm": 0.2680860459804535, + "learning_rate": 0.00018072184310879462, + "loss": 0.2216, + "step": 1992 + }, + { + "epoch": 0.4033596437968023, + "grad_norm": 0.32052627205848694, + "learning_rate": 0.00018070306476280508, + "loss": 0.234, + "step": 1993 + }, + { + "epoch": 0.40356203197733254, + "grad_norm": 0.37091732025146484, + "learning_rate": 0.00018068427825212625, + "loss": 0.2721, + "step": 1994 + }, + { + "epoch": 0.4037644201578628, + "grad_norm": 0.37389910221099854, + "learning_rate": 0.00018066548357865874, + "loss": 0.2477, + "step": 1995 + }, + { + "epoch": 0.40396680833839305, + "grad_norm": 0.3704164922237396, + "learning_rate": 0.00018064668074430404, + "loss": 0.2748, + "step": 1996 + }, + { + "epoch": 0.4041691965189233, + "grad_norm": 0.45123517513275146, + "learning_rate": 0.0001806278697509644, + "loss": 0.2488, + "step": 1997 + }, + { + "epoch": 0.40437158469945356, + "grad_norm": 0.3480460047721863, + "learning_rate": 0.00018060905060054289, + "loss": 0.2332, + "step": 1998 + }, + { + "epoch": 0.4045739728799838, + "grad_norm": 0.35321128368377686, + "learning_rate": 0.0001805902232949435, + "loss": 0.2777, + "step": 1999 + }, + { + "epoch": 0.40477636106051407, + "grad_norm": 0.45171496272087097, + "learning_rate": 0.0001805713878360709, + "loss": 0.2721, + "step": 2000 + }, + { + "epoch": 0.40477636106051407, + "eval_loss": 0.28785374760627747, + "eval_runtime": 0.7406, + "eval_samples_per_second": 6.751, + "eval_steps_per_second": 1.35, + "step": 2000 + }, + { + "epoch": 0.4049787492410443, + "grad_norm": 0.42943838238716125, + "learning_rate": 0.00018055254422583074, + "loss": 0.2848, + "step": 2001 + }, + { + "epoch": 0.4051811374215746, + "grad_norm": 0.32872962951660156, + "learning_rate": 0.00018053369246612936, + "loss": 0.2504, + "step": 2002 + }, + { + "epoch": 0.40538352560210483, + "grad_norm": 0.3315301537513733, + "learning_rate": 0.00018051483255887403, + "loss": 0.2573, + "step": 2003 + }, + { + "epoch": 0.4055859137826351, + "grad_norm": 0.4020966589450836, + "learning_rate": 0.00018049596450597278, + "loss": 0.3004, + "step": 2004 + }, + { + "epoch": 0.40578830196316534, + "grad_norm": 0.3283519148826599, + "learning_rate": 0.00018047708830933444, + "loss": 0.2762, + "step": 2005 + }, + { + "epoch": 0.4059906901436956, + "grad_norm": 0.32964780926704407, + "learning_rate": 0.00018045820397086875, + "loss": 0.2672, + "step": 2006 + }, + { + "epoch": 0.40619307832422585, + "grad_norm": 0.42414024472236633, + "learning_rate": 0.00018043931149248622, + "loss": 0.2627, + "step": 2007 + }, + { + "epoch": 0.4063954665047561, + "grad_norm": 0.26213163137435913, + "learning_rate": 0.0001804204108760982, + "loss": 0.2386, + "step": 2008 + }, + { + "epoch": 0.40659785468528636, + "grad_norm": 0.28232935070991516, + "learning_rate": 0.00018040150212361687, + "loss": 0.2458, + "step": 2009 + }, + { + "epoch": 0.4068002428658166, + "grad_norm": 0.5508570075035095, + "learning_rate": 0.00018038258523695518, + "loss": 0.285, + "step": 2010 + }, + { + "epoch": 0.4070026310463469, + "grad_norm": 0.275995135307312, + "learning_rate": 0.00018036366021802693, + "loss": 0.2674, + "step": 2011 + }, + { + "epoch": 0.40720501922687713, + "grad_norm": 0.3155290186405182, + "learning_rate": 0.00018034472706874682, + "loss": 0.2601, + "step": 2012 + }, + { + "epoch": 0.4074074074074074, + "grad_norm": 0.32950180768966675, + "learning_rate": 0.00018032578579103029, + "loss": 0.2688, + "step": 2013 + }, + { + "epoch": 0.40760979558793764, + "grad_norm": 0.3493838608264923, + "learning_rate": 0.00018030683638679354, + "loss": 0.2314, + "step": 2014 + }, + { + "epoch": 0.4078121837684679, + "grad_norm": 0.3500867486000061, + "learning_rate": 0.00018028787885795378, + "loss": 0.2472, + "step": 2015 + }, + { + "epoch": 0.4080145719489982, + "grad_norm": 0.528800368309021, + "learning_rate": 0.00018026891320642888, + "loss": 0.237, + "step": 2016 + }, + { + "epoch": 0.40821696012952846, + "grad_norm": 0.5186204314231873, + "learning_rate": 0.0001802499394341376, + "loss": 0.2233, + "step": 2017 + }, + { + "epoch": 0.4084193483100587, + "grad_norm": 0.3000638484954834, + "learning_rate": 0.0001802309575429995, + "loss": 0.2269, + "step": 2018 + }, + { + "epoch": 0.40862173649058897, + "grad_norm": 0.33724966645240784, + "learning_rate": 0.00018021196753493496, + "loss": 0.2404, + "step": 2019 + }, + { + "epoch": 0.4088241246711192, + "grad_norm": 0.41299405694007874, + "learning_rate": 0.00018019296941186523, + "loss": 0.2333, + "step": 2020 + }, + { + "epoch": 0.4090265128516495, + "grad_norm": 0.33299964666366577, + "learning_rate": 0.00018017396317571228, + "loss": 0.2553, + "step": 2021 + }, + { + "epoch": 0.40922890103217974, + "grad_norm": 0.3150463402271271, + "learning_rate": 0.00018015494882839898, + "loss": 0.2884, + "step": 2022 + }, + { + "epoch": 0.40943128921271, + "grad_norm": 0.31459060311317444, + "learning_rate": 0.00018013592637184904, + "loss": 0.2183, + "step": 2023 + }, + { + "epoch": 0.40963367739324025, + "grad_norm": 0.45589613914489746, + "learning_rate": 0.00018011689580798695, + "loss": 0.2286, + "step": 2024 + }, + { + "epoch": 0.4098360655737705, + "grad_norm": 0.2886069416999817, + "learning_rate": 0.00018009785713873794, + "loss": 0.2158, + "step": 2025 + }, + { + "epoch": 0.41003845375430076, + "grad_norm": 0.444607138633728, + "learning_rate": 0.00018007881036602823, + "loss": 0.2677, + "step": 2026 + }, + { + "epoch": 0.410240841934831, + "grad_norm": 0.3646940290927887, + "learning_rate": 0.00018005975549178476, + "loss": 0.2754, + "step": 2027 + }, + { + "epoch": 0.41044323011536127, + "grad_norm": 0.2616381347179413, + "learning_rate": 0.00018004069251793524, + "loss": 0.2218, + "step": 2028 + }, + { + "epoch": 0.4106456182958915, + "grad_norm": 0.42195913195610046, + "learning_rate": 0.00018002162144640837, + "loss": 0.2607, + "step": 2029 + }, + { + "epoch": 0.4108480064764218, + "grad_norm": 0.3766661286354065, + "learning_rate": 0.00018000254227913348, + "loss": 0.2566, + "step": 2030 + }, + { + "epoch": 0.41105039465695203, + "grad_norm": 0.3197172284126282, + "learning_rate": 0.00017998345501804078, + "loss": 0.2309, + "step": 2031 + }, + { + "epoch": 0.4112527828374823, + "grad_norm": 0.29483306407928467, + "learning_rate": 0.0001799643596650614, + "loss": 0.2379, + "step": 2032 + }, + { + "epoch": 0.41145517101801254, + "grad_norm": 0.2921574115753174, + "learning_rate": 0.00017994525622212713, + "loss": 0.2266, + "step": 2033 + }, + { + "epoch": 0.4116575591985428, + "grad_norm": 0.2598446011543274, + "learning_rate": 0.00017992614469117073, + "loss": 0.2166, + "step": 2034 + }, + { + "epoch": 0.41185994737907305, + "grad_norm": 0.3084995150566101, + "learning_rate": 0.00017990702507412565, + "loss": 0.2619, + "step": 2035 + }, + { + "epoch": 0.4120623355596033, + "grad_norm": 0.4501848816871643, + "learning_rate": 0.0001798878973729262, + "loss": 0.265, + "step": 2036 + }, + { + "epoch": 0.41226472374013357, + "grad_norm": 0.4476367235183716, + "learning_rate": 0.0001798687615895076, + "loss": 0.2292, + "step": 2037 + }, + { + "epoch": 0.4124671119206638, + "grad_norm": 0.3379128873348236, + "learning_rate": 0.00017984961772580572, + "loss": 0.2448, + "step": 2038 + }, + { + "epoch": 0.4126695001011941, + "grad_norm": 0.3354593813419342, + "learning_rate": 0.00017983046578375737, + "loss": 0.2529, + "step": 2039 + }, + { + "epoch": 0.41287188828172433, + "grad_norm": 0.377542644739151, + "learning_rate": 0.0001798113057653002, + "loss": 0.2685, + "step": 2040 + }, + { + "epoch": 0.4130742764622546, + "grad_norm": 0.3194352984428406, + "learning_rate": 0.0001797921376723725, + "loss": 0.2338, + "step": 2041 + }, + { + "epoch": 0.41327666464278484, + "grad_norm": 0.3700343072414398, + "learning_rate": 0.00017977296150691356, + "loss": 0.2454, + "step": 2042 + }, + { + "epoch": 0.4134790528233151, + "grad_norm": 0.5020773410797119, + "learning_rate": 0.00017975377727086347, + "loss": 0.2591, + "step": 2043 + }, + { + "epoch": 0.41368144100384535, + "grad_norm": 0.37990298867225647, + "learning_rate": 0.000179734584966163, + "loss": 0.2663, + "step": 2044 + }, + { + "epoch": 0.4138838291843756, + "grad_norm": 0.3366142213344574, + "learning_rate": 0.00017971538459475388, + "loss": 0.2282, + "step": 2045 + }, + { + "epoch": 0.41408621736490586, + "grad_norm": 0.328730970621109, + "learning_rate": 0.00017969617615857858, + "loss": 0.2102, + "step": 2046 + }, + { + "epoch": 0.4142886055454362, + "grad_norm": 0.3981629014015198, + "learning_rate": 0.00017967695965958046, + "loss": 0.2718, + "step": 2047 + }, + { + "epoch": 0.41449099372596643, + "grad_norm": 0.43636375665664673, + "learning_rate": 0.00017965773509970355, + "loss": 0.2602, + "step": 2048 + }, + { + "epoch": 0.4146933819064967, + "grad_norm": 0.5631572604179382, + "learning_rate": 0.00017963850248089286, + "loss": 0.2629, + "step": 2049 + }, + { + "epoch": 0.41489577008702694, + "grad_norm": 0.4168608486652374, + "learning_rate": 0.00017961926180509415, + "loss": 0.2745, + "step": 2050 + }, + { + "epoch": 0.41489577008702694, + "eval_loss": 0.2666853070259094, + "eval_runtime": 0.7389, + "eval_samples_per_second": 6.767, + "eval_steps_per_second": 1.353, + "step": 2050 + }, + { + "epoch": 0.4150981582675572, + "grad_norm": 0.35379037261009216, + "learning_rate": 0.00017960001307425395, + "loss": 0.2712, + "step": 2051 + }, + { + "epoch": 0.41530054644808745, + "grad_norm": 0.31800132989883423, + "learning_rate": 0.00017958075629031966, + "loss": 0.2797, + "step": 2052 + }, + { + "epoch": 0.4155029346286177, + "grad_norm": 0.3469184935092926, + "learning_rate": 0.00017956149145523947, + "loss": 0.2749, + "step": 2053 + }, + { + "epoch": 0.41570532280914796, + "grad_norm": 0.38023021817207336, + "learning_rate": 0.00017954221857096242, + "loss": 0.2419, + "step": 2054 + }, + { + "epoch": 0.4159077109896782, + "grad_norm": 0.32849445939064026, + "learning_rate": 0.0001795229376394383, + "loss": 0.2509, + "step": 2055 + }, + { + "epoch": 0.41611009917020847, + "grad_norm": 0.36450543999671936, + "learning_rate": 0.0001795036486626178, + "loss": 0.2962, + "step": 2056 + }, + { + "epoch": 0.4163124873507387, + "grad_norm": 0.3229864537715912, + "learning_rate": 0.00017948435164245234, + "loss": 0.2398, + "step": 2057 + }, + { + "epoch": 0.416514875531269, + "grad_norm": 0.2975095510482788, + "learning_rate": 0.00017946504658089422, + "loss": 0.2286, + "step": 2058 + }, + { + "epoch": 0.41671726371179924, + "grad_norm": 0.3617817759513855, + "learning_rate": 0.00017944573347989645, + "loss": 0.2797, + "step": 2059 + }, + { + "epoch": 0.4169196518923295, + "grad_norm": 0.466573566198349, + "learning_rate": 0.00017942641234141302, + "loss": 0.2173, + "step": 2060 + }, + { + "epoch": 0.41712204007285975, + "grad_norm": 0.29917478561401367, + "learning_rate": 0.0001794070831673986, + "loss": 0.2679, + "step": 2061 + }, + { + "epoch": 0.41732442825339, + "grad_norm": 0.36867937445640564, + "learning_rate": 0.00017938774595980872, + "loss": 0.2855, + "step": 2062 + }, + { + "epoch": 0.41752681643392026, + "grad_norm": 0.27075591683387756, + "learning_rate": 0.0001793684007205997, + "loss": 0.2606, + "step": 2063 + }, + { + "epoch": 0.4177292046144505, + "grad_norm": 0.3169737756252289, + "learning_rate": 0.00017934904745172872, + "loss": 0.2165, + "step": 2064 + }, + { + "epoch": 0.41793159279498077, + "grad_norm": 0.292100191116333, + "learning_rate": 0.00017932968615515365, + "loss": 0.2559, + "step": 2065 + }, + { + "epoch": 0.418133980975511, + "grad_norm": 0.4538803696632385, + "learning_rate": 0.0001793103168328334, + "loss": 0.2537, + "step": 2066 + }, + { + "epoch": 0.4183363691560413, + "grad_norm": 0.5876901745796204, + "learning_rate": 0.00017929093948672748, + "loss": 0.2414, + "step": 2067 + }, + { + "epoch": 0.41853875733657153, + "grad_norm": 0.8129988312721252, + "learning_rate": 0.00017927155411879628, + "loss": 0.2363, + "step": 2068 + }, + { + "epoch": 0.4187411455171018, + "grad_norm": 0.45293712615966797, + "learning_rate": 0.00017925216073100102, + "loss": 0.2158, + "step": 2069 + }, + { + "epoch": 0.41894353369763204, + "grad_norm": 0.28010156750679016, + "learning_rate": 0.00017923275932530373, + "loss": 0.2707, + "step": 2070 + }, + { + "epoch": 0.4191459218781623, + "grad_norm": 0.34166184067726135, + "learning_rate": 0.00017921334990366722, + "loss": 0.2437, + "step": 2071 + }, + { + "epoch": 0.41934831005869255, + "grad_norm": 0.3896978199481964, + "learning_rate": 0.00017919393246805513, + "loss": 0.2076, + "step": 2072 + }, + { + "epoch": 0.4195506982392228, + "grad_norm": 0.6046048402786255, + "learning_rate": 0.00017917450702043195, + "loss": 0.2931, + "step": 2073 + }, + { + "epoch": 0.41975308641975306, + "grad_norm": 0.3625839948654175, + "learning_rate": 0.0001791550735627629, + "loss": 0.217, + "step": 2074 + }, + { + "epoch": 0.4199554746002833, + "grad_norm": 0.37617138028144836, + "learning_rate": 0.00017913563209701408, + "loss": 0.2681, + "step": 2075 + }, + { + "epoch": 0.4201578627808136, + "grad_norm": 0.40031054615974426, + "learning_rate": 0.00017911618262515238, + "loss": 0.2421, + "step": 2076 + }, + { + "epoch": 0.4203602509613439, + "grad_norm": 0.40487319231033325, + "learning_rate": 0.00017909672514914546, + "loss": 0.2537, + "step": 2077 + }, + { + "epoch": 0.42056263914187414, + "grad_norm": 0.39844515919685364, + "learning_rate": 0.00017907725967096182, + "loss": 0.2642, + "step": 2078 + }, + { + "epoch": 0.4207650273224044, + "grad_norm": 0.3935433626174927, + "learning_rate": 0.00017905778619257086, + "loss": 0.2726, + "step": 2079 + }, + { + "epoch": 0.42096741550293465, + "grad_norm": 0.3426590859889984, + "learning_rate": 0.00017903830471594257, + "loss": 0.2064, + "step": 2080 + }, + { + "epoch": 0.4211698036834649, + "grad_norm": 0.3915655314922333, + "learning_rate": 0.000179018815243048, + "loss": 0.2312, + "step": 2081 + }, + { + "epoch": 0.42137219186399516, + "grad_norm": 0.470255970954895, + "learning_rate": 0.00017899931777585882, + "loss": 0.2045, + "step": 2082 + }, + { + "epoch": 0.4215745800445254, + "grad_norm": 0.3128467798233032, + "learning_rate": 0.00017897981231634758, + "loss": 0.2497, + "step": 2083 + }, + { + "epoch": 0.42177696822505567, + "grad_norm": 0.31205683946609497, + "learning_rate": 0.00017896029886648766, + "loss": 0.23, + "step": 2084 + }, + { + "epoch": 0.4219793564055859, + "grad_norm": 0.40596142411231995, + "learning_rate": 0.00017894077742825325, + "loss": 0.2658, + "step": 2085 + }, + { + "epoch": 0.4221817445861162, + "grad_norm": 0.2824687957763672, + "learning_rate": 0.00017892124800361926, + "loss": 0.232, + "step": 2086 + }, + { + "epoch": 0.42238413276664644, + "grad_norm": 0.4363657236099243, + "learning_rate": 0.00017890171059456155, + "loss": 0.2639, + "step": 2087 + }, + { + "epoch": 0.4225865209471767, + "grad_norm": 0.5147897601127625, + "learning_rate": 0.0001788821652030566, + "loss": 0.2583, + "step": 2088 + }, + { + "epoch": 0.42278890912770695, + "grad_norm": 0.3708350658416748, + "learning_rate": 0.00017886261183108193, + "loss": 0.2607, + "step": 2089 + }, + { + "epoch": 0.4229912973082372, + "grad_norm": 0.28474700450897217, + "learning_rate": 0.00017884305048061568, + "loss": 0.2268, + "step": 2090 + }, + { + "epoch": 0.42319368548876746, + "grad_norm": 0.3953563868999481, + "learning_rate": 0.0001788234811536369, + "loss": 0.2706, + "step": 2091 + }, + { + "epoch": 0.4233960736692977, + "grad_norm": 0.2877426743507385, + "learning_rate": 0.00017880390385212534, + "loss": 0.2428, + "step": 2092 + }, + { + "epoch": 0.42359846184982797, + "grad_norm": 0.2851710319519043, + "learning_rate": 0.0001787843185780617, + "loss": 0.237, + "step": 2093 + }, + { + "epoch": 0.4238008500303582, + "grad_norm": 0.3443615734577179, + "learning_rate": 0.00017876472533342734, + "loss": 0.2469, + "step": 2094 + }, + { + "epoch": 0.4240032382108885, + "grad_norm": 0.2852996289730072, + "learning_rate": 0.00017874512412020458, + "loss": 0.2365, + "step": 2095 + }, + { + "epoch": 0.42420562639141873, + "grad_norm": 0.3117428719997406, + "learning_rate": 0.0001787255149403764, + "loss": 0.2594, + "step": 2096 + }, + { + "epoch": 0.424408014571949, + "grad_norm": 0.32744285464286804, + "learning_rate": 0.0001787058977959267, + "loss": 0.2664, + "step": 2097 + }, + { + "epoch": 0.42461040275247924, + "grad_norm": 0.4537400007247925, + "learning_rate": 0.00017868627268884007, + "loss": 0.2401, + "step": 2098 + }, + { + "epoch": 0.4248127909330095, + "grad_norm": 0.3057894706726074, + "learning_rate": 0.00017866663962110203, + "loss": 0.2438, + "step": 2099 + }, + { + "epoch": 0.42501517911353975, + "grad_norm": 0.45849281549453735, + "learning_rate": 0.00017864699859469887, + "loss": 0.2275, + "step": 2100 + }, + { + "epoch": 0.42501517911353975, + "eval_loss": 0.2693714201450348, + "eval_runtime": 0.7384, + "eval_samples_per_second": 6.771, + "eval_steps_per_second": 1.354, + "step": 2100 + }, + { + "epoch": 0.42521756729407, + "grad_norm": 0.45448294281959534, + "learning_rate": 0.0001786273496116176, + "loss": 0.2915, + "step": 2101 + }, + { + "epoch": 0.42541995547460026, + "grad_norm": 0.36509305238723755, + "learning_rate": 0.0001786076926738461, + "loss": 0.2425, + "step": 2102 + }, + { + "epoch": 0.4256223436551305, + "grad_norm": 0.33645161986351013, + "learning_rate": 0.00017858802778337313, + "loss": 0.2239, + "step": 2103 + }, + { + "epoch": 0.4258247318356608, + "grad_norm": 0.31513741612434387, + "learning_rate": 0.0001785683549421881, + "loss": 0.2711, + "step": 2104 + }, + { + "epoch": 0.42602712001619103, + "grad_norm": 0.3990754783153534, + "learning_rate": 0.0001785486741522813, + "loss": 0.2172, + "step": 2105 + }, + { + "epoch": 0.4262295081967213, + "grad_norm": 0.34632688760757446, + "learning_rate": 0.00017852898541564387, + "loss": 0.2578, + "step": 2106 + }, + { + "epoch": 0.4264318963772516, + "grad_norm": 0.3364465832710266, + "learning_rate": 0.0001785092887342677, + "loss": 0.2357, + "step": 2107 + }, + { + "epoch": 0.42663428455778185, + "grad_norm": 0.3629503548145294, + "learning_rate": 0.0001784895841101455, + "loss": 0.2397, + "step": 2108 + }, + { + "epoch": 0.4268366727383121, + "grad_norm": 0.34377363324165344, + "learning_rate": 0.00017846987154527072, + "loss": 0.2656, + "step": 2109 + }, + { + "epoch": 0.42703906091884236, + "grad_norm": 0.2795635461807251, + "learning_rate": 0.00017845015104163775, + "loss": 0.2153, + "step": 2110 + }, + { + "epoch": 0.4272414490993726, + "grad_norm": 0.3671300411224365, + "learning_rate": 0.0001784304226012416, + "loss": 0.2438, + "step": 2111 + }, + { + "epoch": 0.42744383727990287, + "grad_norm": 0.3172593414783478, + "learning_rate": 0.00017841068622607832, + "loss": 0.2426, + "step": 2112 + }, + { + "epoch": 0.4276462254604331, + "grad_norm": 0.39299485087394714, + "learning_rate": 0.00017839094191814453, + "loss": 0.2618, + "step": 2113 + }, + { + "epoch": 0.4278486136409634, + "grad_norm": 0.44122424721717834, + "learning_rate": 0.00017837118967943782, + "loss": 0.281, + "step": 2114 + }, + { + "epoch": 0.42805100182149364, + "grad_norm": 0.2724647521972656, + "learning_rate": 0.00017835142951195642, + "loss": 0.25, + "step": 2115 + }, + { + "epoch": 0.4282533900020239, + "grad_norm": 0.2748100161552429, + "learning_rate": 0.00017833166141769958, + "loss": 0.2344, + "step": 2116 + }, + { + "epoch": 0.42845577818255415, + "grad_norm": 0.3923550844192505, + "learning_rate": 0.00017831188539866712, + "loss": 0.2644, + "step": 2117 + }, + { + "epoch": 0.4286581663630844, + "grad_norm": 0.34341961145401, + "learning_rate": 0.00017829210145685982, + "loss": 0.2778, + "step": 2118 + }, + { + "epoch": 0.42886055454361466, + "grad_norm": 0.3682166635990143, + "learning_rate": 0.00017827230959427919, + "loss": 0.2611, + "step": 2119 + }, + { + "epoch": 0.4290629427241449, + "grad_norm": 0.3362092971801758, + "learning_rate": 0.0001782525098129276, + "loss": 0.2391, + "step": 2120 + }, + { + "epoch": 0.42926533090467517, + "grad_norm": 0.33998823165893555, + "learning_rate": 0.00017823270211480817, + "loss": 0.2168, + "step": 2121 + }, + { + "epoch": 0.4294677190852054, + "grad_norm": 0.30269312858581543, + "learning_rate": 0.00017821288650192481, + "loss": 0.2279, + "step": 2122 + }, + { + "epoch": 0.4296701072657357, + "grad_norm": 0.4033576250076294, + "learning_rate": 0.00017819306297628225, + "loss": 0.2308, + "step": 2123 + }, + { + "epoch": 0.42987249544626593, + "grad_norm": 0.2952467203140259, + "learning_rate": 0.00017817323153988606, + "loss": 0.2606, + "step": 2124 + }, + { + "epoch": 0.4300748836267962, + "grad_norm": 0.3474556505680084, + "learning_rate": 0.0001781533921947426, + "loss": 0.2605, + "step": 2125 + }, + { + "epoch": 0.43027727180732644, + "grad_norm": 0.30628398060798645, + "learning_rate": 0.00017813354494285896, + "loss": 0.2623, + "step": 2126 + }, + { + "epoch": 0.4304796599878567, + "grad_norm": 0.26096710562705994, + "learning_rate": 0.00017811368978624305, + "loss": 0.2445, + "step": 2127 + }, + { + "epoch": 0.43068204816838696, + "grad_norm": 0.32468295097351074, + "learning_rate": 0.00017809382672690367, + "loss": 0.2433, + "step": 2128 + }, + { + "epoch": 0.4308844363489172, + "grad_norm": 0.3501925468444824, + "learning_rate": 0.00017807395576685035, + "loss": 0.2197, + "step": 2129 + }, + { + "epoch": 0.43108682452944747, + "grad_norm": 0.335664838552475, + "learning_rate": 0.0001780540769080934, + "loss": 0.2278, + "step": 2130 + }, + { + "epoch": 0.4312892127099777, + "grad_norm": 0.5295902490615845, + "learning_rate": 0.00017803419015264394, + "loss": 0.2839, + "step": 2131 + }, + { + "epoch": 0.431491600890508, + "grad_norm": 0.5748984217643738, + "learning_rate": 0.00017801429550251392, + "loss": 0.2706, + "step": 2132 + }, + { + "epoch": 0.43169398907103823, + "grad_norm": 0.3495369553565979, + "learning_rate": 0.0001779943929597161, + "loss": 0.2406, + "step": 2133 + }, + { + "epoch": 0.4318963772515685, + "grad_norm": 0.419474333524704, + "learning_rate": 0.00017797448252626397, + "loss": 0.2339, + "step": 2134 + }, + { + "epoch": 0.43209876543209874, + "grad_norm": 0.3343390226364136, + "learning_rate": 0.00017795456420417188, + "loss": 0.2389, + "step": 2135 + }, + { + "epoch": 0.432301153612629, + "grad_norm": 0.41015762090682983, + "learning_rate": 0.00017793463799545495, + "loss": 0.2492, + "step": 2136 + }, + { + "epoch": 0.43250354179315925, + "grad_norm": 0.28410017490386963, + "learning_rate": 0.0001779147039021291, + "loss": 0.233, + "step": 2137 + }, + { + "epoch": 0.43270592997368956, + "grad_norm": 0.3253934979438782, + "learning_rate": 0.00017789476192621106, + "loss": 0.2742, + "step": 2138 + }, + { + "epoch": 0.4329083181542198, + "grad_norm": 0.29073867201805115, + "learning_rate": 0.00017787481206971837, + "loss": 0.2235, + "step": 2139 + }, + { + "epoch": 0.4331107063347501, + "grad_norm": 0.31343135237693787, + "learning_rate": 0.0001778548543346693, + "loss": 0.2774, + "step": 2140 + }, + { + "epoch": 0.43331309451528033, + "grad_norm": 0.31908923387527466, + "learning_rate": 0.000177834888723083, + "loss": 0.264, + "step": 2141 + }, + { + "epoch": 0.4335154826958106, + "grad_norm": 0.25621846318244934, + "learning_rate": 0.00017781491523697937, + "loss": 0.2283, + "step": 2142 + }, + { + "epoch": 0.43371787087634084, + "grad_norm": 0.47303399443626404, + "learning_rate": 0.00017779493387837914, + "loss": 0.2955, + "step": 2143 + }, + { + "epoch": 0.4339202590568711, + "grad_norm": 0.604739248752594, + "learning_rate": 0.00017777494464930378, + "loss": 0.2493, + "step": 2144 + }, + { + "epoch": 0.43412264723740135, + "grad_norm": 0.31334736943244934, + "learning_rate": 0.0001777549475517756, + "loss": 0.2581, + "step": 2145 + }, + { + "epoch": 0.4343250354179316, + "grad_norm": 0.2978392243385315, + "learning_rate": 0.0001777349425878177, + "loss": 0.2616, + "step": 2146 + }, + { + "epoch": 0.43452742359846186, + "grad_norm": 0.39638951420783997, + "learning_rate": 0.00017771492975945396, + "loss": 0.2632, + "step": 2147 + }, + { + "epoch": 0.4347298117789921, + "grad_norm": 0.32750117778778076, + "learning_rate": 0.00017769490906870909, + "loss": 0.2686, + "step": 2148 + }, + { + "epoch": 0.43493219995952237, + "grad_norm": 0.3518666625022888, + "learning_rate": 0.00017767488051760857, + "loss": 0.2759, + "step": 2149 + }, + { + "epoch": 0.4351345881400526, + "grad_norm": 0.3919273912906647, + "learning_rate": 0.00017765484410817866, + "loss": 0.2458, + "step": 2150 + }, + { + "epoch": 0.4351345881400526, + "eval_loss": 0.27276766300201416, + "eval_runtime": 0.7375, + "eval_samples_per_second": 6.779, + "eval_steps_per_second": 1.356, + "step": 2150 + }, + { + "epoch": 0.4353369763205829, + "grad_norm": 0.312533438205719, + "learning_rate": 0.00017763479984244645, + "loss": 0.2375, + "step": 2151 + }, + { + "epoch": 0.43553936450111314, + "grad_norm": 0.441134512424469, + "learning_rate": 0.00017761474772243983, + "loss": 0.2061, + "step": 2152 + }, + { + "epoch": 0.4357417526816434, + "grad_norm": 0.36551475524902344, + "learning_rate": 0.00017759468775018742, + "loss": 0.2307, + "step": 2153 + }, + { + "epoch": 0.43594414086217365, + "grad_norm": 0.35309749841690063, + "learning_rate": 0.00017757461992771867, + "loss": 0.2429, + "step": 2154 + }, + { + "epoch": 0.4361465290427039, + "grad_norm": 0.2728305757045746, + "learning_rate": 0.00017755454425706388, + "loss": 0.2372, + "step": 2155 + }, + { + "epoch": 0.43634891722323416, + "grad_norm": 0.3202503025531769, + "learning_rate": 0.00017753446074025408, + "loss": 0.2282, + "step": 2156 + }, + { + "epoch": 0.4365513054037644, + "grad_norm": 0.36079493165016174, + "learning_rate": 0.00017751436937932108, + "loss": 0.2268, + "step": 2157 + }, + { + "epoch": 0.43675369358429467, + "grad_norm": 0.3829249441623688, + "learning_rate": 0.00017749427017629756, + "loss": 0.2148, + "step": 2158 + }, + { + "epoch": 0.4369560817648249, + "grad_norm": 0.39769890904426575, + "learning_rate": 0.0001774741631332169, + "loss": 0.275, + "step": 2159 + }, + { + "epoch": 0.4371584699453552, + "grad_norm": 0.3986724615097046, + "learning_rate": 0.00017745404825211336, + "loss": 0.2757, + "step": 2160 + }, + { + "epoch": 0.43736085812588543, + "grad_norm": 0.30949562788009644, + "learning_rate": 0.00017743392553502192, + "loss": 0.2538, + "step": 2161 + }, + { + "epoch": 0.4375632463064157, + "grad_norm": 0.2870640754699707, + "learning_rate": 0.0001774137949839784, + "loss": 0.2631, + "step": 2162 + }, + { + "epoch": 0.43776563448694594, + "grad_norm": 0.3055521249771118, + "learning_rate": 0.0001773936566010194, + "loss": 0.2433, + "step": 2163 + }, + { + "epoch": 0.4379680226674762, + "grad_norm": 0.2762567400932312, + "learning_rate": 0.0001773735103881823, + "loss": 0.261, + "step": 2164 + }, + { + "epoch": 0.43817041084800645, + "grad_norm": 0.2516343891620636, + "learning_rate": 0.00017735335634750532, + "loss": 0.2424, + "step": 2165 + }, + { + "epoch": 0.4383727990285367, + "grad_norm": 0.29465994238853455, + "learning_rate": 0.0001773331944810274, + "loss": 0.2296, + "step": 2166 + }, + { + "epoch": 0.43857518720906696, + "grad_norm": 0.24924996495246887, + "learning_rate": 0.00017731302479078828, + "loss": 0.2154, + "step": 2167 + }, + { + "epoch": 0.4387775753895973, + "grad_norm": 0.46507659554481506, + "learning_rate": 0.00017729284727882857, + "loss": 0.2635, + "step": 2168 + }, + { + "epoch": 0.43897996357012753, + "grad_norm": 0.3225403428077698, + "learning_rate": 0.0001772726619471896, + "loss": 0.2381, + "step": 2169 + }, + { + "epoch": 0.4391823517506578, + "grad_norm": 0.32947832345962524, + "learning_rate": 0.0001772524687979135, + "loss": 0.2426, + "step": 2170 + }, + { + "epoch": 0.43938473993118804, + "grad_norm": 0.29286065697669983, + "learning_rate": 0.0001772322678330432, + "loss": 0.2385, + "step": 2171 + }, + { + "epoch": 0.4395871281117183, + "grad_norm": 0.40721961855888367, + "learning_rate": 0.0001772120590546224, + "loss": 0.2746, + "step": 2172 + }, + { + "epoch": 0.43978951629224855, + "grad_norm": 0.2583456337451935, + "learning_rate": 0.0001771918424646957, + "loss": 0.2177, + "step": 2173 + }, + { + "epoch": 0.4399919044727788, + "grad_norm": 0.3217853903770447, + "learning_rate": 0.00017717161806530833, + "loss": 0.2292, + "step": 2174 + }, + { + "epoch": 0.44019429265330906, + "grad_norm": 0.32177701592445374, + "learning_rate": 0.00017715138585850637, + "loss": 0.2568, + "step": 2175 + }, + { + "epoch": 0.4403966808338393, + "grad_norm": 0.3047245740890503, + "learning_rate": 0.00017713114584633674, + "loss": 0.2221, + "step": 2176 + }, + { + "epoch": 0.44059906901436957, + "grad_norm": 0.43895235657691956, + "learning_rate": 0.00017711089803084713, + "loss": 0.2433, + "step": 2177 + }, + { + "epoch": 0.4408014571948998, + "grad_norm": 0.3079501688480377, + "learning_rate": 0.00017709064241408593, + "loss": 0.2418, + "step": 2178 + }, + { + "epoch": 0.4410038453754301, + "grad_norm": 0.3755057752132416, + "learning_rate": 0.00017707037899810247, + "loss": 0.2585, + "step": 2179 + }, + { + "epoch": 0.44120623355596034, + "grad_norm": 0.3147794306278229, + "learning_rate": 0.00017705010778494673, + "loss": 0.2534, + "step": 2180 + }, + { + "epoch": 0.4414086217364906, + "grad_norm": 0.37766438722610474, + "learning_rate": 0.00017702982877666957, + "loss": 0.252, + "step": 2181 + }, + { + "epoch": 0.44161100991702085, + "grad_norm": 0.36453086137771606, + "learning_rate": 0.0001770095419753226, + "loss": 0.258, + "step": 2182 + }, + { + "epoch": 0.4418133980975511, + "grad_norm": 0.2642430067062378, + "learning_rate": 0.0001769892473829582, + "loss": 0.2284, + "step": 2183 + }, + { + "epoch": 0.44201578627808136, + "grad_norm": 0.3428244888782501, + "learning_rate": 0.00017696894500162963, + "loss": 0.2241, + "step": 2184 + }, + { + "epoch": 0.4422181744586116, + "grad_norm": 0.32250645756721497, + "learning_rate": 0.0001769486348333908, + "loss": 0.2849, + "step": 2185 + }, + { + "epoch": 0.44242056263914187, + "grad_norm": 0.7376700043678284, + "learning_rate": 0.00017692831688029655, + "loss": 0.2689, + "step": 2186 + }, + { + "epoch": 0.4426229508196721, + "grad_norm": 0.39100563526153564, + "learning_rate": 0.00017690799114440236, + "loss": 0.2506, + "step": 2187 + }, + { + "epoch": 0.4428253390002024, + "grad_norm": 0.3266545236110687, + "learning_rate": 0.00017688765762776464, + "loss": 0.2703, + "step": 2188 + }, + { + "epoch": 0.44302772718073263, + "grad_norm": 0.391176700592041, + "learning_rate": 0.00017686731633244045, + "loss": 0.2712, + "step": 2189 + }, + { + "epoch": 0.4432301153612629, + "grad_norm": 0.30557316541671753, + "learning_rate": 0.00017684696726048778, + "loss": 0.2413, + "step": 2190 + }, + { + "epoch": 0.44343250354179314, + "grad_norm": 0.30448246002197266, + "learning_rate": 0.00017682661041396532, + "loss": 0.2763, + "step": 2191 + }, + { + "epoch": 0.4436348917223234, + "grad_norm": 0.38532236218452454, + "learning_rate": 0.00017680624579493253, + "loss": 0.2951, + "step": 2192 + }, + { + "epoch": 0.44383727990285365, + "grad_norm": 0.29205942153930664, + "learning_rate": 0.0001767858734054497, + "loss": 0.2443, + "step": 2193 + }, + { + "epoch": 0.4440396680833839, + "grad_norm": 0.3226570188999176, + "learning_rate": 0.00017676549324757793, + "loss": 0.2426, + "step": 2194 + }, + { + "epoch": 0.44424205626391416, + "grad_norm": 0.3055272400379181, + "learning_rate": 0.00017674510532337905, + "loss": 0.2766, + "step": 2195 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.3346841335296631, + "learning_rate": 0.00017672470963491567, + "loss": 0.2532, + "step": 2196 + }, + { + "epoch": 0.4446468326249747, + "grad_norm": 0.36648398637771606, + "learning_rate": 0.00017670430618425123, + "loss": 0.2539, + "step": 2197 + }, + { + "epoch": 0.444849220805505, + "grad_norm": 0.25422319769859314, + "learning_rate": 0.00017668389497344997, + "loss": 0.2566, + "step": 2198 + }, + { + "epoch": 0.44505160898603524, + "grad_norm": 0.3028642237186432, + "learning_rate": 0.00017666347600457685, + "loss": 0.2367, + "step": 2199 + }, + { + "epoch": 0.4452539971665655, + "grad_norm": 0.4553399085998535, + "learning_rate": 0.0001766430492796976, + "loss": 0.2821, + "step": 2200 + }, + { + "epoch": 0.4452539971665655, + "eval_loss": 0.270407497882843, + "eval_runtime": 0.7397, + "eval_samples_per_second": 6.76, + "eval_steps_per_second": 1.352, + "step": 2200 + }, + { + "epoch": 0.44545638534709575, + "grad_norm": 0.28558349609375, + "learning_rate": 0.00017662261480087886, + "loss": 0.2549, + "step": 2201 + }, + { + "epoch": 0.445658773527626, + "grad_norm": 0.32957684993743896, + "learning_rate": 0.00017660217257018794, + "loss": 0.2448, + "step": 2202 + }, + { + "epoch": 0.44586116170815626, + "grad_norm": 0.4825969934463501, + "learning_rate": 0.00017658172258969298, + "loss": 0.2355, + "step": 2203 + }, + { + "epoch": 0.4460635498886865, + "grad_norm": 0.2827821969985962, + "learning_rate": 0.00017656126486146291, + "loss": 0.2386, + "step": 2204 + }, + { + "epoch": 0.44626593806921677, + "grad_norm": 0.3403480350971222, + "learning_rate": 0.0001765407993875674, + "loss": 0.2504, + "step": 2205 + }, + { + "epoch": 0.446468326249747, + "grad_norm": 0.2827328145503998, + "learning_rate": 0.00017652032617007692, + "loss": 0.2382, + "step": 2206 + }, + { + "epoch": 0.4466707144302773, + "grad_norm": 0.3415543735027313, + "learning_rate": 0.0001764998452110628, + "loss": 0.2456, + "step": 2207 + }, + { + "epoch": 0.44687310261080754, + "grad_norm": 0.3274790644645691, + "learning_rate": 0.000176479356512597, + "loss": 0.25, + "step": 2208 + }, + { + "epoch": 0.4470754907913378, + "grad_norm": 0.27415189146995544, + "learning_rate": 0.0001764588600767524, + "loss": 0.2204, + "step": 2209 + }, + { + "epoch": 0.44727787897186805, + "grad_norm": 0.29590359330177307, + "learning_rate": 0.00017643835590560266, + "loss": 0.2628, + "step": 2210 + }, + { + "epoch": 0.4474802671523983, + "grad_norm": 0.3007771968841553, + "learning_rate": 0.00017641784400122208, + "loss": 0.2488, + "step": 2211 + }, + { + "epoch": 0.44768265533292856, + "grad_norm": 0.4987753927707672, + "learning_rate": 0.00017639732436568588, + "loss": 0.291, + "step": 2212 + }, + { + "epoch": 0.4478850435134588, + "grad_norm": 0.36342155933380127, + "learning_rate": 0.00017637679700107005, + "loss": 0.2815, + "step": 2213 + }, + { + "epoch": 0.44808743169398907, + "grad_norm": 0.3229970932006836, + "learning_rate": 0.0001763562619094513, + "loss": 0.2731, + "step": 2214 + }, + { + "epoch": 0.4482898198745193, + "grad_norm": 0.32807472348213196, + "learning_rate": 0.0001763357190929072, + "loss": 0.2226, + "step": 2215 + }, + { + "epoch": 0.4484922080550496, + "grad_norm": 0.4111528694629669, + "learning_rate": 0.000176315168553516, + "loss": 0.2287, + "step": 2216 + }, + { + "epoch": 0.44869459623557983, + "grad_norm": 0.2823032736778259, + "learning_rate": 0.0001762946102933568, + "loss": 0.2528, + "step": 2217 + }, + { + "epoch": 0.4488969844161101, + "grad_norm": 0.3747027814388275, + "learning_rate": 0.0001762740443145095, + "loss": 0.2593, + "step": 2218 + }, + { + "epoch": 0.44909937259664034, + "grad_norm": 0.3702718913555145, + "learning_rate": 0.00017625347061905476, + "loss": 0.2749, + "step": 2219 + }, + { + "epoch": 0.4493017607771706, + "grad_norm": 0.29596519470214844, + "learning_rate": 0.00017623288920907393, + "loss": 0.2209, + "step": 2220 + }, + { + "epoch": 0.44950414895770086, + "grad_norm": 0.32838091254234314, + "learning_rate": 0.0001762123000866493, + "loss": 0.2444, + "step": 2221 + }, + { + "epoch": 0.4497065371382311, + "grad_norm": 0.37883222103118896, + "learning_rate": 0.0001761917032538638, + "loss": 0.2952, + "step": 2222 + }, + { + "epoch": 0.44990892531876137, + "grad_norm": 0.37547382712364197, + "learning_rate": 0.00017617109871280126, + "loss": 0.2471, + "step": 2223 + }, + { + "epoch": 0.4501113134992916, + "grad_norm": 0.41560691595077515, + "learning_rate": 0.0001761504864655462, + "loss": 0.2471, + "step": 2224 + }, + { + "epoch": 0.4503137016798219, + "grad_norm": 0.41966041922569275, + "learning_rate": 0.00017612986651418397, + "loss": 0.2409, + "step": 2225 + }, + { + "epoch": 0.45051608986035213, + "grad_norm": 0.3438867926597595, + "learning_rate": 0.00017610923886080064, + "loss": 0.269, + "step": 2226 + }, + { + "epoch": 0.4507184780408824, + "grad_norm": 0.32733553647994995, + "learning_rate": 0.00017608860350748316, + "loss": 0.29, + "step": 2227 + }, + { + "epoch": 0.4509208662214127, + "grad_norm": 0.48707279562950134, + "learning_rate": 0.00017606796045631918, + "loss": 0.2627, + "step": 2228 + }, + { + "epoch": 0.45112325440194295, + "grad_norm": 0.29957085847854614, + "learning_rate": 0.0001760473097093971, + "loss": 0.2679, + "step": 2229 + }, + { + "epoch": 0.4513256425824732, + "grad_norm": 0.26773086190223694, + "learning_rate": 0.00017602665126880616, + "loss": 0.2451, + "step": 2230 + }, + { + "epoch": 0.45152803076300346, + "grad_norm": 0.3124980628490448, + "learning_rate": 0.00017600598513663643, + "loss": 0.2856, + "step": 2231 + }, + { + "epoch": 0.4517304189435337, + "grad_norm": 0.2624861001968384, + "learning_rate": 0.00017598531131497863, + "loss": 0.2523, + "step": 2232 + }, + { + "epoch": 0.451932807124064, + "grad_norm": 0.26772695779800415, + "learning_rate": 0.00017596462980592432, + "loss": 0.2321, + "step": 2233 + }, + { + "epoch": 0.45213519530459423, + "grad_norm": 0.3250735104084015, + "learning_rate": 0.00017594394061156584, + "loss": 0.2724, + "step": 2234 + }, + { + "epoch": 0.4523375834851245, + "grad_norm": 0.3289940059185028, + "learning_rate": 0.00017592324373399637, + "loss": 0.249, + "step": 2235 + }, + { + "epoch": 0.45253997166565474, + "grad_norm": 0.4136817455291748, + "learning_rate": 0.00017590253917530973, + "loss": 0.2712, + "step": 2236 + }, + { + "epoch": 0.452742359846185, + "grad_norm": 0.31983017921447754, + "learning_rate": 0.00017588182693760058, + "loss": 0.2446, + "step": 2237 + }, + { + "epoch": 0.45294474802671525, + "grad_norm": 0.3201993405818939, + "learning_rate": 0.00017586110702296447, + "loss": 0.2634, + "step": 2238 + }, + { + "epoch": 0.4531471362072455, + "grad_norm": 0.32686853408813477, + "learning_rate": 0.00017584037943349748, + "loss": 0.2521, + "step": 2239 + }, + { + "epoch": 0.45334952438777576, + "grad_norm": 0.3686739206314087, + "learning_rate": 0.0001758196441712967, + "loss": 0.2539, + "step": 2240 + }, + { + "epoch": 0.453551912568306, + "grad_norm": 0.36773359775543213, + "learning_rate": 0.00017579890123845993, + "loss": 0.2652, + "step": 2241 + }, + { + "epoch": 0.45375430074883627, + "grad_norm": 0.3526698350906372, + "learning_rate": 0.00017577815063708565, + "loss": 0.2548, + "step": 2242 + }, + { + "epoch": 0.4539566889293665, + "grad_norm": 0.301490843296051, + "learning_rate": 0.0001757573923692732, + "loss": 0.2734, + "step": 2243 + }, + { + "epoch": 0.4541590771098968, + "grad_norm": 0.26612064242362976, + "learning_rate": 0.00017573662643712276, + "loss": 0.2493, + "step": 2244 + }, + { + "epoch": 0.45436146529042704, + "grad_norm": 0.38326704502105713, + "learning_rate": 0.0001757158528427351, + "loss": 0.2493, + "step": 2245 + }, + { + "epoch": 0.4545638534709573, + "grad_norm": 0.34822413325309753, + "learning_rate": 0.00017569507158821197, + "loss": 0.2568, + "step": 2246 + }, + { + "epoch": 0.45476624165148755, + "grad_norm": 0.27359241247177124, + "learning_rate": 0.0001756742826756557, + "loss": 0.2329, + "step": 2247 + }, + { + "epoch": 0.4549686298320178, + "grad_norm": 0.28838682174682617, + "learning_rate": 0.0001756534861071696, + "loss": 0.2284, + "step": 2248 + }, + { + "epoch": 0.45517101801254806, + "grad_norm": 0.32817342877388, + "learning_rate": 0.00017563268188485758, + "loss": 0.2332, + "step": 2249 + }, + { + "epoch": 0.4553734061930783, + "grad_norm": 0.2918015122413635, + "learning_rate": 0.00017561187001082442, + "loss": 0.262, + "step": 2250 + }, + { + "epoch": 0.4553734061930783, + "eval_loss": 0.2709426283836365, + "eval_runtime": 0.7401, + "eval_samples_per_second": 6.756, + "eval_steps_per_second": 1.351, + "step": 2250 + }, + { + "epoch": 0.45557579437360857, + "grad_norm": 0.2817946970462799, + "learning_rate": 0.00017559105048717562, + "loss": 0.2314, + "step": 2251 + }, + { + "epoch": 0.4557781825541388, + "grad_norm": 0.3302723467350006, + "learning_rate": 0.0001755702233160175, + "loss": 0.2695, + "step": 2252 + }, + { + "epoch": 0.4559805707346691, + "grad_norm": 0.29945623874664307, + "learning_rate": 0.00017554938849945716, + "loss": 0.2501, + "step": 2253 + }, + { + "epoch": 0.45618295891519933, + "grad_norm": 0.2748788297176361, + "learning_rate": 0.0001755285460396024, + "loss": 0.2396, + "step": 2254 + }, + { + "epoch": 0.4563853470957296, + "grad_norm": 0.2754972577095032, + "learning_rate": 0.0001755076959385619, + "loss": 0.2274, + "step": 2255 + }, + { + "epoch": 0.45658773527625984, + "grad_norm": 0.34508219361305237, + "learning_rate": 0.00017548683819844496, + "loss": 0.2576, + "step": 2256 + }, + { + "epoch": 0.4567901234567901, + "grad_norm": 0.29633739590644836, + "learning_rate": 0.00017546597282136185, + "loss": 0.2708, + "step": 2257 + }, + { + "epoch": 0.45699251163732035, + "grad_norm": 0.27955976128578186, + "learning_rate": 0.00017544509980942346, + "loss": 0.2663, + "step": 2258 + }, + { + "epoch": 0.45719489981785066, + "grad_norm": 0.31975439190864563, + "learning_rate": 0.0001754242191647415, + "loss": 0.2182, + "step": 2259 + }, + { + "epoch": 0.4573972879983809, + "grad_norm": 0.3412756323814392, + "learning_rate": 0.00017540333088942846, + "loss": 0.2395, + "step": 2260 + }, + { + "epoch": 0.4575996761789112, + "grad_norm": 0.2998964786529541, + "learning_rate": 0.00017538243498559759, + "loss": 0.2522, + "step": 2261 + }, + { + "epoch": 0.45780206435944143, + "grad_norm": 0.665895938873291, + "learning_rate": 0.00017536153145536294, + "loss": 0.2933, + "step": 2262 + }, + { + "epoch": 0.4580044525399717, + "grad_norm": 0.3041466176509857, + "learning_rate": 0.0001753406203008393, + "loss": 0.2466, + "step": 2263 + }, + { + "epoch": 0.45820684072050194, + "grad_norm": 0.5185272097587585, + "learning_rate": 0.00017531970152414222, + "loss": 0.2446, + "step": 2264 + }, + { + "epoch": 0.4584092289010322, + "grad_norm": 0.2728477120399475, + "learning_rate": 0.00017529877512738806, + "loss": 0.2084, + "step": 2265 + }, + { + "epoch": 0.45861161708156245, + "grad_norm": 0.3017343282699585, + "learning_rate": 0.00017527784111269395, + "loss": 0.276, + "step": 2266 + }, + { + "epoch": 0.4588140052620927, + "grad_norm": 0.32620134949684143, + "learning_rate": 0.00017525689948217775, + "loss": 0.22, + "step": 2267 + }, + { + "epoch": 0.45901639344262296, + "grad_norm": 0.34241124987602234, + "learning_rate": 0.00017523595023795813, + "loss": 0.2587, + "step": 2268 + }, + { + "epoch": 0.4592187816231532, + "grad_norm": 0.4101926386356354, + "learning_rate": 0.00017521499338215454, + "loss": 0.2947, + "step": 2269 + }, + { + "epoch": 0.45942116980368347, + "grad_norm": 0.5489742755889893, + "learning_rate": 0.00017519402891688708, + "loss": 0.2311, + "step": 2270 + }, + { + "epoch": 0.4596235579842137, + "grad_norm": 0.3954886794090271, + "learning_rate": 0.00017517305684427677, + "loss": 0.2739, + "step": 2271 + }, + { + "epoch": 0.459825946164744, + "grad_norm": 0.3286878764629364, + "learning_rate": 0.00017515207716644539, + "loss": 0.2892, + "step": 2272 + }, + { + "epoch": 0.46002833434527424, + "grad_norm": 0.36722058057785034, + "learning_rate": 0.0001751310898855154, + "loss": 0.2398, + "step": 2273 + }, + { + "epoch": 0.4602307225258045, + "grad_norm": 0.30072125792503357, + "learning_rate": 0.0001751100950036101, + "loss": 0.2548, + "step": 2274 + }, + { + "epoch": 0.46043311070633475, + "grad_norm": 0.30294889211654663, + "learning_rate": 0.0001750890925228535, + "loss": 0.268, + "step": 2275 + }, + { + "epoch": 0.460635498886865, + "grad_norm": 0.28848111629486084, + "learning_rate": 0.00017506808244537037, + "loss": 0.2599, + "step": 2276 + }, + { + "epoch": 0.46083788706739526, + "grad_norm": 0.31934136152267456, + "learning_rate": 0.00017504706477328635, + "loss": 0.2682, + "step": 2277 + }, + { + "epoch": 0.4610402752479255, + "grad_norm": 0.29640549421310425, + "learning_rate": 0.0001750260395087278, + "loss": 0.2309, + "step": 2278 + }, + { + "epoch": 0.46124266342845577, + "grad_norm": 0.37822410464286804, + "learning_rate": 0.0001750050066538218, + "loss": 0.2387, + "step": 2279 + }, + { + "epoch": 0.461445051608986, + "grad_norm": 0.31998032331466675, + "learning_rate": 0.00017498396621069625, + "loss": 0.2472, + "step": 2280 + }, + { + "epoch": 0.4616474397895163, + "grad_norm": 0.4722261428833008, + "learning_rate": 0.00017496291818147982, + "loss": 0.251, + "step": 2281 + }, + { + "epoch": 0.46184982797004653, + "grad_norm": 0.36852967739105225, + "learning_rate": 0.00017494186256830188, + "loss": 0.2541, + "step": 2282 + }, + { + "epoch": 0.4620522161505768, + "grad_norm": 0.2886607348918915, + "learning_rate": 0.00017492079937329264, + "loss": 0.2552, + "step": 2283 + }, + { + "epoch": 0.46225460433110704, + "grad_norm": 0.3052384555339813, + "learning_rate": 0.00017489972859858306, + "loss": 0.2354, + "step": 2284 + }, + { + "epoch": 0.4624569925116373, + "grad_norm": 0.29837775230407715, + "learning_rate": 0.00017487865024630485, + "loss": 0.238, + "step": 2285 + }, + { + "epoch": 0.46265938069216755, + "grad_norm": 0.30569151043891907, + "learning_rate": 0.0001748575643185905, + "loss": 0.2933, + "step": 2286 + }, + { + "epoch": 0.4628617688726978, + "grad_norm": 0.3457178771495819, + "learning_rate": 0.0001748364708175733, + "loss": 0.2378, + "step": 2287 + }, + { + "epoch": 0.46306415705322806, + "grad_norm": 0.2781577706336975, + "learning_rate": 0.00017481536974538718, + "loss": 0.2398, + "step": 2288 + }, + { + "epoch": 0.4632665452337584, + "grad_norm": 0.3738473951816559, + "learning_rate": 0.000174794261104167, + "loss": 0.2376, + "step": 2289 + }, + { + "epoch": 0.46346893341428863, + "grad_norm": 0.3048925995826721, + "learning_rate": 0.0001747731448960483, + "loss": 0.2325, + "step": 2290 + }, + { + "epoch": 0.4636713215948189, + "grad_norm": 0.27175387740135193, + "learning_rate": 0.00017475202112316737, + "loss": 0.2007, + "step": 2291 + }, + { + "epoch": 0.46387370977534914, + "grad_norm": 0.26002347469329834, + "learning_rate": 0.0001747308897876613, + "loss": 0.227, + "step": 2292 + }, + { + "epoch": 0.4640760979558794, + "grad_norm": 0.33025866746902466, + "learning_rate": 0.00017470975089166793, + "loss": 0.2397, + "step": 2293 + }, + { + "epoch": 0.46427848613640965, + "grad_norm": 0.4511990249156952, + "learning_rate": 0.00017468860443732592, + "loss": 0.2607, + "step": 2294 + }, + { + "epoch": 0.4644808743169399, + "grad_norm": 0.3069620132446289, + "learning_rate": 0.0001746674504267746, + "loss": 0.2274, + "step": 2295 + }, + { + "epoch": 0.46468326249747016, + "grad_norm": 0.3227700889110565, + "learning_rate": 0.00017464628886215415, + "loss": 0.258, + "step": 2296 + }, + { + "epoch": 0.4648856506780004, + "grad_norm": 0.3729799687862396, + "learning_rate": 0.00017462511974560542, + "loss": 0.2519, + "step": 2297 + }, + { + "epoch": 0.4650880388585307, + "grad_norm": 0.3248327076435089, + "learning_rate": 0.0001746039430792701, + "loss": 0.2428, + "step": 2298 + }, + { + "epoch": 0.4652904270390609, + "grad_norm": 0.34883835911750793, + "learning_rate": 0.00017458275886529062, + "loss": 0.2611, + "step": 2299 + }, + { + "epoch": 0.4654928152195912, + "grad_norm": 0.3219202756881714, + "learning_rate": 0.0001745615671058102, + "loss": 0.2113, + "step": 2300 + }, + { + "epoch": 0.4654928152195912, + "eval_loss": 0.2663731575012207, + "eval_runtime": 0.7416, + "eval_samples_per_second": 6.742, + "eval_steps_per_second": 1.348, + "step": 2300 + }, + { + "epoch": 0.46569520340012144, + "grad_norm": 0.34709736704826355, + "learning_rate": 0.0001745403678029728, + "loss": 0.213, + "step": 2301 + }, + { + "epoch": 0.4658975915806517, + "grad_norm": 0.30065563321113586, + "learning_rate": 0.00017451916095892312, + "loss": 0.2546, + "step": 2302 + }, + { + "epoch": 0.46609997976118195, + "grad_norm": 0.35152703523635864, + "learning_rate": 0.00017449794657580664, + "loss": 0.2203, + "step": 2303 + }, + { + "epoch": 0.4663023679417122, + "grad_norm": 0.32369470596313477, + "learning_rate": 0.00017447672465576965, + "loss": 0.2377, + "step": 2304 + }, + { + "epoch": 0.46650475612224246, + "grad_norm": 0.2950402796268463, + "learning_rate": 0.0001744554952009591, + "loss": 0.2326, + "step": 2305 + }, + { + "epoch": 0.4667071443027727, + "grad_norm": 0.4171277582645416, + "learning_rate": 0.0001744342582135228, + "loss": 0.264, + "step": 2306 + }, + { + "epoch": 0.46690953248330297, + "grad_norm": 0.27577680349349976, + "learning_rate": 0.00017441301369560934, + "loss": 0.2414, + "step": 2307 + }, + { + "epoch": 0.4671119206638332, + "grad_norm": 0.4021974205970764, + "learning_rate": 0.0001743917616493679, + "loss": 0.27, + "step": 2308 + }, + { + "epoch": 0.4673143088443635, + "grad_norm": 0.32153424620628357, + "learning_rate": 0.00017437050207694865, + "loss": 0.2423, + "step": 2309 + }, + { + "epoch": 0.46751669702489373, + "grad_norm": 0.29214033484458923, + "learning_rate": 0.00017434923498050233, + "loss": 0.2429, + "step": 2310 + }, + { + "epoch": 0.467719085205424, + "grad_norm": 0.29247456789016724, + "learning_rate": 0.00017432796036218054, + "loss": 0.1918, + "step": 2311 + }, + { + "epoch": 0.46792147338595425, + "grad_norm": 0.332529217004776, + "learning_rate": 0.00017430667822413567, + "loss": 0.2558, + "step": 2312 + }, + { + "epoch": 0.4681238615664845, + "grad_norm": 0.2968290150165558, + "learning_rate": 0.00017428538856852077, + "loss": 0.2213, + "step": 2313 + }, + { + "epoch": 0.46832624974701476, + "grad_norm": 0.48056352138519287, + "learning_rate": 0.0001742640913974897, + "loss": 0.1798, + "step": 2314 + }, + { + "epoch": 0.468528637927545, + "grad_norm": 0.40288710594177246, + "learning_rate": 0.00017424278671319713, + "loss": 0.2906, + "step": 2315 + }, + { + "epoch": 0.46873102610807527, + "grad_norm": 0.37656524777412415, + "learning_rate": 0.00017422147451779844, + "loss": 0.2925, + "step": 2316 + }, + { + "epoch": 0.4689334142886055, + "grad_norm": 0.30084243416786194, + "learning_rate": 0.00017420015481344972, + "loss": 0.2435, + "step": 2317 + }, + { + "epoch": 0.4691358024691358, + "grad_norm": 0.25637879967689514, + "learning_rate": 0.00017417882760230793, + "loss": 0.218, + "step": 2318 + }, + { + "epoch": 0.4693381906496661, + "grad_norm": 0.3462231159210205, + "learning_rate": 0.00017415749288653072, + "loss": 0.2463, + "step": 2319 + }, + { + "epoch": 0.46954057883019634, + "grad_norm": 0.31451940536499023, + "learning_rate": 0.0001741361506682765, + "loss": 0.2761, + "step": 2320 + }, + { + "epoch": 0.4697429670107266, + "grad_norm": 0.468211829662323, + "learning_rate": 0.00017411480094970444, + "loss": 0.2239, + "step": 2321 + }, + { + "epoch": 0.46994535519125685, + "grad_norm": 0.3163670599460602, + "learning_rate": 0.00017409344373297452, + "loss": 0.2576, + "step": 2322 + }, + { + "epoch": 0.4701477433717871, + "grad_norm": 0.2964145839214325, + "learning_rate": 0.00017407207902024737, + "loss": 0.2495, + "step": 2323 + }, + { + "epoch": 0.47035013155231736, + "grad_norm": 0.3863260746002197, + "learning_rate": 0.00017405070681368457, + "loss": 0.2539, + "step": 2324 + }, + { + "epoch": 0.4705525197328476, + "grad_norm": 0.30294448137283325, + "learning_rate": 0.0001740293271154482, + "loss": 0.2599, + "step": 2325 + }, + { + "epoch": 0.4707549079133779, + "grad_norm": 0.3313601016998291, + "learning_rate": 0.0001740079399277013, + "loss": 0.258, + "step": 2326 + }, + { + "epoch": 0.47095729609390813, + "grad_norm": 0.3621000051498413, + "learning_rate": 0.00017398654525260763, + "loss": 0.238, + "step": 2327 + }, + { + "epoch": 0.4711596842744384, + "grad_norm": 0.36232197284698486, + "learning_rate": 0.0001739651430923316, + "loss": 0.2701, + "step": 2328 + }, + { + "epoch": 0.47136207245496864, + "grad_norm": 0.34678053855895996, + "learning_rate": 0.00017394373344903853, + "loss": 0.2305, + "step": 2329 + }, + { + "epoch": 0.4715644606354989, + "grad_norm": 0.29713359475135803, + "learning_rate": 0.00017392231632489439, + "loss": 0.2345, + "step": 2330 + }, + { + "epoch": 0.47176684881602915, + "grad_norm": 0.3039023280143738, + "learning_rate": 0.00017390089172206592, + "loss": 0.2383, + "step": 2331 + }, + { + "epoch": 0.4719692369965594, + "grad_norm": 0.264533668756485, + "learning_rate": 0.0001738794596427207, + "loss": 0.2304, + "step": 2332 + }, + { + "epoch": 0.47217162517708966, + "grad_norm": 0.272783100605011, + "learning_rate": 0.00017385802008902692, + "loss": 0.2483, + "step": 2333 + }, + { + "epoch": 0.4723740133576199, + "grad_norm": 0.28291958570480347, + "learning_rate": 0.00017383657306315367, + "loss": 0.2146, + "step": 2334 + }, + { + "epoch": 0.47257640153815017, + "grad_norm": 0.3644791841506958, + "learning_rate": 0.00017381511856727068, + "loss": 0.2394, + "step": 2335 + }, + { + "epoch": 0.4727787897186804, + "grad_norm": 0.35561367869377136, + "learning_rate": 0.00017379365660354857, + "loss": 0.2201, + "step": 2336 + }, + { + "epoch": 0.4729811778992107, + "grad_norm": 0.30612918734550476, + "learning_rate": 0.00017377218717415857, + "loss": 0.228, + "step": 2337 + }, + { + "epoch": 0.47318356607974094, + "grad_norm": 0.4196929931640625, + "learning_rate": 0.00017375071028127276, + "loss": 0.2628, + "step": 2338 + }, + { + "epoch": 0.4733859542602712, + "grad_norm": 0.3164200782775879, + "learning_rate": 0.00017372922592706397, + "loss": 0.2569, + "step": 2339 + }, + { + "epoch": 0.47358834244080145, + "grad_norm": 0.3175007700920105, + "learning_rate": 0.00017370773411370572, + "loss": 0.2318, + "step": 2340 + }, + { + "epoch": 0.4737907306213317, + "grad_norm": 0.3435089588165283, + "learning_rate": 0.00017368623484337233, + "loss": 0.2441, + "step": 2341 + }, + { + "epoch": 0.47399311880186196, + "grad_norm": 0.2463127225637436, + "learning_rate": 0.00017366472811823888, + "loss": 0.2352, + "step": 2342 + }, + { + "epoch": 0.4741955069823922, + "grad_norm": 0.4272097051143646, + "learning_rate": 0.00017364321394048118, + "loss": 0.2282, + "step": 2343 + }, + { + "epoch": 0.47439789516292247, + "grad_norm": 0.45957881212234497, + "learning_rate": 0.0001736216923122758, + "loss": 0.2859, + "step": 2344 + }, + { + "epoch": 0.4746002833434527, + "grad_norm": 0.36976000666618347, + "learning_rate": 0.00017360016323580014, + "loss": 0.1975, + "step": 2345 + }, + { + "epoch": 0.474802671523983, + "grad_norm": 0.287800669670105, + "learning_rate": 0.00017357862671323225, + "loss": 0.2337, + "step": 2346 + }, + { + "epoch": 0.47500505970451323, + "grad_norm": 0.36222925782203674, + "learning_rate": 0.00017355708274675093, + "loss": 0.2198, + "step": 2347 + }, + { + "epoch": 0.4752074478850435, + "grad_norm": 0.3630425035953522, + "learning_rate": 0.00017353553133853583, + "loss": 0.246, + "step": 2348 + }, + { + "epoch": 0.47540983606557374, + "grad_norm": 0.35259371995925903, + "learning_rate": 0.00017351397249076725, + "loss": 0.2544, + "step": 2349 + }, + { + "epoch": 0.47561222424610405, + "grad_norm": 0.43726104497909546, + "learning_rate": 0.00017349240620562632, + "loss": 0.1957, + "step": 2350 + }, + { + "epoch": 0.47561222424610405, + "eval_loss": 0.27133333683013916, + "eval_runtime": 0.7399, + "eval_samples_per_second": 6.757, + "eval_steps_per_second": 1.351, + "step": 2350 + }, + { + "epoch": 0.4758146124266343, + "grad_norm": 0.3979310691356659, + "learning_rate": 0.00017347083248529484, + "loss": 0.2692, + "step": 2351 + }, + { + "epoch": 0.47601700060716456, + "grad_norm": 0.3271339237689972, + "learning_rate": 0.00017344925133195552, + "loss": 0.2406, + "step": 2352 + }, + { + "epoch": 0.4762193887876948, + "grad_norm": 0.2918234169483185, + "learning_rate": 0.00017342766274779157, + "loss": 0.212, + "step": 2353 + }, + { + "epoch": 0.4764217769682251, + "grad_norm": 0.28919458389282227, + "learning_rate": 0.00017340606673498722, + "loss": 0.2696, + "step": 2354 + }, + { + "epoch": 0.47662416514875533, + "grad_norm": 0.3120443522930145, + "learning_rate": 0.00017338446329572723, + "loss": 0.228, + "step": 2355 + }, + { + "epoch": 0.4768265533292856, + "grad_norm": 0.28186094760894775, + "learning_rate": 0.00017336285243219732, + "loss": 0.2372, + "step": 2356 + }, + { + "epoch": 0.47702894150981584, + "grad_norm": 0.29053986072540283, + "learning_rate": 0.00017334123414658377, + "loss": 0.2802, + "step": 2357 + }, + { + "epoch": 0.4772313296903461, + "grad_norm": 0.36644598841667175, + "learning_rate": 0.00017331960844107369, + "loss": 0.2704, + "step": 2358 + }, + { + "epoch": 0.47743371787087635, + "grad_norm": 0.37110552191734314, + "learning_rate": 0.00017329797531785495, + "loss": 0.2791, + "step": 2359 + }, + { + "epoch": 0.4776361060514066, + "grad_norm": 0.2938483953475952, + "learning_rate": 0.0001732763347791162, + "loss": 0.2663, + "step": 2360 + }, + { + "epoch": 0.47783849423193686, + "grad_norm": 0.27444276213645935, + "learning_rate": 0.00017325468682704678, + "loss": 0.2137, + "step": 2361 + }, + { + "epoch": 0.4780408824124671, + "grad_norm": 0.30051189661026, + "learning_rate": 0.0001732330314638368, + "loss": 0.2213, + "step": 2362 + }, + { + "epoch": 0.47824327059299737, + "grad_norm": 0.38845130801200867, + "learning_rate": 0.00017321136869167712, + "loss": 0.2404, + "step": 2363 + }, + { + "epoch": 0.4784456587735276, + "grad_norm": 0.30238163471221924, + "learning_rate": 0.00017318969851275935, + "loss": 0.2617, + "step": 2364 + }, + { + "epoch": 0.4786480469540579, + "grad_norm": 0.2759285867214203, + "learning_rate": 0.00017316802092927586, + "loss": 0.2482, + "step": 2365 + }, + { + "epoch": 0.47885043513458814, + "grad_norm": 0.27550262212753296, + "learning_rate": 0.00017314633594341973, + "loss": 0.2125, + "step": 2366 + }, + { + "epoch": 0.4790528233151184, + "grad_norm": 0.33247604966163635, + "learning_rate": 0.00017312464355738488, + "loss": 0.2808, + "step": 2367 + }, + { + "epoch": 0.47925521149564865, + "grad_norm": 0.3416070342063904, + "learning_rate": 0.00017310294377336587, + "loss": 0.2645, + "step": 2368 + }, + { + "epoch": 0.4794575996761789, + "grad_norm": 0.3552078604698181, + "learning_rate": 0.00017308123659355804, + "loss": 0.28, + "step": 2369 + }, + { + "epoch": 0.47965998785670916, + "grad_norm": 0.2823755443096161, + "learning_rate": 0.00017305952202015755, + "loss": 0.2343, + "step": 2370 + }, + { + "epoch": 0.4798623760372394, + "grad_norm": 0.31195518374443054, + "learning_rate": 0.00017303780005536123, + "loss": 0.2446, + "step": 2371 + }, + { + "epoch": 0.48006476421776967, + "grad_norm": 0.3804989755153656, + "learning_rate": 0.0001730160707013667, + "loss": 0.274, + "step": 2372 + }, + { + "epoch": 0.4802671523982999, + "grad_norm": 0.2393997609615326, + "learning_rate": 0.00017299433396037223, + "loss": 0.22, + "step": 2373 + }, + { + "epoch": 0.4804695405788302, + "grad_norm": 0.3902396261692047, + "learning_rate": 0.000172972589834577, + "loss": 0.2347, + "step": 2374 + }, + { + "epoch": 0.48067192875936043, + "grad_norm": 0.3158196210861206, + "learning_rate": 0.00017295083832618083, + "loss": 0.2477, + "step": 2375 + }, + { + "epoch": 0.4808743169398907, + "grad_norm": 0.29111531376838684, + "learning_rate": 0.0001729290794373843, + "loss": 0.2271, + "step": 2376 + }, + { + "epoch": 0.48107670512042094, + "grad_norm": 0.28990867733955383, + "learning_rate": 0.00017290731317038874, + "loss": 0.24, + "step": 2377 + }, + { + "epoch": 0.4812790933009512, + "grad_norm": 0.3305584788322449, + "learning_rate": 0.00017288553952739627, + "loss": 0.2249, + "step": 2378 + }, + { + "epoch": 0.48148148148148145, + "grad_norm": 0.5030553340911865, + "learning_rate": 0.00017286375851060964, + "loss": 0.2334, + "step": 2379 + }, + { + "epoch": 0.48168386966201177, + "grad_norm": 0.267829567193985, + "learning_rate": 0.0001728419701222325, + "loss": 0.2448, + "step": 2380 + }, + { + "epoch": 0.481886257842542, + "grad_norm": 0.33796215057373047, + "learning_rate": 0.00017282017436446917, + "loss": 0.2248, + "step": 2381 + }, + { + "epoch": 0.4820886460230723, + "grad_norm": 0.29112884402275085, + "learning_rate": 0.00017279837123952466, + "loss": 0.2443, + "step": 2382 + }, + { + "epoch": 0.48229103420360253, + "grad_norm": 0.27587100863456726, + "learning_rate": 0.0001727765607496048, + "loss": 0.2304, + "step": 2383 + }, + { + "epoch": 0.4824934223841328, + "grad_norm": 0.4192744493484497, + "learning_rate": 0.0001727547428969162, + "loss": 0.2686, + "step": 2384 + }, + { + "epoch": 0.48269581056466304, + "grad_norm": 0.3476436138153076, + "learning_rate": 0.0001727329176836661, + "loss": 0.2703, + "step": 2385 + }, + { + "epoch": 0.4828981987451933, + "grad_norm": 0.28923192620277405, + "learning_rate": 0.0001727110851120626, + "loss": 0.2387, + "step": 2386 + }, + { + "epoch": 0.48310058692572355, + "grad_norm": 0.31386253237724304, + "learning_rate": 0.00017268924518431438, + "loss": 0.2524, + "step": 2387 + }, + { + "epoch": 0.4833029751062538, + "grad_norm": 0.28253522515296936, + "learning_rate": 0.0001726673979026311, + "loss": 0.2235, + "step": 2388 + }, + { + "epoch": 0.48350536328678406, + "grad_norm": 0.3415163457393646, + "learning_rate": 0.00017264554326922298, + "loss": 0.2787, + "step": 2389 + }, + { + "epoch": 0.4837077514673143, + "grad_norm": 0.3251768946647644, + "learning_rate": 0.00017262368128630106, + "loss": 0.2546, + "step": 2390 + }, + { + "epoch": 0.4839101396478446, + "grad_norm": 0.3223573863506317, + "learning_rate": 0.0001726018119560771, + "loss": 0.2708, + "step": 2391 + }, + { + "epoch": 0.48411252782837483, + "grad_norm": 0.30109891295433044, + "learning_rate": 0.0001725799352807636, + "loss": 0.2552, + "step": 2392 + }, + { + "epoch": 0.4843149160089051, + "grad_norm": 0.45630934834480286, + "learning_rate": 0.00017255805126257384, + "loss": 0.2713, + "step": 2393 + }, + { + "epoch": 0.48451730418943534, + "grad_norm": 0.3417312800884247, + "learning_rate": 0.00017253615990372176, + "loss": 0.294, + "step": 2394 + }, + { + "epoch": 0.4847196923699656, + "grad_norm": 0.3401585519313812, + "learning_rate": 0.00017251426120642216, + "loss": 0.2577, + "step": 2395 + }, + { + "epoch": 0.48492208055049585, + "grad_norm": 0.32333528995513916, + "learning_rate": 0.0001724923551728905, + "loss": 0.2418, + "step": 2396 + }, + { + "epoch": 0.4851244687310261, + "grad_norm": 0.24791204929351807, + "learning_rate": 0.000172470441805343, + "loss": 0.2308, + "step": 2397 + }, + { + "epoch": 0.48532685691155636, + "grad_norm": 0.26590603590011597, + "learning_rate": 0.0001724485211059966, + "loss": 0.2324, + "step": 2398 + }, + { + "epoch": 0.4855292450920866, + "grad_norm": 0.29200440645217896, + "learning_rate": 0.00017242659307706903, + "loss": 0.238, + "step": 2399 + }, + { + "epoch": 0.48573163327261687, + "grad_norm": 0.5061826109886169, + "learning_rate": 0.00017240465772077877, + "loss": 0.2549, + "step": 2400 + }, + { + "epoch": 0.48573163327261687, + "eval_loss": 0.26132091879844666, + "eval_runtime": 0.7361, + "eval_samples_per_second": 6.793, + "eval_steps_per_second": 1.359, + "step": 2400 + }, + { + "epoch": 0.4859340214531471, + "grad_norm": 0.3257822096347809, + "learning_rate": 0.00017238271503934493, + "loss": 0.2536, + "step": 2401 + }, + { + "epoch": 0.4861364096336774, + "grad_norm": 0.30192455649375916, + "learning_rate": 0.00017236076503498752, + "loss": 0.2279, + "step": 2402 + }, + { + "epoch": 0.48633879781420764, + "grad_norm": 0.3379042446613312, + "learning_rate": 0.00017233880770992717, + "loss": 0.27, + "step": 2403 + }, + { + "epoch": 0.4865411859947379, + "grad_norm": 0.3168107867240906, + "learning_rate": 0.00017231684306638528, + "loss": 0.2254, + "step": 2404 + }, + { + "epoch": 0.48674357417526815, + "grad_norm": 0.36106303334236145, + "learning_rate": 0.00017229487110658403, + "loss": 0.2457, + "step": 2405 + }, + { + "epoch": 0.4869459623557984, + "grad_norm": 0.26096561551094055, + "learning_rate": 0.0001722728918327463, + "loss": 0.2176, + "step": 2406 + }, + { + "epoch": 0.48714835053632866, + "grad_norm": 0.4154165983200073, + "learning_rate": 0.00017225090524709575, + "loss": 0.2241, + "step": 2407 + }, + { + "epoch": 0.4873507387168589, + "grad_norm": 0.32560572028160095, + "learning_rate": 0.0001722289113518567, + "loss": 0.2302, + "step": 2408 + }, + { + "epoch": 0.48755312689738917, + "grad_norm": 0.3593572676181793, + "learning_rate": 0.00017220691014925427, + "loss": 0.2349, + "step": 2409 + }, + { + "epoch": 0.4877555150779195, + "grad_norm": 0.4322805106639862, + "learning_rate": 0.00017218490164151438, + "loss": 0.2485, + "step": 2410 + }, + { + "epoch": 0.48795790325844973, + "grad_norm": 0.5639968514442444, + "learning_rate": 0.00017216288583086353, + "loss": 0.2509, + "step": 2411 + }, + { + "epoch": 0.48816029143898, + "grad_norm": 0.31465572118759155, + "learning_rate": 0.0001721408627195291, + "loss": 0.2316, + "step": 2412 + }, + { + "epoch": 0.48836267961951024, + "grad_norm": 0.29703637957572937, + "learning_rate": 0.00017211883230973916, + "loss": 0.2652, + "step": 2413 + }, + { + "epoch": 0.4885650678000405, + "grad_norm": 0.3234601616859436, + "learning_rate": 0.0001720967946037225, + "loss": 0.2635, + "step": 2414 + }, + { + "epoch": 0.48876745598057075, + "grad_norm": 0.3324073553085327, + "learning_rate": 0.00017207474960370865, + "loss": 0.233, + "step": 2415 + }, + { + "epoch": 0.488969844161101, + "grad_norm": 0.3848069906234741, + "learning_rate": 0.0001720526973119279, + "loss": 0.325, + "step": 2416 + }, + { + "epoch": 0.48917223234163126, + "grad_norm": 0.36358702182769775, + "learning_rate": 0.0001720306377306113, + "loss": 0.2692, + "step": 2417 + }, + { + "epoch": 0.4893746205221615, + "grad_norm": 0.2500711679458618, + "learning_rate": 0.00017200857086199057, + "loss": 0.2353, + "step": 2418 + }, + { + "epoch": 0.4895770087026918, + "grad_norm": 0.24969623982906342, + "learning_rate": 0.0001719864967082982, + "loss": 0.2451, + "step": 2419 + }, + { + "epoch": 0.48977939688322203, + "grad_norm": 0.2897688150405884, + "learning_rate": 0.00017196441527176748, + "loss": 0.2484, + "step": 2420 + }, + { + "epoch": 0.4899817850637523, + "grad_norm": 0.32977643609046936, + "learning_rate": 0.0001719423265546323, + "loss": 0.2536, + "step": 2421 + }, + { + "epoch": 0.49018417324428254, + "grad_norm": 0.286429226398468, + "learning_rate": 0.00017192023055912742, + "loss": 0.2234, + "step": 2422 + }, + { + "epoch": 0.4903865614248128, + "grad_norm": 0.31992316246032715, + "learning_rate": 0.00017189812728748828, + "loss": 0.2045, + "step": 2423 + }, + { + "epoch": 0.49058894960534305, + "grad_norm": 0.26441943645477295, + "learning_rate": 0.00017187601674195098, + "loss": 0.1897, + "step": 2424 + }, + { + "epoch": 0.4907913377858733, + "grad_norm": 0.27432650327682495, + "learning_rate": 0.00017185389892475256, + "loss": 0.2453, + "step": 2425 + }, + { + "epoch": 0.49099372596640356, + "grad_norm": 0.4113869071006775, + "learning_rate": 0.0001718317738381306, + "loss": 0.2395, + "step": 2426 + }, + { + "epoch": 0.4911961141469338, + "grad_norm": 0.40150976181030273, + "learning_rate": 0.0001718096414843234, + "loss": 0.2889, + "step": 2427 + }, + { + "epoch": 0.49139850232746407, + "grad_norm": 0.37498939037323, + "learning_rate": 0.00017178750186557025, + "loss": 0.2515, + "step": 2428 + }, + { + "epoch": 0.4916008905079943, + "grad_norm": 0.3221639096736908, + "learning_rate": 0.0001717653549841109, + "loss": 0.2316, + "step": 2429 + }, + { + "epoch": 0.4918032786885246, + "grad_norm": 0.3239342272281647, + "learning_rate": 0.00017174320084218593, + "loss": 0.294, + "step": 2430 + }, + { + "epoch": 0.49200566686905484, + "grad_norm": 0.3614806532859802, + "learning_rate": 0.00017172103944203672, + "loss": 0.2425, + "step": 2431 + }, + { + "epoch": 0.4922080550495851, + "grad_norm": 0.3906191885471344, + "learning_rate": 0.0001716988707859053, + "loss": 0.2704, + "step": 2432 + }, + { + "epoch": 0.49241044323011535, + "grad_norm": 0.35471323132514954, + "learning_rate": 0.00017167669487603443, + "loss": 0.256, + "step": 2433 + }, + { + "epoch": 0.4926128314106456, + "grad_norm": 0.34735792875289917, + "learning_rate": 0.0001716545117146677, + "loss": 0.2289, + "step": 2434 + }, + { + "epoch": 0.49281521959117586, + "grad_norm": 0.338459849357605, + "learning_rate": 0.00017163232130404932, + "loss": 0.257, + "step": 2435 + }, + { + "epoch": 0.4930176077717061, + "grad_norm": 0.29631951451301575, + "learning_rate": 0.0001716101236464243, + "loss": 0.2378, + "step": 2436 + }, + { + "epoch": 0.49321999595223637, + "grad_norm": 0.3412487506866455, + "learning_rate": 0.0001715879187440384, + "loss": 0.2671, + "step": 2437 + }, + { + "epoch": 0.4934223841327666, + "grad_norm": 0.30353328585624695, + "learning_rate": 0.000171565706599138, + "loss": 0.2573, + "step": 2438 + }, + { + "epoch": 0.4936247723132969, + "grad_norm": 0.3252297043800354, + "learning_rate": 0.00017154348721397033, + "loss": 0.2481, + "step": 2439 + }, + { + "epoch": 0.49382716049382713, + "grad_norm": 0.28343456983566284, + "learning_rate": 0.00017152126059078335, + "loss": 0.2594, + "step": 2440 + }, + { + "epoch": 0.49402954867435744, + "grad_norm": 0.32058560848236084, + "learning_rate": 0.0001714990267318257, + "loss": 0.2391, + "step": 2441 + }, + { + "epoch": 0.4942319368548877, + "grad_norm": 0.5241413116455078, + "learning_rate": 0.00017147678563934676, + "loss": 0.251, + "step": 2442 + }, + { + "epoch": 0.49443432503541795, + "grad_norm": 0.26299574971199036, + "learning_rate": 0.00017145453731559659, + "loss": 0.2592, + "step": 2443 + }, + { + "epoch": 0.4946367132159482, + "grad_norm": 0.532707691192627, + "learning_rate": 0.00017143228176282613, + "loss": 0.2871, + "step": 2444 + }, + { + "epoch": 0.49483910139647846, + "grad_norm": 0.34572750329971313, + "learning_rate": 0.00017141001898328693, + "loss": 0.2461, + "step": 2445 + }, + { + "epoch": 0.4950414895770087, + "grad_norm": 0.28672656416893005, + "learning_rate": 0.00017138774897923131, + "loss": 0.2246, + "step": 2446 + }, + { + "epoch": 0.495243877757539, + "grad_norm": 0.3325256407260895, + "learning_rate": 0.00017136547175291233, + "loss": 0.219, + "step": 2447 + }, + { + "epoch": 0.49544626593806923, + "grad_norm": 0.5237107872962952, + "learning_rate": 0.00017134318730658373, + "loss": 0.2892, + "step": 2448 + }, + { + "epoch": 0.4956486541185995, + "grad_norm": 0.31656259298324585, + "learning_rate": 0.00017132089564250003, + "loss": 0.2734, + "step": 2449 + }, + { + "epoch": 0.49585104229912974, + "grad_norm": 0.3295278549194336, + "learning_rate": 0.00017129859676291647, + "loss": 0.2296, + "step": 2450 + }, + { + "epoch": 0.49585104229912974, + "eval_loss": 0.2731389105319977, + "eval_runtime": 0.7382, + "eval_samples_per_second": 6.773, + "eval_steps_per_second": 1.355, + "step": 2450 + }, + { + "epoch": 0.49605343047966, + "grad_norm": 0.28645098209381104, + "learning_rate": 0.000171276290670089, + "loss": 0.2562, + "step": 2451 + }, + { + "epoch": 0.49625581866019025, + "grad_norm": 0.3923911154270172, + "learning_rate": 0.00017125397736627437, + "loss": 0.2886, + "step": 2452 + }, + { + "epoch": 0.4964582068407205, + "grad_norm": 0.3624133765697479, + "learning_rate": 0.00017123165685372995, + "loss": 0.2733, + "step": 2453 + }, + { + "epoch": 0.49666059502125076, + "grad_norm": 0.3099536895751953, + "learning_rate": 0.00017120932913471392, + "loss": 0.2276, + "step": 2454 + }, + { + "epoch": 0.496862983201781, + "grad_norm": 0.33509066700935364, + "learning_rate": 0.00017118699421148518, + "loss": 0.2628, + "step": 2455 + }, + { + "epoch": 0.49706537138231127, + "grad_norm": 0.3130567669868469, + "learning_rate": 0.00017116465208630327, + "loss": 0.2505, + "step": 2456 + }, + { + "epoch": 0.4972677595628415, + "grad_norm": 0.6522201895713806, + "learning_rate": 0.00017114230276142863, + "loss": 0.2666, + "step": 2457 + }, + { + "epoch": 0.4974701477433718, + "grad_norm": 0.2969781160354614, + "learning_rate": 0.00017111994623912228, + "loss": 0.2202, + "step": 2458 + }, + { + "epoch": 0.49767253592390204, + "grad_norm": 0.29119473695755005, + "learning_rate": 0.000171097582521646, + "loss": 0.2414, + "step": 2459 + }, + { + "epoch": 0.4978749241044323, + "grad_norm": 0.3184351325035095, + "learning_rate": 0.00017107521161126234, + "loss": 0.2583, + "step": 2460 + }, + { + "epoch": 0.49807731228496255, + "grad_norm": 0.35448184609413147, + "learning_rate": 0.0001710528335102346, + "loss": 0.227, + "step": 2461 + }, + { + "epoch": 0.4982797004654928, + "grad_norm": 0.2825421392917633, + "learning_rate": 0.00017103044822082666, + "loss": 0.1929, + "step": 2462 + }, + { + "epoch": 0.49848208864602306, + "grad_norm": 0.3474180996417999, + "learning_rate": 0.00017100805574530328, + "loss": 0.2645, + "step": 2463 + }, + { + "epoch": 0.4986844768265533, + "grad_norm": 0.30311545729637146, + "learning_rate": 0.00017098565608592993, + "loss": 0.2527, + "step": 2464 + }, + { + "epoch": 0.49888686500708357, + "grad_norm": 0.3159215748310089, + "learning_rate": 0.00017096324924497275, + "loss": 0.2153, + "step": 2465 + }, + { + "epoch": 0.4990892531876138, + "grad_norm": 0.2998165190219879, + "learning_rate": 0.00017094083522469858, + "loss": 0.2315, + "step": 2466 + }, + { + "epoch": 0.4992916413681441, + "grad_norm": 0.40242812037467957, + "learning_rate": 0.0001709184140273751, + "loss": 0.2216, + "step": 2467 + }, + { + "epoch": 0.49949402954867433, + "grad_norm": 0.39427369832992554, + "learning_rate": 0.00017089598565527063, + "loss": 0.2425, + "step": 2468 + }, + { + "epoch": 0.4996964177292046, + "grad_norm": 0.29181742668151855, + "learning_rate": 0.00017087355011065423, + "loss": 0.2357, + "step": 2469 + }, + { + "epoch": 0.49989880590973484, + "grad_norm": 0.2653137743473053, + "learning_rate": 0.00017085110739579567, + "loss": 0.2186, + "step": 2470 + }, + { + "epoch": 0.5001011940902651, + "grad_norm": 0.4047374725341797, + "learning_rate": 0.00017082865751296553, + "loss": 0.2876, + "step": 2471 + }, + { + "epoch": 0.5003035822707954, + "grad_norm": 0.2697608768939972, + "learning_rate": 0.00017080620046443503, + "loss": 0.224, + "step": 2472 + }, + { + "epoch": 0.5005059704513256, + "grad_norm": 0.3421246409416199, + "learning_rate": 0.0001707837362524761, + "loss": 0.2753, + "step": 2473 + }, + { + "epoch": 0.5007083586318559, + "grad_norm": 0.378449410200119, + "learning_rate": 0.00017076126487936146, + "loss": 0.2409, + "step": 2474 + }, + { + "epoch": 0.5009107468123861, + "grad_norm": 0.3779212534427643, + "learning_rate": 0.00017073878634736456, + "loss": 0.2712, + "step": 2475 + }, + { + "epoch": 0.5011131349929164, + "grad_norm": 0.3363097012042999, + "learning_rate": 0.0001707163006587595, + "loss": 0.2327, + "step": 2476 + }, + { + "epoch": 0.5013155231734466, + "grad_norm": 0.30967584252357483, + "learning_rate": 0.00017069380781582113, + "loss": 0.2203, + "step": 2477 + }, + { + "epoch": 0.5015179113539769, + "grad_norm": 0.32559165358543396, + "learning_rate": 0.00017067130782082507, + "loss": 0.2088, + "step": 2478 + }, + { + "epoch": 0.5017202995345071, + "grad_norm": 0.26994726061820984, + "learning_rate": 0.00017064880067604765, + "loss": 0.2497, + "step": 2479 + }, + { + "epoch": 0.5019226877150375, + "grad_norm": 0.38163644075393677, + "learning_rate": 0.0001706262863837659, + "loss": 0.2536, + "step": 2480 + }, + { + "epoch": 0.5021250758955677, + "grad_norm": 0.28695085644721985, + "learning_rate": 0.00017060376494625753, + "loss": 0.2414, + "step": 2481 + }, + { + "epoch": 0.502327464076098, + "grad_norm": 0.2939639389514923, + "learning_rate": 0.0001705812363658011, + "loss": 0.2208, + "step": 2482 + }, + { + "epoch": 0.5025298522566282, + "grad_norm": 0.29969316720962524, + "learning_rate": 0.00017055870064467573, + "loss": 0.2136, + "step": 2483 + }, + { + "epoch": 0.5027322404371585, + "grad_norm": 0.48427799344062805, + "learning_rate": 0.00017053615778516142, + "loss": 0.2368, + "step": 2484 + }, + { + "epoch": 0.5029346286176887, + "grad_norm": 0.37288084626197815, + "learning_rate": 0.0001705136077895388, + "loss": 0.2762, + "step": 2485 + }, + { + "epoch": 0.503137016798219, + "grad_norm": 0.3011093735694885, + "learning_rate": 0.00017049105066008923, + "loss": 0.2111, + "step": 2486 + }, + { + "epoch": 0.5033394049787493, + "grad_norm": 0.27865341305732727, + "learning_rate": 0.0001704684863990948, + "loss": 0.2527, + "step": 2487 + }, + { + "epoch": 0.5035417931592795, + "grad_norm": 0.3852303922176361, + "learning_rate": 0.00017044591500883834, + "loss": 0.2248, + "step": 2488 + }, + { + "epoch": 0.5037441813398098, + "grad_norm": 0.31749090552330017, + "learning_rate": 0.00017042333649160336, + "loss": 0.2468, + "step": 2489 + }, + { + "epoch": 0.50394656952034, + "grad_norm": 0.3167575001716614, + "learning_rate": 0.00017040075084967415, + "loss": 0.2337, + "step": 2490 + }, + { + "epoch": 0.5041489577008703, + "grad_norm": 0.3536628782749176, + "learning_rate": 0.00017037815808533568, + "loss": 0.2673, + "step": 2491 + }, + { + "epoch": 0.5043513458814005, + "grad_norm": 0.4146457314491272, + "learning_rate": 0.00017035555820087364, + "loss": 0.2238, + "step": 2492 + }, + { + "epoch": 0.5045537340619308, + "grad_norm": 0.32466617226600647, + "learning_rate": 0.00017033295119857448, + "loss": 0.2538, + "step": 2493 + }, + { + "epoch": 0.504756122242461, + "grad_norm": 0.2737172544002533, + "learning_rate": 0.00017031033708072527, + "loss": 0.2318, + "step": 2494 + }, + { + "epoch": 0.5049585104229913, + "grad_norm": 0.33758556842803955, + "learning_rate": 0.00017028771584961394, + "loss": 0.2248, + "step": 2495 + }, + { + "epoch": 0.5051608986035215, + "grad_norm": 0.2949804365634918, + "learning_rate": 0.00017026508750752904, + "loss": 0.2426, + "step": 2496 + }, + { + "epoch": 0.5053632867840518, + "grad_norm": 0.30609846115112305, + "learning_rate": 0.00017024245205675986, + "loss": 0.2452, + "step": 2497 + }, + { + "epoch": 0.505565674964582, + "grad_norm": 0.3072340488433838, + "learning_rate": 0.00017021980949959641, + "loss": 0.2238, + "step": 2498 + }, + { + "epoch": 0.5057680631451124, + "grad_norm": 0.3006766140460968, + "learning_rate": 0.0001701971598383295, + "loss": 0.2277, + "step": 2499 + }, + { + "epoch": 0.5059704513256426, + "grad_norm": 0.33413830399513245, + "learning_rate": 0.00017017450307525047, + "loss": 0.2794, + "step": 2500 + }, + { + "epoch": 0.5059704513256426, + "eval_loss": 0.2693020701408386, + "eval_runtime": 0.7363, + "eval_samples_per_second": 6.791, + "eval_steps_per_second": 1.358, + "step": 2500 + }, + { + "epoch": 0.5061728395061729, + "grad_norm": 0.35662609338760376, + "learning_rate": 0.00017015183921265158, + "loss": 0.2505, + "step": 2501 + }, + { + "epoch": 0.5063752276867031, + "grad_norm": 0.3591224253177643, + "learning_rate": 0.00017012916825282566, + "loss": 0.2682, + "step": 2502 + }, + { + "epoch": 0.5065776158672334, + "grad_norm": 0.5121231079101562, + "learning_rate": 0.00017010649019806638, + "loss": 0.2245, + "step": 2503 + }, + { + "epoch": 0.5067800040477636, + "grad_norm": 0.3819142282009125, + "learning_rate": 0.00017008380505066802, + "loss": 0.2537, + "step": 2504 + }, + { + "epoch": 0.5069823922282939, + "grad_norm": 0.29256799817085266, + "learning_rate": 0.0001700611128129257, + "loss": 0.2633, + "step": 2505 + }, + { + "epoch": 0.5071847804088241, + "grad_norm": 0.3312610685825348, + "learning_rate": 0.0001700384134871351, + "loss": 0.2593, + "step": 2506 + }, + { + "epoch": 0.5073871685893544, + "grad_norm": 0.303207129240036, + "learning_rate": 0.00017001570707559274, + "loss": 0.2476, + "step": 2507 + }, + { + "epoch": 0.5075895567698846, + "grad_norm": 0.2712869644165039, + "learning_rate": 0.00016999299358059575, + "loss": 0.2118, + "step": 2508 + }, + { + "epoch": 0.5077919449504149, + "grad_norm": 0.29760581254959106, + "learning_rate": 0.00016997027300444213, + "loss": 0.2539, + "step": 2509 + }, + { + "epoch": 0.5079943331309451, + "grad_norm": 0.2707705795764923, + "learning_rate": 0.00016994754534943048, + "loss": 0.2653, + "step": 2510 + }, + { + "epoch": 0.5081967213114754, + "grad_norm": 0.24326786398887634, + "learning_rate": 0.00016992481061786014, + "loss": 0.225, + "step": 2511 + }, + { + "epoch": 0.5083991094920056, + "grad_norm": 0.24204504489898682, + "learning_rate": 0.0001699020688120312, + "loss": 0.2092, + "step": 2512 + }, + { + "epoch": 0.5086014976725359, + "grad_norm": 0.27428555488586426, + "learning_rate": 0.00016987931993424438, + "loss": 0.2461, + "step": 2513 + }, + { + "epoch": 0.5088038858530661, + "grad_norm": 0.28147372603416443, + "learning_rate": 0.0001698565639868012, + "loss": 0.2386, + "step": 2514 + }, + { + "epoch": 0.5090062740335964, + "grad_norm": 0.2533692419528961, + "learning_rate": 0.0001698338009720039, + "loss": 0.2271, + "step": 2515 + }, + { + "epoch": 0.5092086622141268, + "grad_norm": 0.3866344094276428, + "learning_rate": 0.0001698110308921554, + "loss": 0.2464, + "step": 2516 + }, + { + "epoch": 0.509411050394657, + "grad_norm": 0.21789251267910004, + "learning_rate": 0.00016978825374955924, + "loss": 0.1852, + "step": 2517 + }, + { + "epoch": 0.5096134385751873, + "grad_norm": 0.26758989691734314, + "learning_rate": 0.00016976546954651988, + "loss": 0.2488, + "step": 2518 + }, + { + "epoch": 0.5098158267557175, + "grad_norm": 0.33704081177711487, + "learning_rate": 0.00016974267828534235, + "loss": 0.2584, + "step": 2519 + }, + { + "epoch": 0.5100182149362478, + "grad_norm": 0.2544485330581665, + "learning_rate": 0.00016971987996833242, + "loss": 0.2403, + "step": 2520 + }, + { + "epoch": 0.510220603116778, + "grad_norm": 0.2556194067001343, + "learning_rate": 0.00016969707459779665, + "loss": 0.1753, + "step": 2521 + }, + { + "epoch": 0.5104229912973083, + "grad_norm": 0.2857758402824402, + "learning_rate": 0.00016967426217604214, + "loss": 0.2157, + "step": 2522 + }, + { + "epoch": 0.5106253794778385, + "grad_norm": 0.31661200523376465, + "learning_rate": 0.00016965144270537688, + "loss": 0.2374, + "step": 2523 + }, + { + "epoch": 0.5108277676583688, + "grad_norm": 0.28023761510849, + "learning_rate": 0.0001696286161881095, + "loss": 0.2392, + "step": 2524 + }, + { + "epoch": 0.511030155838899, + "grad_norm": 0.252261221408844, + "learning_rate": 0.00016960578262654931, + "loss": 0.1939, + "step": 2525 + }, + { + "epoch": 0.5112325440194293, + "grad_norm": 0.33003121614456177, + "learning_rate": 0.00016958294202300644, + "loss": 0.2063, + "step": 2526 + }, + { + "epoch": 0.5114349321999595, + "grad_norm": 0.24865320324897766, + "learning_rate": 0.0001695600943797916, + "loss": 0.2183, + "step": 2527 + }, + { + "epoch": 0.5116373203804898, + "grad_norm": 0.421440064907074, + "learning_rate": 0.0001695372396992163, + "loss": 0.292, + "step": 2528 + }, + { + "epoch": 0.51183970856102, + "grad_norm": 0.2991817593574524, + "learning_rate": 0.00016951437798359275, + "loss": 0.219, + "step": 2529 + }, + { + "epoch": 0.5120420967415503, + "grad_norm": 0.3103179931640625, + "learning_rate": 0.00016949150923523384, + "loss": 0.2484, + "step": 2530 + }, + { + "epoch": 0.5122444849220805, + "grad_norm": 0.28612539172172546, + "learning_rate": 0.00016946863345645316, + "loss": 0.239, + "step": 2531 + }, + { + "epoch": 0.5124468731026108, + "grad_norm": 0.4773065447807312, + "learning_rate": 0.0001694457506495651, + "loss": 0.2489, + "step": 2532 + }, + { + "epoch": 0.512649261283141, + "grad_norm": 0.4546932578086853, + "learning_rate": 0.00016942286081688467, + "loss": 0.2987, + "step": 2533 + }, + { + "epoch": 0.5128516494636713, + "grad_norm": 0.3810085654258728, + "learning_rate": 0.0001693999639607276, + "loss": 0.2857, + "step": 2534 + }, + { + "epoch": 0.5130540376442015, + "grad_norm": 0.4003126621246338, + "learning_rate": 0.0001693770600834104, + "loss": 0.258, + "step": 2535 + }, + { + "epoch": 0.5132564258247319, + "grad_norm": 0.3469572365283966, + "learning_rate": 0.00016935414918725026, + "loss": 0.2709, + "step": 2536 + }, + { + "epoch": 0.513458814005262, + "grad_norm": 0.2894114851951599, + "learning_rate": 0.000169331231274565, + "loss": 0.2282, + "step": 2537 + }, + { + "epoch": 0.5136612021857924, + "grad_norm": 0.3092636168003082, + "learning_rate": 0.00016930830634767326, + "loss": 0.263, + "step": 2538 + }, + { + "epoch": 0.5138635903663226, + "grad_norm": 0.27841058373451233, + "learning_rate": 0.0001692853744088943, + "loss": 0.2381, + "step": 2539 + }, + { + "epoch": 0.5140659785468529, + "grad_norm": 0.2825442850589752, + "learning_rate": 0.00016926243546054817, + "loss": 0.223, + "step": 2540 + }, + { + "epoch": 0.5142683667273831, + "grad_norm": 0.34579232335090637, + "learning_rate": 0.0001692394895049556, + "loss": 0.2571, + "step": 2541 + }, + { + "epoch": 0.5144707549079134, + "grad_norm": 0.3164571523666382, + "learning_rate": 0.00016921653654443798, + "loss": 0.25, + "step": 2542 + }, + { + "epoch": 0.5146731430884436, + "grad_norm": 0.2637081742286682, + "learning_rate": 0.00016919357658131749, + "loss": 0.2312, + "step": 2543 + }, + { + "epoch": 0.5148755312689739, + "grad_norm": 0.31452441215515137, + "learning_rate": 0.00016917060961791695, + "loss": 0.2233, + "step": 2544 + }, + { + "epoch": 0.5150779194495041, + "grad_norm": 0.35815557837486267, + "learning_rate": 0.00016914763565655997, + "loss": 0.3006, + "step": 2545 + }, + { + "epoch": 0.5152803076300344, + "grad_norm": 0.28938955068588257, + "learning_rate": 0.0001691246546995707, + "loss": 0.2273, + "step": 2546 + }, + { + "epoch": 0.5154826958105647, + "grad_norm": 0.2862246632575989, + "learning_rate": 0.00016910166674927423, + "loss": 0.2668, + "step": 2547 + }, + { + "epoch": 0.5156850839910949, + "grad_norm": 0.33494991064071655, + "learning_rate": 0.0001690786718079962, + "loss": 0.2825, + "step": 2548 + }, + { + "epoch": 0.5158874721716252, + "grad_norm": 0.38121524453163147, + "learning_rate": 0.00016905566987806297, + "loss": 0.2888, + "step": 2549 + }, + { + "epoch": 0.5160898603521554, + "grad_norm": 0.47657451033592224, + "learning_rate": 0.00016903266096180162, + "loss": 0.1814, + "step": 2550 + }, + { + "epoch": 0.5160898603521554, + "eval_loss": 0.2675860524177551, + "eval_runtime": 0.7411, + "eval_samples_per_second": 6.747, + "eval_steps_per_second": 1.349, + "step": 2550 + }, + { + "epoch": 0.5162922485326857, + "grad_norm": 0.352152019739151, + "learning_rate": 0.00016900964506154007, + "loss": 0.2574, + "step": 2551 + }, + { + "epoch": 0.5164946367132159, + "grad_norm": 0.37841930985450745, + "learning_rate": 0.00016898662217960667, + "loss": 0.2526, + "step": 2552 + }, + { + "epoch": 0.5166970248937462, + "grad_norm": 0.3507513999938965, + "learning_rate": 0.00016896359231833075, + "loss": 0.2916, + "step": 2553 + }, + { + "epoch": 0.5168994130742764, + "grad_norm": 0.2841602861881256, + "learning_rate": 0.00016894055548004216, + "loss": 0.2546, + "step": 2554 + }, + { + "epoch": 0.5171018012548068, + "grad_norm": 0.3018854856491089, + "learning_rate": 0.00016891751166707154, + "loss": 0.2364, + "step": 2555 + }, + { + "epoch": 0.517304189435337, + "grad_norm": 0.30066660046577454, + "learning_rate": 0.00016889446088175027, + "loss": 0.235, + "step": 2556 + }, + { + "epoch": 0.5175065776158673, + "grad_norm": 0.3245569169521332, + "learning_rate": 0.00016887140312641034, + "loss": 0.2193, + "step": 2557 + }, + { + "epoch": 0.5177089657963975, + "grad_norm": 0.42227664589881897, + "learning_rate": 0.0001688483384033845, + "loss": 0.2189, + "step": 2558 + }, + { + "epoch": 0.5179113539769278, + "grad_norm": 0.2770608961582184, + "learning_rate": 0.00016882526671500617, + "loss": 0.2144, + "step": 2559 + }, + { + "epoch": 0.518113742157458, + "grad_norm": 0.38048413395881653, + "learning_rate": 0.00016880218806360957, + "loss": 0.2379, + "step": 2560 + }, + { + "epoch": 0.5183161303379883, + "grad_norm": 0.4153047502040863, + "learning_rate": 0.0001687791024515295, + "loss": 0.2446, + "step": 2561 + }, + { + "epoch": 0.5185185185185185, + "grad_norm": 0.3124135434627533, + "learning_rate": 0.00016875600988110155, + "loss": 0.2609, + "step": 2562 + }, + { + "epoch": 0.5187209066990488, + "grad_norm": 0.3253217041492462, + "learning_rate": 0.00016873291035466193, + "loss": 0.2902, + "step": 2563 + }, + { + "epoch": 0.518923294879579, + "grad_norm": 0.32971933484077454, + "learning_rate": 0.00016870980387454764, + "loss": 0.2548, + "step": 2564 + }, + { + "epoch": 0.5191256830601093, + "grad_norm": 0.3016485869884491, + "learning_rate": 0.00016868669044309642, + "loss": 0.2022, + "step": 2565 + }, + { + "epoch": 0.5193280712406395, + "grad_norm": 0.299196720123291, + "learning_rate": 0.00016866357006264652, + "loss": 0.2464, + "step": 2566 + }, + { + "epoch": 0.5195304594211698, + "grad_norm": 0.30322641134262085, + "learning_rate": 0.00016864044273553713, + "loss": 0.2517, + "step": 2567 + }, + { + "epoch": 0.5197328476017, + "grad_norm": 0.32855749130249023, + "learning_rate": 0.00016861730846410794, + "loss": 0.2456, + "step": 2568 + }, + { + "epoch": 0.5199352357822303, + "grad_norm": 0.3540678322315216, + "learning_rate": 0.00016859416725069947, + "loss": 0.2404, + "step": 2569 + }, + { + "epoch": 0.5201376239627605, + "grad_norm": 0.28167209029197693, + "learning_rate": 0.00016857101909765294, + "loss": 0.2428, + "step": 2570 + }, + { + "epoch": 0.5203400121432908, + "grad_norm": 0.36069634556770325, + "learning_rate": 0.0001685478640073102, + "loss": 0.2775, + "step": 2571 + }, + { + "epoch": 0.520542400323821, + "grad_norm": 0.29065969586372375, + "learning_rate": 0.00016852470198201383, + "loss": 0.2611, + "step": 2572 + }, + { + "epoch": 0.5207447885043514, + "grad_norm": 0.23961427807807922, + "learning_rate": 0.00016850153302410713, + "loss": 0.2346, + "step": 2573 + }, + { + "epoch": 0.5209471766848816, + "grad_norm": 0.30201297998428345, + "learning_rate": 0.00016847835713593412, + "loss": 0.206, + "step": 2574 + }, + { + "epoch": 0.5211495648654119, + "grad_norm": 0.30087214708328247, + "learning_rate": 0.00016845517431983946, + "loss": 0.1782, + "step": 2575 + }, + { + "epoch": 0.5213519530459421, + "grad_norm": 0.43761369585990906, + "learning_rate": 0.00016843198457816856, + "loss": 0.2583, + "step": 2576 + }, + { + "epoch": 0.5215543412264724, + "grad_norm": 0.2883847653865814, + "learning_rate": 0.0001684087879132675, + "loss": 0.2656, + "step": 2577 + }, + { + "epoch": 0.5217567294070027, + "grad_norm": 0.24638386070728302, + "learning_rate": 0.00016838558432748308, + "loss": 0.2279, + "step": 2578 + }, + { + "epoch": 0.5219591175875329, + "grad_norm": 0.27895137667655945, + "learning_rate": 0.00016836237382316283, + "loss": 0.2412, + "step": 2579 + }, + { + "epoch": 0.5221615057680632, + "grad_norm": 0.34711912274360657, + "learning_rate": 0.00016833915640265484, + "loss": 0.2468, + "step": 2580 + }, + { + "epoch": 0.5223638939485934, + "grad_norm": 0.3112831711769104, + "learning_rate": 0.00016831593206830816, + "loss": 0.2487, + "step": 2581 + }, + { + "epoch": 0.5225662821291237, + "grad_norm": 0.3572124242782593, + "learning_rate": 0.00016829270082247227, + "loss": 0.2768, + "step": 2582 + }, + { + "epoch": 0.5227686703096539, + "grad_norm": 0.2616262137889862, + "learning_rate": 0.00016826946266749752, + "loss": 0.2074, + "step": 2583 + }, + { + "epoch": 0.5229710584901842, + "grad_norm": 0.3566007614135742, + "learning_rate": 0.00016824621760573485, + "loss": 0.228, + "step": 2584 + }, + { + "epoch": 0.5231734466707144, + "grad_norm": 0.2676408886909485, + "learning_rate": 0.000168222965639536, + "loss": 0.237, + "step": 2585 + }, + { + "epoch": 0.5233758348512447, + "grad_norm": 0.40267249941825867, + "learning_rate": 0.00016819970677125335, + "loss": 0.2302, + "step": 2586 + }, + { + "epoch": 0.5235782230317749, + "grad_norm": 0.2821453809738159, + "learning_rate": 0.00016817644100323995, + "loss": 0.24, + "step": 2587 + }, + { + "epoch": 0.5237806112123052, + "grad_norm": 0.36152565479278564, + "learning_rate": 0.00016815316833784962, + "loss": 0.2853, + "step": 2588 + }, + { + "epoch": 0.5239829993928354, + "grad_norm": 0.324090838432312, + "learning_rate": 0.00016812988877743686, + "loss": 0.2615, + "step": 2589 + }, + { + "epoch": 0.5241853875733657, + "grad_norm": 0.2735280692577362, + "learning_rate": 0.00016810660232435685, + "loss": 0.2296, + "step": 2590 + }, + { + "epoch": 0.524387775753896, + "grad_norm": 0.29336437582969666, + "learning_rate": 0.00016808330898096543, + "loss": 0.2259, + "step": 2591 + }, + { + "epoch": 0.5245901639344263, + "grad_norm": 0.3331908583641052, + "learning_rate": 0.00016806000874961918, + "loss": 0.2717, + "step": 2592 + }, + { + "epoch": 0.5247925521149565, + "grad_norm": 0.30863717198371887, + "learning_rate": 0.00016803670163267542, + "loss": 0.2502, + "step": 2593 + }, + { + "epoch": 0.5249949402954868, + "grad_norm": 0.42840126156806946, + "learning_rate": 0.00016801338763249208, + "loss": 0.2422, + "step": 2594 + }, + { + "epoch": 0.525197328476017, + "grad_norm": 0.28465163707733154, + "learning_rate": 0.0001679900667514278, + "loss": 0.231, + "step": 2595 + }, + { + "epoch": 0.5253997166565473, + "grad_norm": 0.46548783779144287, + "learning_rate": 0.00016796673899184203, + "loss": 0.2115, + "step": 2596 + }, + { + "epoch": 0.5256021048370775, + "grad_norm": 0.32914069294929504, + "learning_rate": 0.00016794340435609474, + "loss": 0.2153, + "step": 2597 + }, + { + "epoch": 0.5258044930176078, + "grad_norm": 0.34944626688957214, + "learning_rate": 0.00016792006284654677, + "loss": 0.2272, + "step": 2598 + }, + { + "epoch": 0.526006881198138, + "grad_norm": 0.28967317938804626, + "learning_rate": 0.00016789671446555945, + "loss": 0.2393, + "step": 2599 + }, + { + "epoch": 0.5262092693786683, + "grad_norm": 0.2868635952472687, + "learning_rate": 0.00016787335921549502, + "loss": 0.2639, + "step": 2600 + }, + { + "epoch": 0.5262092693786683, + "eval_loss": 0.2700594961643219, + "eval_runtime": 0.7374, + "eval_samples_per_second": 6.78, + "eval_steps_per_second": 1.356, + "step": 2600 + }, + { + "epoch": 0.5264116575591985, + "grad_norm": 0.3494894504547119, + "learning_rate": 0.0001678499970987163, + "loss": 0.2236, + "step": 2601 + }, + { + "epoch": 0.5266140457397288, + "grad_norm": 0.2993201017379761, + "learning_rate": 0.00016782662811758682, + "loss": 0.2574, + "step": 2602 + }, + { + "epoch": 0.526816433920259, + "grad_norm": 0.36104443669319153, + "learning_rate": 0.0001678032522744708, + "loss": 0.2583, + "step": 2603 + }, + { + "epoch": 0.5270188221007893, + "grad_norm": 0.355884313583374, + "learning_rate": 0.00016777986957173315, + "loss": 0.2564, + "step": 2604 + }, + { + "epoch": 0.5272212102813195, + "grad_norm": 0.2789347767829895, + "learning_rate": 0.00016775648001173953, + "loss": 0.2451, + "step": 2605 + }, + { + "epoch": 0.5274235984618498, + "grad_norm": 0.4173826277256012, + "learning_rate": 0.0001677330835968562, + "loss": 0.2581, + "step": 2606 + }, + { + "epoch": 0.5276259866423801, + "grad_norm": 0.32102257013320923, + "learning_rate": 0.0001677096803294502, + "loss": 0.2328, + "step": 2607 + }, + { + "epoch": 0.5278283748229103, + "grad_norm": 0.3866223394870758, + "learning_rate": 0.00016768627021188922, + "loss": 0.2324, + "step": 2608 + }, + { + "epoch": 0.5280307630034407, + "grad_norm": 0.24886023998260498, + "learning_rate": 0.00016766285324654163, + "loss": 0.2293, + "step": 2609 + }, + { + "epoch": 0.5282331511839709, + "grad_norm": 0.4187617897987366, + "learning_rate": 0.00016763942943577654, + "loss": 0.2507, + "step": 2610 + }, + { + "epoch": 0.5284355393645012, + "grad_norm": 0.323421835899353, + "learning_rate": 0.0001676159987819637, + "loss": 0.2571, + "step": 2611 + }, + { + "epoch": 0.5286379275450314, + "grad_norm": 0.2955639958381653, + "learning_rate": 0.00016759256128747358, + "loss": 0.2485, + "step": 2612 + }, + { + "epoch": 0.5288403157255617, + "grad_norm": 0.3304389417171478, + "learning_rate": 0.00016756911695467737, + "loss": 0.2401, + "step": 2613 + }, + { + "epoch": 0.5290427039060919, + "grad_norm": 0.27834227681159973, + "learning_rate": 0.0001675456657859469, + "loss": 0.1978, + "step": 2614 + }, + { + "epoch": 0.5292450920866222, + "grad_norm": 0.29491370916366577, + "learning_rate": 0.00016752220778365468, + "loss": 0.2288, + "step": 2615 + }, + { + "epoch": 0.5294474802671524, + "grad_norm": 0.39043062925338745, + "learning_rate": 0.000167498742950174, + "loss": 0.2469, + "step": 2616 + }, + { + "epoch": 0.5296498684476827, + "grad_norm": 0.28894317150115967, + "learning_rate": 0.0001674752712878788, + "loss": 0.2242, + "step": 2617 + }, + { + "epoch": 0.5298522566282129, + "grad_norm": 0.2729368805885315, + "learning_rate": 0.0001674517927991436, + "loss": 0.2229, + "step": 2618 + }, + { + "epoch": 0.5300546448087432, + "grad_norm": 0.3531043231487274, + "learning_rate": 0.00016742830748634382, + "loss": 0.2392, + "step": 2619 + }, + { + "epoch": 0.5302570329892734, + "grad_norm": 0.31617650389671326, + "learning_rate": 0.00016740481535185538, + "loss": 0.2367, + "step": 2620 + }, + { + "epoch": 0.5304594211698037, + "grad_norm": 0.28258374333381653, + "learning_rate": 0.000167381316398055, + "loss": 0.2556, + "step": 2621 + }, + { + "epoch": 0.5306618093503339, + "grad_norm": 0.2794475853443146, + "learning_rate": 0.00016735781062732005, + "loss": 0.2592, + "step": 2622 + }, + { + "epoch": 0.5308641975308642, + "grad_norm": 0.25831735134124756, + "learning_rate": 0.00016733429804202858, + "loss": 0.2083, + "step": 2623 + }, + { + "epoch": 0.5310665857113944, + "grad_norm": 0.35561296343803406, + "learning_rate": 0.0001673107786445594, + "loss": 0.2555, + "step": 2624 + }, + { + "epoch": 0.5312689738919247, + "grad_norm": 0.2760870158672333, + "learning_rate": 0.0001672872524372919, + "loss": 0.2023, + "step": 2625 + }, + { + "epoch": 0.5314713620724549, + "grad_norm": 0.33602291345596313, + "learning_rate": 0.00016726371942260625, + "loss": 0.2226, + "step": 2626 + }, + { + "epoch": 0.5316737502529852, + "grad_norm": 0.302262544631958, + "learning_rate": 0.00016724017960288324, + "loss": 0.2788, + "step": 2627 + }, + { + "epoch": 0.5318761384335154, + "grad_norm": 0.26179543137550354, + "learning_rate": 0.00016721663298050442, + "loss": 0.242, + "step": 2628 + }, + { + "epoch": 0.5320785266140458, + "grad_norm": 0.27100905776023865, + "learning_rate": 0.00016719307955785195, + "loss": 0.2288, + "step": 2629 + }, + { + "epoch": 0.532280914794576, + "grad_norm": 0.3250473141670227, + "learning_rate": 0.00016716951933730875, + "loss": 0.2845, + "step": 2630 + }, + { + "epoch": 0.5324833029751063, + "grad_norm": 0.2948412001132965, + "learning_rate": 0.0001671459523212584, + "loss": 0.2337, + "step": 2631 + }, + { + "epoch": 0.5326856911556365, + "grad_norm": 0.3160333037376404, + "learning_rate": 0.0001671223785120851, + "loss": 0.2939, + "step": 2632 + }, + { + "epoch": 0.5328880793361668, + "grad_norm": 0.2872770428657532, + "learning_rate": 0.0001670987979121739, + "loss": 0.2317, + "step": 2633 + }, + { + "epoch": 0.533090467516697, + "grad_norm": 0.2879941761493683, + "learning_rate": 0.00016707521052391035, + "loss": 0.2457, + "step": 2634 + }, + { + "epoch": 0.5332928556972273, + "grad_norm": 0.2784724235534668, + "learning_rate": 0.00016705161634968086, + "loss": 0.2411, + "step": 2635 + }, + { + "epoch": 0.5334952438777575, + "grad_norm": 0.2538287043571472, + "learning_rate": 0.00016702801539187235, + "loss": 0.2063, + "step": 2636 + }, + { + "epoch": 0.5336976320582878, + "grad_norm": 0.30907729268074036, + "learning_rate": 0.00016700440765287256, + "loss": 0.2445, + "step": 2637 + }, + { + "epoch": 0.5339000202388181, + "grad_norm": 0.28510886430740356, + "learning_rate": 0.00016698079313506989, + "loss": 0.2463, + "step": 2638 + }, + { + "epoch": 0.5341024084193483, + "grad_norm": 0.32631993293762207, + "learning_rate": 0.0001669571718408534, + "loss": 0.2277, + "step": 2639 + }, + { + "epoch": 0.5343047965998786, + "grad_norm": 0.298195481300354, + "learning_rate": 0.00016693354377261284, + "loss": 0.2531, + "step": 2640 + }, + { + "epoch": 0.5345071847804088, + "grad_norm": 0.32875820994377136, + "learning_rate": 0.00016690990893273862, + "loss": 0.2492, + "step": 2641 + }, + { + "epoch": 0.5347095729609391, + "grad_norm": 0.27733179926872253, + "learning_rate": 0.0001668862673236219, + "loss": 0.2143, + "step": 2642 + }, + { + "epoch": 0.5349119611414693, + "grad_norm": 0.30193451046943665, + "learning_rate": 0.00016686261894765448, + "loss": 0.2301, + "step": 2643 + }, + { + "epoch": 0.5351143493219996, + "grad_norm": 0.3172464072704315, + "learning_rate": 0.00016683896380722887, + "loss": 0.2468, + "step": 2644 + }, + { + "epoch": 0.5353167375025298, + "grad_norm": 1.5048164129257202, + "learning_rate": 0.00016681530190473822, + "loss": 0.2744, + "step": 2645 + }, + { + "epoch": 0.5355191256830601, + "grad_norm": 0.3033134937286377, + "learning_rate": 0.00016679163324257644, + "loss": 0.253, + "step": 2646 + }, + { + "epoch": 0.5357215138635903, + "grad_norm": 0.36687731742858887, + "learning_rate": 0.00016676795782313798, + "loss": 0.2557, + "step": 2647 + }, + { + "epoch": 0.5359239020441207, + "grad_norm": 0.38913866877555847, + "learning_rate": 0.00016674427564881817, + "loss": 0.2596, + "step": 2648 + }, + { + "epoch": 0.5361262902246509, + "grad_norm": 0.3134247064590454, + "learning_rate": 0.0001667205867220129, + "loss": 0.2355, + "step": 2649 + }, + { + "epoch": 0.5363286784051812, + "grad_norm": 0.30010947585105896, + "learning_rate": 0.00016669689104511877, + "loss": 0.2351, + "step": 2650 + }, + { + "epoch": 0.5363286784051812, + "eval_loss": 0.2756694257259369, + "eval_runtime": 0.7371, + "eval_samples_per_second": 6.783, + "eval_steps_per_second": 1.357, + "step": 2650 + }, + { + "epoch": 0.5365310665857114, + "grad_norm": 0.3701987564563751, + "learning_rate": 0.00016667318862053303, + "loss": 0.2625, + "step": 2651 + }, + { + "epoch": 0.5367334547662417, + "grad_norm": 0.2762279510498047, + "learning_rate": 0.00016664947945065365, + "loss": 0.2387, + "step": 2652 + }, + { + "epoch": 0.5369358429467719, + "grad_norm": 0.5423218011856079, + "learning_rate": 0.0001666257635378793, + "loss": 0.2536, + "step": 2653 + }, + { + "epoch": 0.5371382311273022, + "grad_norm": 0.3626985549926758, + "learning_rate": 0.00016660204088460927, + "loss": 0.2439, + "step": 2654 + }, + { + "epoch": 0.5373406193078324, + "grad_norm": 0.4017620384693146, + "learning_rate": 0.00016657831149324362, + "loss": 0.2285, + "step": 2655 + }, + { + "epoch": 0.5375430074883627, + "grad_norm": 0.29086053371429443, + "learning_rate": 0.000166554575366183, + "loss": 0.2206, + "step": 2656 + }, + { + "epoch": 0.5377453956688929, + "grad_norm": 0.2927820682525635, + "learning_rate": 0.0001665308325058288, + "loss": 0.2295, + "step": 2657 + }, + { + "epoch": 0.5379477838494232, + "grad_norm": 0.4150175154209137, + "learning_rate": 0.00016650708291458303, + "loss": 0.2561, + "step": 2658 + }, + { + "epoch": 0.5381501720299534, + "grad_norm": 0.33315229415893555, + "learning_rate": 0.00016648332659484848, + "loss": 0.2453, + "step": 2659 + }, + { + "epoch": 0.5383525602104837, + "grad_norm": 0.39167946577072144, + "learning_rate": 0.00016645956354902858, + "loss": 0.274, + "step": 2660 + }, + { + "epoch": 0.5385549483910139, + "grad_norm": 0.2934190332889557, + "learning_rate": 0.00016643579377952737, + "loss": 0.2584, + "step": 2661 + }, + { + "epoch": 0.5387573365715442, + "grad_norm": 0.3536292314529419, + "learning_rate": 0.00016641201728874965, + "loss": 0.2535, + "step": 2662 + }, + { + "epoch": 0.5389597247520744, + "grad_norm": 0.29246678948402405, + "learning_rate": 0.00016638823407910084, + "loss": 0.2585, + "step": 2663 + }, + { + "epoch": 0.5391621129326047, + "grad_norm": 0.43456900119781494, + "learning_rate": 0.00016636444415298716, + "loss": 0.267, + "step": 2664 + }, + { + "epoch": 0.5393645011131349, + "grad_norm": 0.2911350131034851, + "learning_rate": 0.00016634064751281535, + "loss": 0.217, + "step": 2665 + }, + { + "epoch": 0.5395668892936653, + "grad_norm": 0.31089237332344055, + "learning_rate": 0.00016631684416099294, + "loss": 0.2452, + "step": 2666 + }, + { + "epoch": 0.5397692774741955, + "grad_norm": 0.34671053290367126, + "learning_rate": 0.00016629303409992807, + "loss": 0.2875, + "step": 2667 + }, + { + "epoch": 0.5399716656547258, + "grad_norm": 0.2837030291557312, + "learning_rate": 0.00016626921733202963, + "loss": 0.2345, + "step": 2668 + }, + { + "epoch": 0.5401740538352561, + "grad_norm": 0.48084816336631775, + "learning_rate": 0.00016624539385970714, + "loss": 0.2603, + "step": 2669 + }, + { + "epoch": 0.5403764420157863, + "grad_norm": 0.33831149339675903, + "learning_rate": 0.0001662215636853708, + "loss": 0.2847, + "step": 2670 + }, + { + "epoch": 0.5405788301963166, + "grad_norm": 0.27991217374801636, + "learning_rate": 0.0001661977268114315, + "loss": 0.2168, + "step": 2671 + }, + { + "epoch": 0.5407812183768468, + "grad_norm": 0.2763236463069916, + "learning_rate": 0.0001661738832403008, + "loss": 0.2532, + "step": 2672 + }, + { + "epoch": 0.5409836065573771, + "grad_norm": 0.31166934967041016, + "learning_rate": 0.00016615003297439099, + "loss": 0.2738, + "step": 2673 + }, + { + "epoch": 0.5411859947379073, + "grad_norm": 0.30180248618125916, + "learning_rate": 0.00016612617601611488, + "loss": 0.2273, + "step": 2674 + }, + { + "epoch": 0.5413883829184376, + "grad_norm": 0.25693386793136597, + "learning_rate": 0.00016610231236788617, + "loss": 0.2059, + "step": 2675 + }, + { + "epoch": 0.5415907710989678, + "grad_norm": 0.2835194170475006, + "learning_rate": 0.0001660784420321191, + "loss": 0.2069, + "step": 2676 + }, + { + "epoch": 0.5417931592794981, + "grad_norm": 0.4018629491329193, + "learning_rate": 0.00016605456501122862, + "loss": 0.2489, + "step": 2677 + }, + { + "epoch": 0.5419955474600283, + "grad_norm": 0.3063729405403137, + "learning_rate": 0.00016603068130763036, + "loss": 0.2791, + "step": 2678 + }, + { + "epoch": 0.5421979356405586, + "grad_norm": 0.2935909330844879, + "learning_rate": 0.0001660067909237406, + "loss": 0.2437, + "step": 2679 + }, + { + "epoch": 0.5424003238210888, + "grad_norm": 0.27330735325813293, + "learning_rate": 0.00016598289386197634, + "loss": 0.1945, + "step": 2680 + }, + { + "epoch": 0.5426027120016191, + "grad_norm": 0.3396134078502655, + "learning_rate": 0.00016595899012475523, + "loss": 0.2554, + "step": 2681 + }, + { + "epoch": 0.5428051001821493, + "grad_norm": 0.26023051142692566, + "learning_rate": 0.00016593507971449561, + "loss": 0.2245, + "step": 2682 + }, + { + "epoch": 0.5430074883626796, + "grad_norm": 0.3104912042617798, + "learning_rate": 0.00016591116263361646, + "loss": 0.2393, + "step": 2683 + }, + { + "epoch": 0.5432098765432098, + "grad_norm": 0.26407697796821594, + "learning_rate": 0.00016588723888453748, + "loss": 0.2586, + "step": 2684 + }, + { + "epoch": 0.5434122647237402, + "grad_norm": 0.30238598585128784, + "learning_rate": 0.000165863308469679, + "loss": 0.253, + "step": 2685 + }, + { + "epoch": 0.5436146529042704, + "grad_norm": 0.33824655413627625, + "learning_rate": 0.0001658393713914621, + "loss": 0.2808, + "step": 2686 + }, + { + "epoch": 0.5438170410848007, + "grad_norm": 0.725450873374939, + "learning_rate": 0.0001658154276523084, + "loss": 0.2531, + "step": 2687 + }, + { + "epoch": 0.5440194292653309, + "grad_norm": 0.28861457109451294, + "learning_rate": 0.00016579147725464036, + "loss": 0.2393, + "step": 2688 + }, + { + "epoch": 0.5442218174458612, + "grad_norm": 0.40884849429130554, + "learning_rate": 0.000165767520200881, + "loss": 0.2718, + "step": 2689 + }, + { + "epoch": 0.5444242056263914, + "grad_norm": 0.5971659421920776, + "learning_rate": 0.000165743556493454, + "loss": 0.2587, + "step": 2690 + }, + { + "epoch": 0.5446265938069217, + "grad_norm": 0.26276418566703796, + "learning_rate": 0.00016571958613478382, + "loss": 0.2386, + "step": 2691 + }, + { + "epoch": 0.5448289819874519, + "grad_norm": 0.2730572819709778, + "learning_rate": 0.00016569560912729552, + "loss": 0.2305, + "step": 2692 + }, + { + "epoch": 0.5450313701679822, + "grad_norm": 0.3255332410335541, + "learning_rate": 0.00016567162547341478, + "loss": 0.2445, + "step": 2693 + }, + { + "epoch": 0.5452337583485124, + "grad_norm": 0.32056882977485657, + "learning_rate": 0.0001656476351755681, + "loss": 0.2668, + "step": 2694 + }, + { + "epoch": 0.5454361465290427, + "grad_norm": 0.3126126229763031, + "learning_rate": 0.0001656236382361825, + "loss": 0.2294, + "step": 2695 + }, + { + "epoch": 0.5456385347095729, + "grad_norm": 0.38915979862213135, + "learning_rate": 0.00016559963465768575, + "loss": 0.229, + "step": 2696 + }, + { + "epoch": 0.5458409228901032, + "grad_norm": 0.3153875172138214, + "learning_rate": 0.00016557562444250633, + "loss": 0.233, + "step": 2697 + }, + { + "epoch": 0.5460433110706335, + "grad_norm": 0.3637772798538208, + "learning_rate": 0.0001655516075930733, + "loss": 0.2572, + "step": 2698 + }, + { + "epoch": 0.5462456992511637, + "grad_norm": 0.28701117634773254, + "learning_rate": 0.00016552758411181643, + "loss": 0.2588, + "step": 2699 + }, + { + "epoch": 0.546448087431694, + "grad_norm": 0.3291356563568115, + "learning_rate": 0.00016550355400116615, + "loss": 0.2454, + "step": 2700 + }, + { + "epoch": 0.546448087431694, + "eval_loss": 0.2727106213569641, + "eval_runtime": 0.7413, + "eval_samples_per_second": 6.745, + "eval_steps_per_second": 1.349, + "step": 2700 + }, + { + "epoch": 0.5466504756122242, + "grad_norm": 0.3378230035305023, + "learning_rate": 0.0001654795172635536, + "loss": 0.2608, + "step": 2701 + }, + { + "epoch": 0.5468528637927546, + "grad_norm": 0.28990438580513, + "learning_rate": 0.0001654554739014106, + "loss": 0.2299, + "step": 2702 + }, + { + "epoch": 0.5470552519732848, + "grad_norm": 0.2976401150226593, + "learning_rate": 0.00016543142391716958, + "loss": 0.2179, + "step": 2703 + }, + { + "epoch": 0.5472576401538151, + "grad_norm": 0.3290218412876129, + "learning_rate": 0.00016540736731326358, + "loss": 0.2793, + "step": 2704 + }, + { + "epoch": 0.5474600283343453, + "grad_norm": 0.3076421618461609, + "learning_rate": 0.00016538330409212655, + "loss": 0.2466, + "step": 2705 + }, + { + "epoch": 0.5476624165148756, + "grad_norm": 0.2529980540275574, + "learning_rate": 0.00016535923425619283, + "loss": 0.2289, + "step": 2706 + }, + { + "epoch": 0.5478648046954058, + "grad_norm": 0.3483887314796448, + "learning_rate": 0.00016533515780789757, + "loss": 0.2344, + "step": 2707 + }, + { + "epoch": 0.5480671928759361, + "grad_norm": 0.46474558115005493, + "learning_rate": 0.00016531107474967663, + "loss": 0.2104, + "step": 2708 + }, + { + "epoch": 0.5482695810564663, + "grad_norm": 0.3963288962841034, + "learning_rate": 0.00016528698508396644, + "loss": 0.2273, + "step": 2709 + }, + { + "epoch": 0.5484719692369966, + "grad_norm": 0.32386282086372375, + "learning_rate": 0.00016526288881320414, + "loss": 0.213, + "step": 2710 + }, + { + "epoch": 0.5486743574175268, + "grad_norm": 0.3164507746696472, + "learning_rate": 0.00016523878593982755, + "loss": 0.2496, + "step": 2711 + }, + { + "epoch": 0.5488767455980571, + "grad_norm": 0.316854864358902, + "learning_rate": 0.00016521467646627515, + "loss": 0.2555, + "step": 2712 + }, + { + "epoch": 0.5490791337785873, + "grad_norm": 0.38298240303993225, + "learning_rate": 0.00016519056039498607, + "loss": 0.2372, + "step": 2713 + }, + { + "epoch": 0.5492815219591176, + "grad_norm": 0.4422507584095001, + "learning_rate": 0.00016516643772840011, + "loss": 0.2916, + "step": 2714 + }, + { + "epoch": 0.5494839101396478, + "grad_norm": 0.3752078711986542, + "learning_rate": 0.0001651423084689578, + "loss": 0.2451, + "step": 2715 + }, + { + "epoch": 0.5496862983201781, + "grad_norm": 0.25422441959381104, + "learning_rate": 0.00016511817261910023, + "loss": 0.1938, + "step": 2716 + }, + { + "epoch": 0.5498886865007083, + "grad_norm": 0.2828325927257538, + "learning_rate": 0.00016509403018126923, + "loss": 0.2518, + "step": 2717 + }, + { + "epoch": 0.5500910746812386, + "grad_norm": 0.31741753220558167, + "learning_rate": 0.00016506988115790727, + "loss": 0.2348, + "step": 2718 + }, + { + "epoch": 0.5502934628617688, + "grad_norm": 0.2640233337879181, + "learning_rate": 0.0001650457255514575, + "loss": 0.2212, + "step": 2719 + }, + { + "epoch": 0.5504958510422991, + "grad_norm": 0.29826608300209045, + "learning_rate": 0.0001650215633643638, + "loss": 0.2049, + "step": 2720 + }, + { + "epoch": 0.5506982392228293, + "grad_norm": 0.3221287429332733, + "learning_rate": 0.00016499739459907052, + "loss": 0.246, + "step": 2721 + }, + { + "epoch": 0.5509006274033597, + "grad_norm": 0.2922976016998291, + "learning_rate": 0.00016497321925802285, + "loss": 0.1945, + "step": 2722 + }, + { + "epoch": 0.5511030155838899, + "grad_norm": 0.4366983473300934, + "learning_rate": 0.00016494903734366663, + "loss": 0.2492, + "step": 2723 + }, + { + "epoch": 0.5513054037644202, + "grad_norm": 0.4032882750034332, + "learning_rate": 0.00016492484885844834, + "loss": 0.2792, + "step": 2724 + }, + { + "epoch": 0.5515077919449504, + "grad_norm": 0.4576028287410736, + "learning_rate": 0.00016490065380481508, + "loss": 0.2362, + "step": 2725 + }, + { + "epoch": 0.5517101801254807, + "grad_norm": 0.29097214341163635, + "learning_rate": 0.00016487645218521464, + "loss": 0.1913, + "step": 2726 + }, + { + "epoch": 0.5519125683060109, + "grad_norm": 0.5986493229866028, + "learning_rate": 0.00016485224400209555, + "loss": 0.2713, + "step": 2727 + }, + { + "epoch": 0.5521149564865412, + "grad_norm": 0.33854955434799194, + "learning_rate": 0.0001648280292579069, + "loss": 0.2676, + "step": 2728 + }, + { + "epoch": 0.5523173446670715, + "grad_norm": 0.28433772921562195, + "learning_rate": 0.00016480380795509843, + "loss": 0.2161, + "step": 2729 + }, + { + "epoch": 0.5525197328476017, + "grad_norm": 0.34827956557273865, + "learning_rate": 0.00016477958009612068, + "loss": 0.2212, + "step": 2730 + }, + { + "epoch": 0.552722121028132, + "grad_norm": 0.3341020941734314, + "learning_rate": 0.00016475534568342472, + "loss": 0.2367, + "step": 2731 + }, + { + "epoch": 0.5529245092086622, + "grad_norm": 0.29117056727409363, + "learning_rate": 0.00016473110471946237, + "loss": 0.2335, + "step": 2732 + }, + { + "epoch": 0.5531268973891925, + "grad_norm": 0.3291405141353607, + "learning_rate": 0.00016470685720668606, + "loss": 0.247, + "step": 2733 + }, + { + "epoch": 0.5533292855697227, + "grad_norm": 0.378342866897583, + "learning_rate": 0.00016468260314754892, + "loss": 0.2516, + "step": 2734 + }, + { + "epoch": 0.553531673750253, + "grad_norm": 0.498351126909256, + "learning_rate": 0.00016465834254450468, + "loss": 0.2192, + "step": 2735 + }, + { + "epoch": 0.5537340619307832, + "grad_norm": 0.3393566906452179, + "learning_rate": 0.00016463407540000778, + "loss": 0.2314, + "step": 2736 + }, + { + "epoch": 0.5539364501113135, + "grad_norm": 0.5438311696052551, + "learning_rate": 0.00016460980171651338, + "loss": 0.2736, + "step": 2737 + }, + { + "epoch": 0.5541388382918437, + "grad_norm": 0.3348614573478699, + "learning_rate": 0.00016458552149647716, + "loss": 0.2654, + "step": 2738 + }, + { + "epoch": 0.554341226472374, + "grad_norm": 0.312345951795578, + "learning_rate": 0.00016456123474235552, + "loss": 0.2666, + "step": 2739 + }, + { + "epoch": 0.5545436146529042, + "grad_norm": 0.2773478627204895, + "learning_rate": 0.00016453694145660564, + "loss": 0.2572, + "step": 2740 + }, + { + "epoch": 0.5547460028334346, + "grad_norm": 0.3372350037097931, + "learning_rate": 0.00016451264164168516, + "loss": 0.2576, + "step": 2741 + }, + { + "epoch": 0.5549483910139648, + "grad_norm": 0.34194353222846985, + "learning_rate": 0.00016448833530005255, + "loss": 0.2337, + "step": 2742 + }, + { + "epoch": 0.5551507791944951, + "grad_norm": 0.29681119322776794, + "learning_rate": 0.00016446402243416682, + "loss": 0.2518, + "step": 2743 + }, + { + "epoch": 0.5553531673750253, + "grad_norm": 0.31567174196243286, + "learning_rate": 0.0001644397030464877, + "loss": 0.2543, + "step": 2744 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.3233564794063568, + "learning_rate": 0.00016441537713947563, + "loss": 0.251, + "step": 2745 + }, + { + "epoch": 0.5557579437360858, + "grad_norm": 0.25843173265457153, + "learning_rate": 0.00016439104471559156, + "loss": 0.2231, + "step": 2746 + }, + { + "epoch": 0.5559603319166161, + "grad_norm": 0.38345223665237427, + "learning_rate": 0.00016436670577729727, + "loss": 0.2568, + "step": 2747 + }, + { + "epoch": 0.5561627200971463, + "grad_norm": 0.361279159784317, + "learning_rate": 0.00016434236032705508, + "loss": 0.2421, + "step": 2748 + }, + { + "epoch": 0.5563651082776766, + "grad_norm": 0.29712972044944763, + "learning_rate": 0.000164318008367328, + "loss": 0.2375, + "step": 2749 + }, + { + "epoch": 0.5565674964582068, + "grad_norm": 0.3402288556098938, + "learning_rate": 0.0001642936499005797, + "loss": 0.2556, + "step": 2750 + }, + { + "epoch": 0.5565674964582068, + "eval_loss": 0.271230548620224, + "eval_runtime": 0.7399, + "eval_samples_per_second": 6.758, + "eval_steps_per_second": 1.352, + "step": 2750 + }, + { + "epoch": 0.5567698846387371, + "grad_norm": 0.31470251083374023, + "learning_rate": 0.00016426928492927455, + "loss": 0.272, + "step": 2751 + }, + { + "epoch": 0.5569722728192673, + "grad_norm": 0.40976059436798096, + "learning_rate": 0.0001642449134558775, + "loss": 0.2206, + "step": 2752 + }, + { + "epoch": 0.5571746609997976, + "grad_norm": 0.3531858026981354, + "learning_rate": 0.00016422053548285424, + "loss": 0.2526, + "step": 2753 + }, + { + "epoch": 0.5573770491803278, + "grad_norm": 0.33502495288848877, + "learning_rate": 0.00016419615101267103, + "loss": 0.2359, + "step": 2754 + }, + { + "epoch": 0.5575794373608581, + "grad_norm": 0.24423053860664368, + "learning_rate": 0.0001641717600477949, + "loss": 0.2205, + "step": 2755 + }, + { + "epoch": 0.5577818255413883, + "grad_norm": 0.3108390271663666, + "learning_rate": 0.00016414736259069338, + "loss": 0.2626, + "step": 2756 + }, + { + "epoch": 0.5579842137219186, + "grad_norm": 0.3606570363044739, + "learning_rate": 0.00016412295864383486, + "loss": 0.2688, + "step": 2757 + }, + { + "epoch": 0.5581866019024488, + "grad_norm": 0.42568790912628174, + "learning_rate": 0.0001640985482096882, + "loss": 0.2456, + "step": 2758 + }, + { + "epoch": 0.5583889900829792, + "grad_norm": 0.35818469524383545, + "learning_rate": 0.00016407413129072303, + "loss": 0.2762, + "step": 2759 + }, + { + "epoch": 0.5585913782635095, + "grad_norm": 0.27855581045150757, + "learning_rate": 0.00016404970788940957, + "loss": 0.2409, + "step": 2760 + }, + { + "epoch": 0.5587937664440397, + "grad_norm": 0.3034149706363678, + "learning_rate": 0.0001640252780082187, + "loss": 0.2416, + "step": 2761 + }, + { + "epoch": 0.55899615462457, + "grad_norm": 0.2733892798423767, + "learning_rate": 0.00016400084164962201, + "loss": 0.256, + "step": 2762 + }, + { + "epoch": 0.5591985428051002, + "grad_norm": 0.3733433485031128, + "learning_rate": 0.00016397639881609175, + "loss": 0.266, + "step": 2763 + }, + { + "epoch": 0.5594009309856305, + "grad_norm": 0.29257914423942566, + "learning_rate": 0.00016395194951010074, + "loss": 0.2496, + "step": 2764 + }, + { + "epoch": 0.5596033191661607, + "grad_norm": 0.2640772759914398, + "learning_rate": 0.00016392749373412253, + "loss": 0.2297, + "step": 2765 + }, + { + "epoch": 0.559805707346691, + "grad_norm": 0.3010108470916748, + "learning_rate": 0.00016390303149063128, + "loss": 0.236, + "step": 2766 + }, + { + "epoch": 0.5600080955272212, + "grad_norm": 0.3347938358783722, + "learning_rate": 0.0001638785627821018, + "loss": 0.2321, + "step": 2767 + }, + { + "epoch": 0.5602104837077515, + "grad_norm": 0.2746746838092804, + "learning_rate": 0.00016385408761100965, + "loss": 0.2623, + "step": 2768 + }, + { + "epoch": 0.5604128718882817, + "grad_norm": 0.31118178367614746, + "learning_rate": 0.0001638296059798309, + "loss": 0.2429, + "step": 2769 + }, + { + "epoch": 0.560615260068812, + "grad_norm": 0.3054017126560211, + "learning_rate": 0.00016380511789104238, + "loss": 0.2455, + "step": 2770 + }, + { + "epoch": 0.5608176482493422, + "grad_norm": 0.28247079253196716, + "learning_rate": 0.00016378062334712157, + "loss": 0.2251, + "step": 2771 + }, + { + "epoch": 0.5610200364298725, + "grad_norm": 0.2857528030872345, + "learning_rate": 0.0001637561223505465, + "loss": 0.2441, + "step": 2772 + }, + { + "epoch": 0.5612224246104027, + "grad_norm": 0.31612929701805115, + "learning_rate": 0.00016373161490379595, + "loss": 0.266, + "step": 2773 + }, + { + "epoch": 0.561424812790933, + "grad_norm": 0.2727610766887665, + "learning_rate": 0.00016370710100934935, + "loss": 0.2511, + "step": 2774 + }, + { + "epoch": 0.5616272009714632, + "grad_norm": 0.34806764125823975, + "learning_rate": 0.00016368258066968671, + "loss": 0.2678, + "step": 2775 + }, + { + "epoch": 0.5618295891519935, + "grad_norm": 0.4355735778808594, + "learning_rate": 0.00016365805388728882, + "loss": 0.2785, + "step": 2776 + }, + { + "epoch": 0.5620319773325237, + "grad_norm": 0.2966254651546478, + "learning_rate": 0.00016363352066463694, + "loss": 0.2023, + "step": 2777 + }, + { + "epoch": 0.5622343655130541, + "grad_norm": 0.3531200587749481, + "learning_rate": 0.00016360898100421318, + "loss": 0.2288, + "step": 2778 + }, + { + "epoch": 0.5624367536935843, + "grad_norm": 0.24990540742874146, + "learning_rate": 0.00016358443490850015, + "loss": 0.2125, + "step": 2779 + }, + { + "epoch": 0.5626391418741146, + "grad_norm": 0.31871917843818665, + "learning_rate": 0.00016355988237998117, + "loss": 0.2483, + "step": 2780 + }, + { + "epoch": 0.5628415300546448, + "grad_norm": 0.3779838979244232, + "learning_rate": 0.0001635353234211402, + "loss": 0.2513, + "step": 2781 + }, + { + "epoch": 0.5630439182351751, + "grad_norm": 0.31013843417167664, + "learning_rate": 0.00016351075803446193, + "loss": 0.2343, + "step": 2782 + }, + { + "epoch": 0.5632463064157053, + "grad_norm": 0.2897894084453583, + "learning_rate": 0.00016348618622243156, + "loss": 0.2415, + "step": 2783 + }, + { + "epoch": 0.5634486945962356, + "grad_norm": 0.29897069931030273, + "learning_rate": 0.000163461607987535, + "loss": 0.237, + "step": 2784 + }, + { + "epoch": 0.5636510827767658, + "grad_norm": 0.31116607785224915, + "learning_rate": 0.00016343702333225881, + "loss": 0.2673, + "step": 2785 + }, + { + "epoch": 0.5638534709572961, + "grad_norm": 0.2632872760295868, + "learning_rate": 0.0001634124322590903, + "loss": 0.206, + "step": 2786 + }, + { + "epoch": 0.5640558591378263, + "grad_norm": 0.4221595525741577, + "learning_rate": 0.00016338783477051722, + "loss": 0.2548, + "step": 2787 + }, + { + "epoch": 0.5642582473183566, + "grad_norm": 0.41988351941108704, + "learning_rate": 0.00016336323086902816, + "loss": 0.3041, + "step": 2788 + }, + { + "epoch": 0.5644606354988869, + "grad_norm": 0.3068259060382843, + "learning_rate": 0.00016333862055711228, + "loss": 0.2578, + "step": 2789 + }, + { + "epoch": 0.5646630236794171, + "grad_norm": 0.41189467906951904, + "learning_rate": 0.00016331400383725936, + "loss": 0.2482, + "step": 2790 + }, + { + "epoch": 0.5648654118599474, + "grad_norm": 0.2784172594547272, + "learning_rate": 0.00016328938071195984, + "loss": 0.2306, + "step": 2791 + }, + { + "epoch": 0.5650678000404776, + "grad_norm": 0.3234933614730835, + "learning_rate": 0.00016326475118370493, + "loss": 0.2457, + "step": 2792 + }, + { + "epoch": 0.5652701882210079, + "grad_norm": 0.3366439938545227, + "learning_rate": 0.0001632401152549863, + "loss": 0.2598, + "step": 2793 + }, + { + "epoch": 0.5654725764015381, + "grad_norm": 1.26985502243042, + "learning_rate": 0.00016321547292829638, + "loss": 0.2393, + "step": 2794 + }, + { + "epoch": 0.5656749645820685, + "grad_norm": 0.28081372380256653, + "learning_rate": 0.00016319082420612825, + "loss": 0.256, + "step": 2795 + }, + { + "epoch": 0.5658773527625987, + "grad_norm": 0.31037789583206177, + "learning_rate": 0.00016316616909097553, + "loss": 0.2431, + "step": 2796 + }, + { + "epoch": 0.566079740943129, + "grad_norm": 0.2816120684146881, + "learning_rate": 0.00016314150758533265, + "loss": 0.2261, + "step": 2797 + }, + { + "epoch": 0.5662821291236592, + "grad_norm": 0.4624975621700287, + "learning_rate": 0.00016311683969169453, + "loss": 0.2336, + "step": 2798 + }, + { + "epoch": 0.5664845173041895, + "grad_norm": 0.28248104453086853, + "learning_rate": 0.00016309216541255688, + "loss": 0.2206, + "step": 2799 + }, + { + "epoch": 0.5666869054847197, + "grad_norm": 0.36865583062171936, + "learning_rate": 0.00016306748475041594, + "loss": 0.2809, + "step": 2800 + }, + { + "epoch": 0.5666869054847197, + "eval_loss": 0.269897997379303, + "eval_runtime": 0.7397, + "eval_samples_per_second": 6.759, + "eval_steps_per_second": 1.352, + "step": 2800 + }, + { + "epoch": 0.56688929366525, + "grad_norm": 0.339393675327301, + "learning_rate": 0.00016304279770776867, + "loss": 0.2045, + "step": 2801 + }, + { + "epoch": 0.5670916818457802, + "grad_norm": 0.38487958908081055, + "learning_rate": 0.00016301810428711263, + "loss": 0.2546, + "step": 2802 + }, + { + "epoch": 0.5672940700263105, + "grad_norm": 0.39645740389823914, + "learning_rate": 0.00016299340449094603, + "loss": 0.2231, + "step": 2803 + }, + { + "epoch": 0.5674964582068407, + "grad_norm": 0.26686328649520874, + "learning_rate": 0.00016296869832176772, + "loss": 0.2263, + "step": 2804 + }, + { + "epoch": 0.567698846387371, + "grad_norm": 0.3659006655216217, + "learning_rate": 0.00016294398578207728, + "loss": 0.2385, + "step": 2805 + }, + { + "epoch": 0.5679012345679012, + "grad_norm": 0.36053726077079773, + "learning_rate": 0.0001629192668743748, + "loss": 0.2548, + "step": 2806 + }, + { + "epoch": 0.5681036227484315, + "grad_norm": 0.29510796070098877, + "learning_rate": 0.0001628945416011611, + "loss": 0.2512, + "step": 2807 + }, + { + "epoch": 0.5683060109289617, + "grad_norm": 0.3900874853134155, + "learning_rate": 0.00016286980996493762, + "loss": 0.3024, + "step": 2808 + }, + { + "epoch": 0.568508399109492, + "grad_norm": 0.2818908393383026, + "learning_rate": 0.00016284507196820646, + "loss": 0.1875, + "step": 2809 + }, + { + "epoch": 0.5687107872900222, + "grad_norm": 0.25959712266921997, + "learning_rate": 0.00016282032761347036, + "loss": 0.2116, + "step": 2810 + }, + { + "epoch": 0.5689131754705525, + "grad_norm": 0.47624823451042175, + "learning_rate": 0.00016279557690323268, + "loss": 0.2554, + "step": 2811 + }, + { + "epoch": 0.5691155636510827, + "grad_norm": 0.27003055810928345, + "learning_rate": 0.0001627708198399974, + "loss": 0.2458, + "step": 2812 + }, + { + "epoch": 0.569317951831613, + "grad_norm": 0.34233883023262024, + "learning_rate": 0.00016274605642626925, + "loss": 0.2444, + "step": 2813 + }, + { + "epoch": 0.5695203400121432, + "grad_norm": 0.31025639176368713, + "learning_rate": 0.00016272128666455348, + "loss": 0.2504, + "step": 2814 + }, + { + "epoch": 0.5697227281926736, + "grad_norm": 0.3257082402706146, + "learning_rate": 0.00016269651055735604, + "loss": 0.2437, + "step": 2815 + }, + { + "epoch": 0.5699251163732038, + "grad_norm": 0.4323844313621521, + "learning_rate": 0.0001626717281071836, + "loss": 0.2304, + "step": 2816 + }, + { + "epoch": 0.5701275045537341, + "grad_norm": 0.3516029119491577, + "learning_rate": 0.00016264693931654324, + "loss": 0.2285, + "step": 2817 + }, + { + "epoch": 0.5703298927342643, + "grad_norm": 0.3017370104789734, + "learning_rate": 0.00016262214418794293, + "loss": 0.2512, + "step": 2818 + }, + { + "epoch": 0.5705322809147946, + "grad_norm": 0.33841025829315186, + "learning_rate": 0.00016259734272389115, + "loss": 0.271, + "step": 2819 + }, + { + "epoch": 0.5707346690953249, + "grad_norm": 0.43659186363220215, + "learning_rate": 0.0001625725349268971, + "loss": 0.2231, + "step": 2820 + }, + { + "epoch": 0.5709370572758551, + "grad_norm": 0.28855910897254944, + "learning_rate": 0.00016254772079947046, + "loss": 0.2396, + "step": 2821 + }, + { + "epoch": 0.5711394454563854, + "grad_norm": 0.25596854090690613, + "learning_rate": 0.00016252290034412175, + "loss": 0.2379, + "step": 2822 + }, + { + "epoch": 0.5713418336369156, + "grad_norm": 0.31578749418258667, + "learning_rate": 0.0001624980735633621, + "loss": 0.2174, + "step": 2823 + }, + { + "epoch": 0.5715442218174459, + "grad_norm": 0.35793131589889526, + "learning_rate": 0.0001624732404597031, + "loss": 0.2461, + "step": 2824 + }, + { + "epoch": 0.5717466099979761, + "grad_norm": 0.2687852084636688, + "learning_rate": 0.00016244840103565714, + "loss": 0.2315, + "step": 2825 + }, + { + "epoch": 0.5719489981785064, + "grad_norm": 0.5443539023399353, + "learning_rate": 0.00016242355529373726, + "loss": 0.2681, + "step": 2826 + }, + { + "epoch": 0.5721513863590366, + "grad_norm": 0.3889239430427551, + "learning_rate": 0.00016239870323645706, + "loss": 0.2648, + "step": 2827 + }, + { + "epoch": 0.5723537745395669, + "grad_norm": 0.2995070815086365, + "learning_rate": 0.00016237384486633078, + "loss": 0.2608, + "step": 2828 + }, + { + "epoch": 0.5725561627200971, + "grad_norm": 0.31988614797592163, + "learning_rate": 0.00016234898018587337, + "loss": 0.2706, + "step": 2829 + }, + { + "epoch": 0.5727585509006274, + "grad_norm": 0.3469448983669281, + "learning_rate": 0.00016232410919760036, + "loss": 0.2828, + "step": 2830 + }, + { + "epoch": 0.5729609390811576, + "grad_norm": 0.438338965177536, + "learning_rate": 0.00016229923190402796, + "loss": 0.2606, + "step": 2831 + }, + { + "epoch": 0.573163327261688, + "grad_norm": 0.25825679302215576, + "learning_rate": 0.00016227434830767294, + "loss": 0.2145, + "step": 2832 + }, + { + "epoch": 0.5733657154422181, + "grad_norm": 0.2764333486557007, + "learning_rate": 0.00016224945841105282, + "loss": 0.2257, + "step": 2833 + }, + { + "epoch": 0.5735681036227485, + "grad_norm": 0.3371870219707489, + "learning_rate": 0.00016222456221668568, + "loss": 0.2526, + "step": 2834 + }, + { + "epoch": 0.5737704918032787, + "grad_norm": 0.3069702088832855, + "learning_rate": 0.00016219965972709023, + "loss": 0.2591, + "step": 2835 + }, + { + "epoch": 0.573972879983809, + "grad_norm": 0.33921316266059875, + "learning_rate": 0.00016217475094478586, + "loss": 0.2205, + "step": 2836 + }, + { + "epoch": 0.5741752681643392, + "grad_norm": 0.3128065764904022, + "learning_rate": 0.0001621498358722926, + "loss": 0.2617, + "step": 2837 + }, + { + "epoch": 0.5743776563448695, + "grad_norm": 0.32519394159317017, + "learning_rate": 0.0001621249145121311, + "loss": 0.2146, + "step": 2838 + }, + { + "epoch": 0.5745800445253997, + "grad_norm": 0.32722973823547363, + "learning_rate": 0.00016209998686682258, + "loss": 0.2562, + "step": 2839 + }, + { + "epoch": 0.57478243270593, + "grad_norm": 0.2931281328201294, + "learning_rate": 0.00016207505293888903, + "loss": 0.204, + "step": 2840 + }, + { + "epoch": 0.5749848208864602, + "grad_norm": 0.4192622900009155, + "learning_rate": 0.00016205011273085293, + "loss": 0.2562, + "step": 2841 + }, + { + "epoch": 0.5751872090669905, + "grad_norm": 0.37887004017829895, + "learning_rate": 0.00016202516624523754, + "loss": 0.2936, + "step": 2842 + }, + { + "epoch": 0.5753895972475207, + "grad_norm": 0.3006739616394043, + "learning_rate": 0.00016200021348456662, + "loss": 0.2494, + "step": 2843 + }, + { + "epoch": 0.575591985428051, + "grad_norm": 0.3014276623725891, + "learning_rate": 0.00016197525445136468, + "loss": 0.2633, + "step": 2844 + }, + { + "epoch": 0.5757943736085812, + "grad_norm": 0.29328301548957825, + "learning_rate": 0.00016195028914815679, + "loss": 0.2494, + "step": 2845 + }, + { + "epoch": 0.5759967617891115, + "grad_norm": 0.30112847685813904, + "learning_rate": 0.00016192531757746868, + "loss": 0.2129, + "step": 2846 + }, + { + "epoch": 0.5761991499696417, + "grad_norm": 0.3580033779144287, + "learning_rate": 0.0001619003397418267, + "loss": 0.2478, + "step": 2847 + }, + { + "epoch": 0.576401538150172, + "grad_norm": 0.34292858839035034, + "learning_rate": 0.0001618753556437579, + "loss": 0.2374, + "step": 2848 + }, + { + "epoch": 0.5766039263307022, + "grad_norm": 0.4073190689086914, + "learning_rate": 0.00016185036528578985, + "loss": 0.2558, + "step": 2849 + }, + { + "epoch": 0.5768063145112325, + "grad_norm": 0.355729341506958, + "learning_rate": 0.00016182536867045082, + "loss": 0.2413, + "step": 2850 + }, + { + "epoch": 0.5768063145112325, + "eval_loss": 0.27183297276496887, + "eval_runtime": 0.7381, + "eval_samples_per_second": 6.775, + "eval_steps_per_second": 1.355, + "step": 2850 + }, + { + "epoch": 0.5770087026917629, + "grad_norm": 0.4269033670425415, + "learning_rate": 0.00016180036580026972, + "loss": 0.2233, + "step": 2851 + }, + { + "epoch": 0.577211090872293, + "grad_norm": 0.284562885761261, + "learning_rate": 0.00016177535667777603, + "loss": 0.2212, + "step": 2852 + }, + { + "epoch": 0.5774134790528234, + "grad_norm": 0.3404622972011566, + "learning_rate": 0.00016175034130550003, + "loss": 0.2369, + "step": 2853 + }, + { + "epoch": 0.5776158672333536, + "grad_norm": 0.37222614884376526, + "learning_rate": 0.00016172531968597234, + "loss": 0.2291, + "step": 2854 + }, + { + "epoch": 0.5778182554138839, + "grad_norm": 0.37149935960769653, + "learning_rate": 0.00016170029182172458, + "loss": 0.2321, + "step": 2855 + }, + { + "epoch": 0.5780206435944141, + "grad_norm": 0.2935652732849121, + "learning_rate": 0.00016167525771528863, + "loss": 0.2377, + "step": 2856 + }, + { + "epoch": 0.5782230317749444, + "grad_norm": 0.3011722266674042, + "learning_rate": 0.0001616502173691973, + "loss": 0.238, + "step": 2857 + }, + { + "epoch": 0.5784254199554746, + "grad_norm": 0.24494045972824097, + "learning_rate": 0.00016162517078598384, + "loss": 0.2217, + "step": 2858 + }, + { + "epoch": 0.5786278081360049, + "grad_norm": 0.3266785740852356, + "learning_rate": 0.00016160011796818223, + "loss": 0.2459, + "step": 2859 + }, + { + "epoch": 0.5788301963165351, + "grad_norm": 0.42591631412506104, + "learning_rate": 0.00016157505891832707, + "loss": 0.2655, + "step": 2860 + }, + { + "epoch": 0.5790325844970654, + "grad_norm": 0.33396992087364197, + "learning_rate": 0.00016154999363895354, + "loss": 0.2523, + "step": 2861 + }, + { + "epoch": 0.5792349726775956, + "grad_norm": 0.3604094088077545, + "learning_rate": 0.0001615249221325975, + "loss": 0.2697, + "step": 2862 + }, + { + "epoch": 0.5794373608581259, + "grad_norm": 0.3391878306865692, + "learning_rate": 0.00016149984440179537, + "loss": 0.2264, + "step": 2863 + }, + { + "epoch": 0.5796397490386561, + "grad_norm": 0.2555520832538605, + "learning_rate": 0.0001614747604490843, + "loss": 0.223, + "step": 2864 + }, + { + "epoch": 0.5798421372191864, + "grad_norm": 0.30228936672210693, + "learning_rate": 0.00016144967027700204, + "loss": 0.2645, + "step": 2865 + }, + { + "epoch": 0.5800445253997166, + "grad_norm": 0.5504426956176758, + "learning_rate": 0.00016142457388808695, + "loss": 0.2548, + "step": 2866 + }, + { + "epoch": 0.5802469135802469, + "grad_norm": 0.324349582195282, + "learning_rate": 0.00016139947128487796, + "loss": 0.2272, + "step": 2867 + }, + { + "epoch": 0.5804493017607771, + "grad_norm": 0.29885464906692505, + "learning_rate": 0.0001613743624699147, + "loss": 0.2381, + "step": 2868 + }, + { + "epoch": 0.5806516899413074, + "grad_norm": 0.2752251625061035, + "learning_rate": 0.00016134924744573746, + "loss": 0.2473, + "step": 2869 + }, + { + "epoch": 0.5808540781218376, + "grad_norm": 0.40981918573379517, + "learning_rate": 0.0001613241262148871, + "loss": 0.2638, + "step": 2870 + }, + { + "epoch": 0.581056466302368, + "grad_norm": 0.367501437664032, + "learning_rate": 0.00016129899877990512, + "loss": 0.254, + "step": 2871 + }, + { + "epoch": 0.5812588544828982, + "grad_norm": 0.36186274886131287, + "learning_rate": 0.00016127386514333368, + "loss": 0.291, + "step": 2872 + }, + { + "epoch": 0.5814612426634285, + "grad_norm": 0.30635300278663635, + "learning_rate": 0.00016124872530771546, + "loss": 0.24, + "step": 2873 + }, + { + "epoch": 0.5816636308439587, + "grad_norm": 0.3640815019607544, + "learning_rate": 0.00016122357927559388, + "loss": 0.2207, + "step": 2874 + }, + { + "epoch": 0.581866019024489, + "grad_norm": 0.37263694405555725, + "learning_rate": 0.00016119842704951298, + "loss": 0.2656, + "step": 2875 + }, + { + "epoch": 0.5820684072050192, + "grad_norm": 0.34229201078414917, + "learning_rate": 0.00016117326863201737, + "loss": 0.27, + "step": 2876 + }, + { + "epoch": 0.5822707953855495, + "grad_norm": 0.3659871518611908, + "learning_rate": 0.00016114810402565236, + "loss": 0.2576, + "step": 2877 + }, + { + "epoch": 0.5824731835660797, + "grad_norm": 0.2683655619621277, + "learning_rate": 0.00016112293323296376, + "loss": 0.2232, + "step": 2878 + }, + { + "epoch": 0.58267557174661, + "grad_norm": 0.2627432644367218, + "learning_rate": 0.00016109775625649815, + "loss": 0.2384, + "step": 2879 + }, + { + "epoch": 0.5828779599271403, + "grad_norm": 0.5119886994361877, + "learning_rate": 0.0001610725730988026, + "loss": 0.2156, + "step": 2880 + }, + { + "epoch": 0.5830803481076705, + "grad_norm": 0.3302849531173706, + "learning_rate": 0.00016104738376242501, + "loss": 0.211, + "step": 2881 + }, + { + "epoch": 0.5832827362882008, + "grad_norm": 0.34468814730644226, + "learning_rate": 0.00016102218824991365, + "loss": 0.2415, + "step": 2882 + }, + { + "epoch": 0.583485124468731, + "grad_norm": 0.2802000045776367, + "learning_rate": 0.00016099698656381762, + "loss": 0.2105, + "step": 2883 + }, + { + "epoch": 0.5836875126492613, + "grad_norm": 0.3089223802089691, + "learning_rate": 0.0001609717787066865, + "loss": 0.2407, + "step": 2884 + }, + { + "epoch": 0.5838899008297915, + "grad_norm": 0.3115883767604828, + "learning_rate": 0.00016094656468107057, + "loss": 0.2526, + "step": 2885 + }, + { + "epoch": 0.5840922890103218, + "grad_norm": 0.35136011242866516, + "learning_rate": 0.00016092134448952074, + "loss": 0.2089, + "step": 2886 + }, + { + "epoch": 0.584294677190852, + "grad_norm": 0.5641045570373535, + "learning_rate": 0.0001608961181345885, + "loss": 0.2535, + "step": 2887 + }, + { + "epoch": 0.5844970653713824, + "grad_norm": 0.44940614700317383, + "learning_rate": 0.00016087088561882605, + "loss": 0.2245, + "step": 2888 + }, + { + "epoch": 0.5846994535519126, + "grad_norm": 0.26401445269584656, + "learning_rate": 0.00016084564694478605, + "loss": 0.2194, + "step": 2889 + }, + { + "epoch": 0.5849018417324429, + "grad_norm": 0.28557443618774414, + "learning_rate": 0.00016082040211502197, + "loss": 0.2326, + "step": 2890 + }, + { + "epoch": 0.5851042299129731, + "grad_norm": 0.3739382028579712, + "learning_rate": 0.00016079515113208776, + "loss": 0.2574, + "step": 2891 + }, + { + "epoch": 0.5853066180935034, + "grad_norm": 0.40503665804862976, + "learning_rate": 0.00016076989399853807, + "loss": 0.2534, + "step": 2892 + }, + { + "epoch": 0.5855090062740336, + "grad_norm": 0.34489211440086365, + "learning_rate": 0.0001607446307169282, + "loss": 0.261, + "step": 2893 + }, + { + "epoch": 0.5857113944545639, + "grad_norm": 0.4603923261165619, + "learning_rate": 0.00016071936128981396, + "loss": 0.248, + "step": 2894 + }, + { + "epoch": 0.5859137826350941, + "grad_norm": 0.45358771085739136, + "learning_rate": 0.00016069408571975187, + "loss": 0.2285, + "step": 2895 + }, + { + "epoch": 0.5861161708156244, + "grad_norm": 0.34958937764167786, + "learning_rate": 0.000160668804009299, + "loss": 0.2142, + "step": 2896 + }, + { + "epoch": 0.5863185589961546, + "grad_norm": 0.40218639373779297, + "learning_rate": 0.00016064351616101318, + "loss": 0.2736, + "step": 2897 + }, + { + "epoch": 0.5865209471766849, + "grad_norm": 0.2853103280067444, + "learning_rate": 0.0001606182221774527, + "loss": 0.2249, + "step": 2898 + }, + { + "epoch": 0.5867233353572151, + "grad_norm": 0.35319843888282776, + "learning_rate": 0.00016059292206117655, + "loss": 0.2332, + "step": 2899 + }, + { + "epoch": 0.5869257235377454, + "grad_norm": 0.3640676736831665, + "learning_rate": 0.00016056761581474438, + "loss": 0.2451, + "step": 2900 + }, + { + "epoch": 0.5869257235377454, + "eval_loss": 0.2811625003814697, + "eval_runtime": 0.7382, + "eval_samples_per_second": 6.774, + "eval_steps_per_second": 1.355, + "step": 2900 + }, + { + "epoch": 0.5871281117182756, + "grad_norm": 0.426506370306015, + "learning_rate": 0.00016054230344071636, + "loss": 0.2452, + "step": 2901 + }, + { + "epoch": 0.5873304998988059, + "grad_norm": 0.4590941071510315, + "learning_rate": 0.0001605169849416533, + "loss": 0.2189, + "step": 2902 + }, + { + "epoch": 0.5875328880793361, + "grad_norm": 0.2869848310947418, + "learning_rate": 0.00016049166032011672, + "loss": 0.2412, + "step": 2903 + }, + { + "epoch": 0.5877352762598664, + "grad_norm": 0.3106136918067932, + "learning_rate": 0.0001604663295786687, + "loss": 0.2225, + "step": 2904 + }, + { + "epoch": 0.5879376644403966, + "grad_norm": 0.3410221040248871, + "learning_rate": 0.0001604409927198719, + "loss": 0.2274, + "step": 2905 + }, + { + "epoch": 0.588140052620927, + "grad_norm": 0.3309195935726166, + "learning_rate": 0.0001604156497462897, + "loss": 0.2532, + "step": 2906 + }, + { + "epoch": 0.5883424408014571, + "grad_norm": 0.31511515378952026, + "learning_rate": 0.00016039030066048593, + "loss": 0.2601, + "step": 2907 + }, + { + "epoch": 0.5885448289819875, + "grad_norm": 0.3180387318134308, + "learning_rate": 0.00016036494546502525, + "loss": 0.2318, + "step": 2908 + }, + { + "epoch": 0.5887472171625177, + "grad_norm": 0.3602931797504425, + "learning_rate": 0.00016033958416247277, + "loss": 0.3103, + "step": 2909 + }, + { + "epoch": 0.588949605343048, + "grad_norm": 0.3041614294052124, + "learning_rate": 0.00016031421675539428, + "loss": 0.2982, + "step": 2910 + }, + { + "epoch": 0.5891519935235783, + "grad_norm": 0.35607126355171204, + "learning_rate": 0.00016028884324635625, + "loss": 0.2401, + "step": 2911 + }, + { + "epoch": 0.5893543817041085, + "grad_norm": 0.37263765931129456, + "learning_rate": 0.00016026346363792567, + "loss": 0.2691, + "step": 2912 + }, + { + "epoch": 0.5895567698846388, + "grad_norm": 0.315032422542572, + "learning_rate": 0.00016023807793267013, + "loss": 0.2549, + "step": 2913 + }, + { + "epoch": 0.589759158065169, + "grad_norm": 0.35683658719062805, + "learning_rate": 0.00016021268613315796, + "loss": 0.25, + "step": 2914 + }, + { + "epoch": 0.5899615462456993, + "grad_norm": 0.3754711449146271, + "learning_rate": 0.000160187288241958, + "loss": 0.2467, + "step": 2915 + }, + { + "epoch": 0.5901639344262295, + "grad_norm": 0.31732767820358276, + "learning_rate": 0.0001601618842616398, + "loss": 0.2295, + "step": 2916 + }, + { + "epoch": 0.5903663226067598, + "grad_norm": 0.2917217016220093, + "learning_rate": 0.00016013647419477339, + "loss": 0.2308, + "step": 2917 + }, + { + "epoch": 0.59056871078729, + "grad_norm": 0.31983745098114014, + "learning_rate": 0.00016011105804392953, + "loss": 0.2232, + "step": 2918 + }, + { + "epoch": 0.5907710989678203, + "grad_norm": 0.2851954698562622, + "learning_rate": 0.00016008563581167953, + "loss": 0.2266, + "step": 2919 + }, + { + "epoch": 0.5909734871483505, + "grad_norm": 0.38204529881477356, + "learning_rate": 0.00016006020750059538, + "loss": 0.22, + "step": 2920 + }, + { + "epoch": 0.5911758753288808, + "grad_norm": 0.2778181731700897, + "learning_rate": 0.00016003477311324964, + "loss": 0.2385, + "step": 2921 + }, + { + "epoch": 0.591378263509411, + "grad_norm": 0.3321495056152344, + "learning_rate": 0.0001600093326522155, + "loss": 0.2163, + "step": 2922 + }, + { + "epoch": 0.5915806516899413, + "grad_norm": 0.3323008418083191, + "learning_rate": 0.00015998388612006677, + "loss": 0.2798, + "step": 2923 + }, + { + "epoch": 0.5917830398704715, + "grad_norm": 0.28340160846710205, + "learning_rate": 0.00015995843351937781, + "loss": 0.219, + "step": 2924 + }, + { + "epoch": 0.5919854280510018, + "grad_norm": 0.6043773293495178, + "learning_rate": 0.00015993297485272372, + "loss": 0.2479, + "step": 2925 + }, + { + "epoch": 0.592187816231532, + "grad_norm": 0.5740264058113098, + "learning_rate": 0.00015990751012268009, + "loss": 0.2423, + "step": 2926 + }, + { + "epoch": 0.5923902044120624, + "grad_norm": 0.2924440801143646, + "learning_rate": 0.00015988203933182317, + "loss": 0.2033, + "step": 2927 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 0.2408752590417862, + "learning_rate": 0.00015985656248272988, + "loss": 0.186, + "step": 2928 + }, + { + "epoch": 0.5927949807731229, + "grad_norm": 0.3335459232330322, + "learning_rate": 0.00015983107957797765, + "loss": 0.2791, + "step": 2929 + }, + { + "epoch": 0.5929973689536531, + "grad_norm": 0.31838342547416687, + "learning_rate": 0.0001598055906201446, + "loss": 0.228, + "step": 2930 + }, + { + "epoch": 0.5931997571341834, + "grad_norm": 0.37246522307395935, + "learning_rate": 0.0001597800956118094, + "loss": 0.2954, + "step": 2931 + }, + { + "epoch": 0.5934021453147136, + "grad_norm": 0.2864042818546295, + "learning_rate": 0.0001597545945555514, + "loss": 0.2385, + "step": 2932 + }, + { + "epoch": 0.5936045334952439, + "grad_norm": 0.30606046319007874, + "learning_rate": 0.00015972908745395052, + "loss": 0.2636, + "step": 2933 + }, + { + "epoch": 0.5938069216757741, + "grad_norm": 0.3238597512245178, + "learning_rate": 0.0001597035743095873, + "loss": 0.2839, + "step": 2934 + }, + { + "epoch": 0.5940093098563044, + "grad_norm": 0.2814807593822479, + "learning_rate": 0.0001596780551250429, + "loss": 0.2309, + "step": 2935 + }, + { + "epoch": 0.5942116980368346, + "grad_norm": 0.33480608463287354, + "learning_rate": 0.00015965252990289908, + "loss": 0.2248, + "step": 2936 + }, + { + "epoch": 0.5944140862173649, + "grad_norm": 0.5626258254051208, + "learning_rate": 0.0001596269986457382, + "loss": 0.2519, + "step": 2937 + }, + { + "epoch": 0.5946164743978951, + "grad_norm": 0.32387876510620117, + "learning_rate": 0.00015960146135614328, + "loss": 0.2625, + "step": 2938 + }, + { + "epoch": 0.5948188625784254, + "grad_norm": 0.2990744113922119, + "learning_rate": 0.00015957591803669784, + "loss": 0.2248, + "step": 2939 + }, + { + "epoch": 0.5950212507589556, + "grad_norm": 0.28303173184394836, + "learning_rate": 0.00015955036868998618, + "loss": 0.2518, + "step": 2940 + }, + { + "epoch": 0.5952236389394859, + "grad_norm": 0.5194461941719055, + "learning_rate": 0.00015952481331859306, + "loss": 0.2418, + "step": 2941 + }, + { + "epoch": 0.5954260271200162, + "grad_norm": 0.3277747929096222, + "learning_rate": 0.00015949925192510392, + "loss": 0.2527, + "step": 2942 + }, + { + "epoch": 0.5956284153005464, + "grad_norm": 0.3047173321247101, + "learning_rate": 0.00015947368451210478, + "loss": 0.2458, + "step": 2943 + }, + { + "epoch": 0.5958308034810768, + "grad_norm": 0.27463477849960327, + "learning_rate": 0.0001594481110821823, + "loss": 0.2188, + "step": 2944 + }, + { + "epoch": 0.596033191661607, + "grad_norm": 0.2871634364128113, + "learning_rate": 0.00015942253163792373, + "loss": 0.2431, + "step": 2945 + }, + { + "epoch": 0.5962355798421373, + "grad_norm": 0.28732845187187195, + "learning_rate": 0.0001593969461819169, + "loss": 0.1976, + "step": 2946 + }, + { + "epoch": 0.5964379680226675, + "grad_norm": 0.3001699447631836, + "learning_rate": 0.0001593713547167503, + "loss": 0.2232, + "step": 2947 + }, + { + "epoch": 0.5966403562031978, + "grad_norm": 0.819691002368927, + "learning_rate": 0.000159345757245013, + "loss": 0.2685, + "step": 2948 + }, + { + "epoch": 0.596842744383728, + "grad_norm": 0.28296270966529846, + "learning_rate": 0.00015932015376929475, + "loss": 0.249, + "step": 2949 + }, + { + "epoch": 0.5970451325642583, + "grad_norm": 0.3961198627948761, + "learning_rate": 0.00015929454429218574, + "loss": 0.2562, + "step": 2950 + }, + { + "epoch": 0.5970451325642583, + "eval_loss": 0.2750966250896454, + "eval_runtime": 0.7417, + "eval_samples_per_second": 6.741, + "eval_steps_per_second": 1.348, + "step": 2950 + }, + { + "epoch": 0.5972475207447885, + "grad_norm": 0.33126339316368103, + "learning_rate": 0.00015926892881627688, + "loss": 0.2809, + "step": 2951 + }, + { + "epoch": 0.5974499089253188, + "grad_norm": 0.3269531726837158, + "learning_rate": 0.00015924330734415975, + "loss": 0.2657, + "step": 2952 + }, + { + "epoch": 0.597652297105849, + "grad_norm": 0.29826268553733826, + "learning_rate": 0.0001592176798784264, + "loss": 0.2303, + "step": 2953 + }, + { + "epoch": 0.5978546852863793, + "grad_norm": 0.29660069942474365, + "learning_rate": 0.00015919204642166954, + "loss": 0.2494, + "step": 2954 + }, + { + "epoch": 0.5980570734669095, + "grad_norm": 0.49443933367729187, + "learning_rate": 0.00015916640697648254, + "loss": 0.2448, + "step": 2955 + }, + { + "epoch": 0.5982594616474398, + "grad_norm": 0.27439481019973755, + "learning_rate": 0.00015914076154545931, + "loss": 0.2036, + "step": 2956 + }, + { + "epoch": 0.59846184982797, + "grad_norm": 0.29085221886634827, + "learning_rate": 0.00015911511013119438, + "loss": 0.2785, + "step": 2957 + }, + { + "epoch": 0.5986642380085003, + "grad_norm": 0.2833772599697113, + "learning_rate": 0.0001590894527362829, + "loss": 0.205, + "step": 2958 + }, + { + "epoch": 0.5988666261890305, + "grad_norm": 0.3200933039188385, + "learning_rate": 0.00015906378936332062, + "loss": 0.2875, + "step": 2959 + }, + { + "epoch": 0.5990690143695608, + "grad_norm": 0.28993549942970276, + "learning_rate": 0.00015903812001490385, + "loss": 0.2417, + "step": 2960 + }, + { + "epoch": 0.599271402550091, + "grad_norm": 0.2662200927734375, + "learning_rate": 0.00015901244469362962, + "loss": 0.2315, + "step": 2961 + }, + { + "epoch": 0.5994737907306213, + "grad_norm": 0.26462310552597046, + "learning_rate": 0.0001589867634020954, + "loss": 0.2452, + "step": 2962 + }, + { + "epoch": 0.5996761789111515, + "grad_norm": 0.3398553431034088, + "learning_rate": 0.00015896107614289944, + "loss": 0.2189, + "step": 2963 + }, + { + "epoch": 0.5998785670916819, + "grad_norm": 0.49576640129089355, + "learning_rate": 0.00015893538291864045, + "loss": 0.279, + "step": 2964 + }, + { + "epoch": 0.6000809552722121, + "grad_norm": 0.2650597095489502, + "learning_rate": 0.0001589096837319178, + "loss": 0.2216, + "step": 2965 + }, + { + "epoch": 0.6002833434527424, + "grad_norm": 0.2608364522457123, + "learning_rate": 0.00015888397858533152, + "loss": 0.2431, + "step": 2966 + }, + { + "epoch": 0.6004857316332726, + "grad_norm": 0.29076239466667175, + "learning_rate": 0.00015885826748148212, + "loss": 0.2963, + "step": 2967 + }, + { + "epoch": 0.6006881198138029, + "grad_norm": 0.2960323095321655, + "learning_rate": 0.0001588325504229708, + "loss": 0.2443, + "step": 2968 + }, + { + "epoch": 0.6008905079943331, + "grad_norm": 0.2616303265094757, + "learning_rate": 0.0001588068274123994, + "loss": 0.2154, + "step": 2969 + }, + { + "epoch": 0.6010928961748634, + "grad_norm": 0.3281814157962799, + "learning_rate": 0.00015878109845237018, + "loss": 0.2384, + "step": 2970 + }, + { + "epoch": 0.6012952843553937, + "grad_norm": 0.32332029938697815, + "learning_rate": 0.00015875536354548628, + "loss": 0.2669, + "step": 2971 + }, + { + "epoch": 0.6014976725359239, + "grad_norm": 0.3212827742099762, + "learning_rate": 0.0001587296226943512, + "loss": 0.2387, + "step": 2972 + }, + { + "epoch": 0.6017000607164542, + "grad_norm": 0.38467156887054443, + "learning_rate": 0.0001587038759015691, + "loss": 0.2722, + "step": 2973 + }, + { + "epoch": 0.6019024488969844, + "grad_norm": 0.3750758767127991, + "learning_rate": 0.00015867812316974482, + "loss": 0.2462, + "step": 2974 + }, + { + "epoch": 0.6021048370775147, + "grad_norm": 0.3365626931190491, + "learning_rate": 0.00015865236450148372, + "loss": 0.2425, + "step": 2975 + }, + { + "epoch": 0.6023072252580449, + "grad_norm": 0.23194032907485962, + "learning_rate": 0.00015862659989939184, + "loss": 0.1905, + "step": 2976 + }, + { + "epoch": 0.6025096134385752, + "grad_norm": 0.31470999121665955, + "learning_rate": 0.00015860082936607574, + "loss": 0.2203, + "step": 2977 + }, + { + "epoch": 0.6027120016191054, + "grad_norm": 0.2919837534427643, + "learning_rate": 0.00015857505290414262, + "loss": 0.2329, + "step": 2978 + }, + { + "epoch": 0.6029143897996357, + "grad_norm": 0.3553715944290161, + "learning_rate": 0.00015854927051620025, + "loss": 0.2522, + "step": 2979 + }, + { + "epoch": 0.6031167779801659, + "grad_norm": 0.30528631806373596, + "learning_rate": 0.00015852348220485706, + "loss": 0.2241, + "step": 2980 + }, + { + "epoch": 0.6033191661606963, + "grad_norm": 0.2882617115974426, + "learning_rate": 0.00015849768797272201, + "loss": 0.2102, + "step": 2981 + }, + { + "epoch": 0.6035215543412265, + "grad_norm": 0.33525657653808594, + "learning_rate": 0.0001584718878224047, + "loss": 0.2158, + "step": 2982 + }, + { + "epoch": 0.6037239425217568, + "grad_norm": 0.33792588114738464, + "learning_rate": 0.00015844608175651534, + "loss": 0.2676, + "step": 2983 + }, + { + "epoch": 0.603926330702287, + "grad_norm": 0.3096262812614441, + "learning_rate": 0.0001584202697776647, + "loss": 0.2145, + "step": 2984 + }, + { + "epoch": 0.6041287188828173, + "grad_norm": 0.4227834641933441, + "learning_rate": 0.00015839445188846414, + "loss": 0.2795, + "step": 2985 + }, + { + "epoch": 0.6043311070633475, + "grad_norm": 0.32572686672210693, + "learning_rate": 0.00015836862809152566, + "loss": 0.2603, + "step": 2986 + }, + { + "epoch": 0.6045334952438778, + "grad_norm": 0.7060582041740417, + "learning_rate": 0.0001583427983894618, + "loss": 0.2504, + "step": 2987 + }, + { + "epoch": 0.604735883424408, + "grad_norm": 0.42503786087036133, + "learning_rate": 0.00015831696278488587, + "loss": 0.2309, + "step": 2988 + }, + { + "epoch": 0.6049382716049383, + "grad_norm": 0.29326146841049194, + "learning_rate": 0.00015829112128041151, + "loss": 0.2404, + "step": 2989 + }, + { + "epoch": 0.6051406597854685, + "grad_norm": 0.2777334749698639, + "learning_rate": 0.00015826527387865314, + "loss": 0.2759, + "step": 2990 + }, + { + "epoch": 0.6053430479659988, + "grad_norm": 0.26709550619125366, + "learning_rate": 0.00015823942058222574, + "loss": 0.2435, + "step": 2991 + }, + { + "epoch": 0.605545436146529, + "grad_norm": 0.31071168184280396, + "learning_rate": 0.00015821356139374487, + "loss": 0.2403, + "step": 2992 + }, + { + "epoch": 0.6057478243270593, + "grad_norm": 0.2868014872074127, + "learning_rate": 0.00015818769631582668, + "loss": 0.2331, + "step": 2993 + }, + { + "epoch": 0.6059502125075895, + "grad_norm": 0.32010069489479065, + "learning_rate": 0.00015816182535108796, + "loss": 0.239, + "step": 2994 + }, + { + "epoch": 0.6061526006881198, + "grad_norm": 0.34212884306907654, + "learning_rate": 0.000158135948502146, + "loss": 0.2375, + "step": 2995 + }, + { + "epoch": 0.60635498886865, + "grad_norm": 0.27103322744369507, + "learning_rate": 0.0001581100657716188, + "loss": 0.2451, + "step": 2996 + }, + { + "epoch": 0.6065573770491803, + "grad_norm": 0.3346523940563202, + "learning_rate": 0.00015808417716212488, + "loss": 0.253, + "step": 2997 + }, + { + "epoch": 0.6067597652297105, + "grad_norm": 0.25295400619506836, + "learning_rate": 0.00015805828267628338, + "loss": 0.2306, + "step": 2998 + }, + { + "epoch": 0.6069621534102408, + "grad_norm": 0.46044930815696716, + "learning_rate": 0.00015803238231671405, + "loss": 0.2422, + "step": 2999 + }, + { + "epoch": 0.607164541590771, + "grad_norm": 0.3438783884048462, + "learning_rate": 0.0001580064760860372, + "loss": 0.2628, + "step": 3000 + }, + { + "epoch": 0.607164541590771, + "eval_loss": 0.2774945795536041, + "eval_runtime": 0.7407, + "eval_samples_per_second": 6.75, + "eval_steps_per_second": 1.35, + "step": 3000 + }, + { + "epoch": 0.6073669297713014, + "grad_norm": 0.2550566494464874, + "learning_rate": 0.00015798056398687375, + "loss": 0.2401, + "step": 3001 + }, + { + "epoch": 0.6075693179518317, + "grad_norm": 0.40897125005722046, + "learning_rate": 0.0001579546460218452, + "loss": 0.2242, + "step": 3002 + }, + { + "epoch": 0.6077717061323619, + "grad_norm": 0.33101367950439453, + "learning_rate": 0.0001579287221935737, + "loss": 0.2573, + "step": 3003 + }, + { + "epoch": 0.6079740943128922, + "grad_norm": 0.3354332745075226, + "learning_rate": 0.00015790279250468194, + "loss": 0.2868, + "step": 3004 + }, + { + "epoch": 0.6081764824934224, + "grad_norm": 0.280505508184433, + "learning_rate": 0.0001578768569577932, + "loss": 0.236, + "step": 3005 + }, + { + "epoch": 0.6083788706739527, + "grad_norm": 0.3221960663795471, + "learning_rate": 0.00015785091555553136, + "loss": 0.2703, + "step": 3006 + }, + { + "epoch": 0.6085812588544829, + "grad_norm": 0.29745444655418396, + "learning_rate": 0.00015782496830052089, + "loss": 0.2662, + "step": 3007 + }, + { + "epoch": 0.6087836470350132, + "grad_norm": 0.3765595257282257, + "learning_rate": 0.00015779901519538688, + "loss": 0.29, + "step": 3008 + }, + { + "epoch": 0.6089860352155434, + "grad_norm": 0.4914434850215912, + "learning_rate": 0.00015777305624275502, + "loss": 0.2435, + "step": 3009 + }, + { + "epoch": 0.6091884233960737, + "grad_norm": 0.3370167016983032, + "learning_rate": 0.0001577470914452515, + "loss": 0.219, + "step": 3010 + }, + { + "epoch": 0.6093908115766039, + "grad_norm": 0.268723726272583, + "learning_rate": 0.0001577211208055032, + "loss": 0.232, + "step": 3011 + }, + { + "epoch": 0.6095931997571342, + "grad_norm": 0.29349520802497864, + "learning_rate": 0.0001576951443261376, + "loss": 0.2005, + "step": 3012 + }, + { + "epoch": 0.6097955879376644, + "grad_norm": 0.3591350317001343, + "learning_rate": 0.00015766916200978266, + "loss": 0.2533, + "step": 3013 + }, + { + "epoch": 0.6099979761181947, + "grad_norm": 0.38604936003685, + "learning_rate": 0.00015764317385906702, + "loss": 0.2507, + "step": 3014 + }, + { + "epoch": 0.6102003642987249, + "grad_norm": 0.3372388780117035, + "learning_rate": 0.0001576171798766199, + "loss": 0.2661, + "step": 3015 + }, + { + "epoch": 0.6104027524792552, + "grad_norm": 0.27703356742858887, + "learning_rate": 0.0001575911800650711, + "loss": 0.2547, + "step": 3016 + }, + { + "epoch": 0.6106051406597854, + "grad_norm": 0.32858291268348694, + "learning_rate": 0.00015756517442705098, + "loss": 0.2417, + "step": 3017 + }, + { + "epoch": 0.6108075288403157, + "grad_norm": 0.30357709527015686, + "learning_rate": 0.00015753916296519055, + "loss": 0.2563, + "step": 3018 + }, + { + "epoch": 0.611009917020846, + "grad_norm": 0.354936808347702, + "learning_rate": 0.00015751314568212134, + "loss": 0.2869, + "step": 3019 + }, + { + "epoch": 0.6112123052013763, + "grad_norm": 0.30504146218299866, + "learning_rate": 0.00015748712258047552, + "loss": 0.2704, + "step": 3020 + }, + { + "epoch": 0.6114146933819065, + "grad_norm": 0.3150479197502136, + "learning_rate": 0.0001574610936628859, + "loss": 0.2273, + "step": 3021 + }, + { + "epoch": 0.6116170815624368, + "grad_norm": 0.32766446471214294, + "learning_rate": 0.0001574350589319857, + "loss": 0.2693, + "step": 3022 + }, + { + "epoch": 0.611819469742967, + "grad_norm": 0.2615867257118225, + "learning_rate": 0.00015740901839040894, + "loss": 0.2343, + "step": 3023 + }, + { + "epoch": 0.6120218579234973, + "grad_norm": 0.3711775839328766, + "learning_rate": 0.00015738297204079005, + "loss": 0.2863, + "step": 3024 + }, + { + "epoch": 0.6122242461040275, + "grad_norm": 0.33009645342826843, + "learning_rate": 0.00015735691988576415, + "loss": 0.2445, + "step": 3025 + }, + { + "epoch": 0.6124266342845578, + "grad_norm": 0.39618271589279175, + "learning_rate": 0.00015733086192796697, + "loss": 0.312, + "step": 3026 + }, + { + "epoch": 0.612629022465088, + "grad_norm": 0.29640382528305054, + "learning_rate": 0.00015730479817003474, + "loss": 0.2673, + "step": 3027 + }, + { + "epoch": 0.6128314106456183, + "grad_norm": 0.2582768499851227, + "learning_rate": 0.00015727872861460434, + "loss": 0.2052, + "step": 3028 + }, + { + "epoch": 0.6130337988261485, + "grad_norm": 0.2651136517524719, + "learning_rate": 0.00015725265326431315, + "loss": 0.2238, + "step": 3029 + }, + { + "epoch": 0.6132361870066788, + "grad_norm": 0.2757776081562042, + "learning_rate": 0.00015722657212179927, + "loss": 0.244, + "step": 3030 + }, + { + "epoch": 0.613438575187209, + "grad_norm": 0.28894442319869995, + "learning_rate": 0.00015720048518970132, + "loss": 0.2371, + "step": 3031 + }, + { + "epoch": 0.6136409633677393, + "grad_norm": 0.3005363643169403, + "learning_rate": 0.00015717439247065843, + "loss": 0.2292, + "step": 3032 + }, + { + "epoch": 0.6138433515482696, + "grad_norm": 0.2538183629512787, + "learning_rate": 0.00015714829396731048, + "loss": 0.2208, + "step": 3033 + }, + { + "epoch": 0.6140457397287998, + "grad_norm": 0.44336000084877014, + "learning_rate": 0.00015712218968229777, + "loss": 0.2328, + "step": 3034 + }, + { + "epoch": 0.6142481279093301, + "grad_norm": 0.25125953555107117, + "learning_rate": 0.00015709607961826127, + "loss": 0.2027, + "step": 3035 + }, + { + "epoch": 0.6144505160898603, + "grad_norm": 0.26832088828086853, + "learning_rate": 0.00015706996377784255, + "loss": 0.1998, + "step": 3036 + }, + { + "epoch": 0.6146529042703907, + "grad_norm": 0.29760006070137024, + "learning_rate": 0.00015704384216368373, + "loss": 0.2625, + "step": 3037 + }, + { + "epoch": 0.6148552924509209, + "grad_norm": 0.3440050482749939, + "learning_rate": 0.00015701771477842752, + "loss": 0.2569, + "step": 3038 + }, + { + "epoch": 0.6150576806314512, + "grad_norm": 0.6179642677307129, + "learning_rate": 0.0001569915816247172, + "loss": 0.2382, + "step": 3039 + }, + { + "epoch": 0.6152600688119814, + "grad_norm": 0.27009207010269165, + "learning_rate": 0.00015696544270519665, + "loss": 0.1881, + "step": 3040 + }, + { + "epoch": 0.6154624569925117, + "grad_norm": 0.37599071860313416, + "learning_rate": 0.00015693929802251038, + "loss": 0.2555, + "step": 3041 + }, + { + "epoch": 0.6156648451730419, + "grad_norm": 0.24889563024044037, + "learning_rate": 0.00015691314757930336, + "loss": 0.2501, + "step": 3042 + }, + { + "epoch": 0.6158672333535722, + "grad_norm": 0.2929196357727051, + "learning_rate": 0.00015688699137822126, + "loss": 0.2259, + "step": 3043 + }, + { + "epoch": 0.6160696215341024, + "grad_norm": 0.40753883123397827, + "learning_rate": 0.00015686082942191026, + "loss": 0.2045, + "step": 3044 + }, + { + "epoch": 0.6162720097146327, + "grad_norm": 0.266451358795166, + "learning_rate": 0.0001568346617130172, + "loss": 0.2347, + "step": 3045 + }, + { + "epoch": 0.6164743978951629, + "grad_norm": 0.38660845160484314, + "learning_rate": 0.0001568084882541894, + "loss": 0.2777, + "step": 3046 + }, + { + "epoch": 0.6166767860756932, + "grad_norm": 0.3965816795825958, + "learning_rate": 0.00015678230904807484, + "loss": 0.2591, + "step": 3047 + }, + { + "epoch": 0.6168791742562234, + "grad_norm": 0.2663493752479553, + "learning_rate": 0.00015675612409732207, + "loss": 0.2057, + "step": 3048 + }, + { + "epoch": 0.6170815624367537, + "grad_norm": 0.32578763365745544, + "learning_rate": 0.00015672993340458023, + "loss": 0.2711, + "step": 3049 + }, + { + "epoch": 0.6172839506172839, + "grad_norm": 0.4756195843219757, + "learning_rate": 0.00015670373697249896, + "loss": 0.2418, + "step": 3050 + }, + { + "epoch": 0.6172839506172839, + "eval_loss": 0.27323392033576965, + "eval_runtime": 0.7372, + "eval_samples_per_second": 6.783, + "eval_steps_per_second": 1.357, + "step": 3050 + }, + { + "epoch": 0.6174863387978142, + "grad_norm": 0.34247496724128723, + "learning_rate": 0.00015667753480372857, + "loss": 0.2175, + "step": 3051 + }, + { + "epoch": 0.6176887269783444, + "grad_norm": 0.330746591091156, + "learning_rate": 0.00015665132690091994, + "loss": 0.238, + "step": 3052 + }, + { + "epoch": 0.6178911151588747, + "grad_norm": 0.3871874511241913, + "learning_rate": 0.00015662511326672448, + "loss": 0.2575, + "step": 3053 + }, + { + "epoch": 0.6180935033394049, + "grad_norm": 0.3050987720489502, + "learning_rate": 0.00015659889390379417, + "loss": 0.2641, + "step": 3054 + }, + { + "epoch": 0.6182958915199352, + "grad_norm": 0.4178610146045685, + "learning_rate": 0.00015657266881478172, + "loss": 0.271, + "step": 3055 + }, + { + "epoch": 0.6184982797004654, + "grad_norm": 0.42879247665405273, + "learning_rate": 0.0001565464380023402, + "loss": 0.2352, + "step": 3056 + }, + { + "epoch": 0.6187006678809958, + "grad_norm": 0.27890655398368835, + "learning_rate": 0.00015652020146912347, + "loss": 0.238, + "step": 3057 + }, + { + "epoch": 0.618903056061526, + "grad_norm": 0.30058541893959045, + "learning_rate": 0.0001564939592177858, + "loss": 0.2579, + "step": 3058 + }, + { + "epoch": 0.6191054442420563, + "grad_norm": 0.31102004647254944, + "learning_rate": 0.0001564677112509821, + "loss": 0.2746, + "step": 3059 + }, + { + "epoch": 0.6193078324225865, + "grad_norm": 0.36177563667297363, + "learning_rate": 0.00015644145757136792, + "loss": 0.2381, + "step": 3060 + }, + { + "epoch": 0.6195102206031168, + "grad_norm": 0.3283332586288452, + "learning_rate": 0.00015641519818159928, + "loss": 0.2636, + "step": 3061 + }, + { + "epoch": 0.6197126087836471, + "grad_norm": 0.34870678186416626, + "learning_rate": 0.00015638893308433284, + "loss": 0.2404, + "step": 3062 + }, + { + "epoch": 0.6199149969641773, + "grad_norm": 0.3281346261501312, + "learning_rate": 0.00015636266228222584, + "loss": 0.26, + "step": 3063 + }, + { + "epoch": 0.6201173851447076, + "grad_norm": 0.30716973543167114, + "learning_rate": 0.00015633638577793607, + "loss": 0.2574, + "step": 3064 + }, + { + "epoch": 0.6203197733252378, + "grad_norm": 0.3106836974620819, + "learning_rate": 0.0001563101035741219, + "loss": 0.2259, + "step": 3065 + }, + { + "epoch": 0.6205221615057681, + "grad_norm": 0.27169597148895264, + "learning_rate": 0.00015628381567344234, + "loss": 0.2637, + "step": 3066 + }, + { + "epoch": 0.6207245496862983, + "grad_norm": 0.39207571744918823, + "learning_rate": 0.00015625752207855688, + "loss": 0.2649, + "step": 3067 + }, + { + "epoch": 0.6209269378668286, + "grad_norm": 0.530310869216919, + "learning_rate": 0.00015623122279212562, + "loss": 0.2235, + "step": 3068 + }, + { + "epoch": 0.6211293260473588, + "grad_norm": 0.2757643461227417, + "learning_rate": 0.00015620491781680927, + "loss": 0.2359, + "step": 3069 + }, + { + "epoch": 0.6213317142278891, + "grad_norm": 0.35047096014022827, + "learning_rate": 0.0001561786071552691, + "loss": 0.2595, + "step": 3070 + }, + { + "epoch": 0.6215341024084193, + "grad_norm": 0.31434178352355957, + "learning_rate": 0.00015615229081016697, + "loss": 0.252, + "step": 3071 + }, + { + "epoch": 0.6217364905889496, + "grad_norm": 0.2799963653087616, + "learning_rate": 0.00015612596878416518, + "loss": 0.2675, + "step": 3072 + }, + { + "epoch": 0.6219388787694798, + "grad_norm": 0.2696954011917114, + "learning_rate": 0.00015609964107992684, + "loss": 0.2212, + "step": 3073 + }, + { + "epoch": 0.6221412669500102, + "grad_norm": 0.31906068325042725, + "learning_rate": 0.00015607330770011545, + "loss": 0.2672, + "step": 3074 + }, + { + "epoch": 0.6223436551305404, + "grad_norm": 0.3010803461074829, + "learning_rate": 0.00015604696864739517, + "loss": 0.2167, + "step": 3075 + }, + { + "epoch": 0.6225460433110707, + "grad_norm": 0.4235513508319855, + "learning_rate": 0.00015602062392443066, + "loss": 0.2296, + "step": 3076 + }, + { + "epoch": 0.6227484314916009, + "grad_norm": 0.2939774692058563, + "learning_rate": 0.00015599427353388728, + "loss": 0.2296, + "step": 3077 + }, + { + "epoch": 0.6229508196721312, + "grad_norm": 0.2979620695114136, + "learning_rate": 0.0001559679174784308, + "loss": 0.2394, + "step": 3078 + }, + { + "epoch": 0.6231532078526614, + "grad_norm": 0.37031883001327515, + "learning_rate": 0.00015594155576072777, + "loss": 0.2558, + "step": 3079 + }, + { + "epoch": 0.6233555960331917, + "grad_norm": 0.3406575918197632, + "learning_rate": 0.00015591518838344506, + "loss": 0.2652, + "step": 3080 + }, + { + "epoch": 0.6235579842137219, + "grad_norm": 0.3246553838253021, + "learning_rate": 0.0001558888153492503, + "loss": 0.2106, + "step": 3081 + }, + { + "epoch": 0.6237603723942522, + "grad_norm": 0.2892184555530548, + "learning_rate": 0.00015586243666081164, + "loss": 0.2133, + "step": 3082 + }, + { + "epoch": 0.6239627605747824, + "grad_norm": 0.32574543356895447, + "learning_rate": 0.00015583605232079783, + "loss": 0.2278, + "step": 3083 + }, + { + "epoch": 0.6241651487553127, + "grad_norm": 0.4512995779514313, + "learning_rate": 0.00015580966233187812, + "loss": 0.266, + "step": 3084 + }, + { + "epoch": 0.6243675369358429, + "grad_norm": 0.25693219900131226, + "learning_rate": 0.00015578326669672232, + "loss": 0.218, + "step": 3085 + }, + { + "epoch": 0.6245699251163732, + "grad_norm": 0.3269275724887848, + "learning_rate": 0.00015575686541800096, + "loss": 0.2117, + "step": 3086 + }, + { + "epoch": 0.6247723132969034, + "grad_norm": 0.3049417734146118, + "learning_rate": 0.000155730458498385, + "loss": 0.1994, + "step": 3087 + }, + { + "epoch": 0.6249747014774337, + "grad_norm": 0.2842578887939453, + "learning_rate": 0.00015570404594054604, + "loss": 0.222, + "step": 3088 + }, + { + "epoch": 0.6251770896579639, + "grad_norm": 0.33271604776382446, + "learning_rate": 0.00015567762774715618, + "loss": 0.2239, + "step": 3089 + }, + { + "epoch": 0.6253794778384942, + "grad_norm": 0.24872200191020966, + "learning_rate": 0.0001556512039208882, + "loss": 0.2068, + "step": 3090 + }, + { + "epoch": 0.6255818660190244, + "grad_norm": 0.2888675034046173, + "learning_rate": 0.00015562477446441535, + "loss": 0.2172, + "step": 3091 + }, + { + "epoch": 0.6257842541995547, + "grad_norm": 0.28715941309928894, + "learning_rate": 0.00015559833938041145, + "loss": 0.2363, + "step": 3092 + }, + { + "epoch": 0.6259866423800851, + "grad_norm": 0.24771295487880707, + "learning_rate": 0.00015557189867155099, + "loss": 0.2325, + "step": 3093 + }, + { + "epoch": 0.6261890305606153, + "grad_norm": 0.3012368679046631, + "learning_rate": 0.00015554545234050892, + "loss": 0.2324, + "step": 3094 + }, + { + "epoch": 0.6263914187411456, + "grad_norm": 0.34395089745521545, + "learning_rate": 0.00015551900038996078, + "loss": 0.2547, + "step": 3095 + }, + { + "epoch": 0.6265938069216758, + "grad_norm": 0.4432348310947418, + "learning_rate": 0.00015549254282258278, + "loss": 0.2417, + "step": 3096 + }, + { + "epoch": 0.6267961951022061, + "grad_norm": 0.2741510570049286, + "learning_rate": 0.00015546607964105156, + "loss": 0.2071, + "step": 3097 + }, + { + "epoch": 0.6269985832827363, + "grad_norm": 0.36685729026794434, + "learning_rate": 0.0001554396108480444, + "loss": 0.2855, + "step": 3098 + }, + { + "epoch": 0.6272009714632666, + "grad_norm": 0.38427332043647766, + "learning_rate": 0.00015541313644623912, + "loss": 0.2506, + "step": 3099 + }, + { + "epoch": 0.6274033596437968, + "grad_norm": 0.41870930790901184, + "learning_rate": 0.0001553866564383142, + "loss": 0.2346, + "step": 3100 + }, + { + "epoch": 0.6274033596437968, + "eval_loss": 0.26668718457221985, + "eval_runtime": 0.7392, + "eval_samples_per_second": 6.764, + "eval_steps_per_second": 1.353, + "step": 3100 + }, + { + "epoch": 0.6276057478243271, + "grad_norm": 0.31527113914489746, + "learning_rate": 0.00015536017082694846, + "loss": 0.2279, + "step": 3101 + }, + { + "epoch": 0.6278081360048573, + "grad_norm": 0.26014941930770874, + "learning_rate": 0.00015533367961482157, + "loss": 0.2159, + "step": 3102 + }, + { + "epoch": 0.6280105241853876, + "grad_norm": 0.2955648601055145, + "learning_rate": 0.00015530718280461355, + "loss": 0.2377, + "step": 3103 + }, + { + "epoch": 0.6282129123659178, + "grad_norm": 0.29161566495895386, + "learning_rate": 0.00015528068039900514, + "loss": 0.2391, + "step": 3104 + }, + { + "epoch": 0.6284153005464481, + "grad_norm": 0.3259400427341461, + "learning_rate": 0.00015525417240067757, + "loss": 0.2595, + "step": 3105 + }, + { + "epoch": 0.6286176887269783, + "grad_norm": 0.3628864288330078, + "learning_rate": 0.0001552276588123126, + "loss": 0.2553, + "step": 3106 + }, + { + "epoch": 0.6288200769075086, + "grad_norm": 0.29265254735946655, + "learning_rate": 0.00015520113963659254, + "loss": 0.2661, + "step": 3107 + }, + { + "epoch": 0.6290224650880388, + "grad_norm": 0.2571503818035126, + "learning_rate": 0.00015517461487620047, + "loss": 0.2588, + "step": 3108 + }, + { + "epoch": 0.6292248532685691, + "grad_norm": 0.2192489504814148, + "learning_rate": 0.00015514808453381975, + "loss": 0.197, + "step": 3109 + }, + { + "epoch": 0.6294272414490993, + "grad_norm": 0.2925032675266266, + "learning_rate": 0.00015512154861213452, + "loss": 0.2244, + "step": 3110 + }, + { + "epoch": 0.6296296296296297, + "grad_norm": 0.37553200125694275, + "learning_rate": 0.0001550950071138294, + "loss": 0.2445, + "step": 3111 + }, + { + "epoch": 0.6298320178101598, + "grad_norm": 0.3246464729309082, + "learning_rate": 0.00015506846004158955, + "loss": 0.2406, + "step": 3112 + }, + { + "epoch": 0.6300344059906902, + "grad_norm": 0.4171786904335022, + "learning_rate": 0.00015504190739810074, + "loss": 0.2209, + "step": 3113 + }, + { + "epoch": 0.6302367941712204, + "grad_norm": 0.30422425270080566, + "learning_rate": 0.00015501534918604926, + "loss": 0.2543, + "step": 3114 + }, + { + "epoch": 0.6304391823517507, + "grad_norm": 0.3078934848308563, + "learning_rate": 0.00015498878540812208, + "loss": 0.2423, + "step": 3115 + }, + { + "epoch": 0.6306415705322809, + "grad_norm": 0.3099100887775421, + "learning_rate": 0.00015496221606700657, + "loss": 0.2655, + "step": 3116 + }, + { + "epoch": 0.6308439587128112, + "grad_norm": 0.29486164450645447, + "learning_rate": 0.00015493564116539072, + "loss": 0.2356, + "step": 3117 + }, + { + "epoch": 0.6310463468933414, + "grad_norm": 0.3040613830089569, + "learning_rate": 0.00015490906070596316, + "loss": 0.1921, + "step": 3118 + }, + { + "epoch": 0.6312487350738717, + "grad_norm": 0.2945636808872223, + "learning_rate": 0.00015488247469141295, + "loss": 0.2215, + "step": 3119 + }, + { + "epoch": 0.6314511232544019, + "grad_norm": 0.30915266275405884, + "learning_rate": 0.00015485588312442986, + "loss": 0.2574, + "step": 3120 + }, + { + "epoch": 0.6316535114349322, + "grad_norm": 0.37940701842308044, + "learning_rate": 0.0001548292860077041, + "loss": 0.2501, + "step": 3121 + }, + { + "epoch": 0.6318558996154624, + "grad_norm": 0.43181759119033813, + "learning_rate": 0.0001548026833439265, + "loss": 0.2947, + "step": 3122 + }, + { + "epoch": 0.6320582877959927, + "grad_norm": 0.33247214555740356, + "learning_rate": 0.00015477607513578846, + "loss": 0.2125, + "step": 3123 + }, + { + "epoch": 0.632260675976523, + "grad_norm": 0.2787618935108185, + "learning_rate": 0.00015474946138598186, + "loss": 0.2297, + "step": 3124 + }, + { + "epoch": 0.6324630641570532, + "grad_norm": 0.3035429120063782, + "learning_rate": 0.00015472284209719925, + "loss": 0.2171, + "step": 3125 + }, + { + "epoch": 0.6326654523375835, + "grad_norm": 0.2825091779232025, + "learning_rate": 0.00015469621727213367, + "loss": 0.2304, + "step": 3126 + }, + { + "epoch": 0.6328678405181137, + "grad_norm": 0.2831633687019348, + "learning_rate": 0.0001546695869134788, + "loss": 0.244, + "step": 3127 + }, + { + "epoch": 0.633070228698644, + "grad_norm": 0.4898656904697418, + "learning_rate": 0.00015464295102392872, + "loss": 0.2374, + "step": 3128 + }, + { + "epoch": 0.6332726168791742, + "grad_norm": 0.3614266812801361, + "learning_rate": 0.0001546163096061782, + "loss": 0.2265, + "step": 3129 + }, + { + "epoch": 0.6334750050597046, + "grad_norm": 0.4073461890220642, + "learning_rate": 0.0001545896626629226, + "loss": 0.1959, + "step": 3130 + }, + { + "epoch": 0.6336773932402348, + "grad_norm": 0.33134597539901733, + "learning_rate": 0.00015456301019685769, + "loss": 0.2228, + "step": 3131 + }, + { + "epoch": 0.6338797814207651, + "grad_norm": 0.5032749772071838, + "learning_rate": 0.00015453635221067996, + "loss": 0.2398, + "step": 3132 + }, + { + "epoch": 0.6340821696012953, + "grad_norm": 0.2859143316745758, + "learning_rate": 0.00015450968870708636, + "loss": 0.24, + "step": 3133 + }, + { + "epoch": 0.6342845577818256, + "grad_norm": 0.3137022852897644, + "learning_rate": 0.00015448301968877442, + "loss": 0.2478, + "step": 3134 + }, + { + "epoch": 0.6344869459623558, + "grad_norm": 0.3397473692893982, + "learning_rate": 0.00015445634515844222, + "loss": 0.2476, + "step": 3135 + }, + { + "epoch": 0.6346893341428861, + "grad_norm": 0.35663914680480957, + "learning_rate": 0.0001544296651187884, + "loss": 0.236, + "step": 3136 + }, + { + "epoch": 0.6348917223234163, + "grad_norm": 0.28017348051071167, + "learning_rate": 0.0001544029795725122, + "loss": 0.2308, + "step": 3137 + }, + { + "epoch": 0.6350941105039466, + "grad_norm": 0.2992730438709259, + "learning_rate": 0.0001543762885223134, + "loss": 0.2429, + "step": 3138 + }, + { + "epoch": 0.6352964986844768, + "grad_norm": 0.3374902307987213, + "learning_rate": 0.00015434959197089228, + "loss": 0.2499, + "step": 3139 + }, + { + "epoch": 0.6354988868650071, + "grad_norm": 0.37910881638526917, + "learning_rate": 0.0001543228899209497, + "loss": 0.2635, + "step": 3140 + }, + { + "epoch": 0.6357012750455373, + "grad_norm": 0.26263922452926636, + "learning_rate": 0.00015429618237518716, + "loss": 0.2048, + "step": 3141 + }, + { + "epoch": 0.6359036632260676, + "grad_norm": 0.27847379446029663, + "learning_rate": 0.0001542694693363066, + "loss": 0.2265, + "step": 3142 + }, + { + "epoch": 0.6361060514065978, + "grad_norm": 0.28324779868125916, + "learning_rate": 0.00015424275080701055, + "loss": 0.2349, + "step": 3143 + }, + { + "epoch": 0.6363084395871281, + "grad_norm": 0.278072714805603, + "learning_rate": 0.00015421602679000217, + "loss": 0.2323, + "step": 3144 + }, + { + "epoch": 0.6365108277676583, + "grad_norm": 0.3826614022254944, + "learning_rate": 0.00015418929728798505, + "loss": 0.2197, + "step": 3145 + }, + { + "epoch": 0.6367132159481886, + "grad_norm": 0.3305886387825012, + "learning_rate": 0.00015416256230366346, + "loss": 0.2245, + "step": 3146 + }, + { + "epoch": 0.6369156041287188, + "grad_norm": 0.2796514332294464, + "learning_rate": 0.00015413582183974213, + "loss": 0.2098, + "step": 3147 + }, + { + "epoch": 0.6371179923092491, + "grad_norm": 0.3792613744735718, + "learning_rate": 0.00015410907589892637, + "loss": 0.2411, + "step": 3148 + }, + { + "epoch": 0.6373203804897793, + "grad_norm": 0.2925693392753601, + "learning_rate": 0.0001540823244839221, + "loss": 0.2683, + "step": 3149 + }, + { + "epoch": 0.6375227686703097, + "grad_norm": 0.3847316801548004, + "learning_rate": 0.0001540555675974357, + "loss": 0.263, + "step": 3150 + }, + { + "epoch": 0.6375227686703097, + "eval_loss": 0.2652246356010437, + "eval_runtime": 0.7391, + "eval_samples_per_second": 6.765, + "eval_steps_per_second": 1.353, + "step": 3150 + }, + { + "epoch": 0.6377251568508399, + "grad_norm": 0.2546859681606293, + "learning_rate": 0.0001540288052421742, + "loss": 0.2343, + "step": 3151 + }, + { + "epoch": 0.6379275450313702, + "grad_norm": 0.2868305444717407, + "learning_rate": 0.00015400203742084508, + "loss": 0.2726, + "step": 3152 + }, + { + "epoch": 0.6381299332119005, + "grad_norm": 0.2870636284351349, + "learning_rate": 0.0001539752641361564, + "loss": 0.2055, + "step": 3153 + }, + { + "epoch": 0.6383323213924307, + "grad_norm": 0.7042229175567627, + "learning_rate": 0.0001539484853908169, + "loss": 0.2114, + "step": 3154 + }, + { + "epoch": 0.638534709572961, + "grad_norm": 0.30285945534706116, + "learning_rate": 0.00015392170118753575, + "loss": 0.2451, + "step": 3155 + }, + { + "epoch": 0.6387370977534912, + "grad_norm": 0.48774808645248413, + "learning_rate": 0.00015389491152902263, + "loss": 0.2731, + "step": 3156 + }, + { + "epoch": 0.6389394859340215, + "grad_norm": 0.25389939546585083, + "learning_rate": 0.00015386811641798786, + "loss": 0.1723, + "step": 3157 + }, + { + "epoch": 0.6391418741145517, + "grad_norm": 0.3468737304210663, + "learning_rate": 0.00015384131585714235, + "loss": 0.2381, + "step": 3158 + }, + { + "epoch": 0.639344262295082, + "grad_norm": 0.3124346137046814, + "learning_rate": 0.0001538145098491974, + "loss": 0.217, + "step": 3159 + }, + { + "epoch": 0.6395466504756122, + "grad_norm": 0.41017088294029236, + "learning_rate": 0.00015378769839686504, + "loss": 0.2213, + "step": 3160 + }, + { + "epoch": 0.6397490386561425, + "grad_norm": 0.6051793694496155, + "learning_rate": 0.00015376088150285773, + "loss": 0.2691, + "step": 3161 + }, + { + "epoch": 0.6399514268366727, + "grad_norm": 0.2914044260978699, + "learning_rate": 0.00015373405916988857, + "loss": 0.2465, + "step": 3162 + }, + { + "epoch": 0.640153815017203, + "grad_norm": 0.29912036657333374, + "learning_rate": 0.0001537072314006711, + "loss": 0.2338, + "step": 3163 + }, + { + "epoch": 0.6403562031977332, + "grad_norm": 0.31532540917396545, + "learning_rate": 0.0001536803981979195, + "loss": 0.247, + "step": 3164 + }, + { + "epoch": 0.6405585913782635, + "grad_norm": 0.2610158622264862, + "learning_rate": 0.0001536535595643485, + "loss": 0.2389, + "step": 3165 + }, + { + "epoch": 0.6407609795587937, + "grad_norm": 0.3037506341934204, + "learning_rate": 0.0001536267155026733, + "loss": 0.207, + "step": 3166 + }, + { + "epoch": 0.640963367739324, + "grad_norm": 0.3930741548538208, + "learning_rate": 0.0001535998660156097, + "loss": 0.2291, + "step": 3167 + }, + { + "epoch": 0.6411657559198543, + "grad_norm": 0.3472108840942383, + "learning_rate": 0.00015357301110587412, + "loss": 0.2352, + "step": 3168 + }, + { + "epoch": 0.6413681441003846, + "grad_norm": 0.2909805178642273, + "learning_rate": 0.0001535461507761834, + "loss": 0.2418, + "step": 3169 + }, + { + "epoch": 0.6415705322809148, + "grad_norm": 0.4504471719264984, + "learning_rate": 0.000153519285029255, + "loss": 0.2477, + "step": 3170 + }, + { + "epoch": 0.6417729204614451, + "grad_norm": 0.3161703646183014, + "learning_rate": 0.0001534924138678069, + "loss": 0.2547, + "step": 3171 + }, + { + "epoch": 0.6419753086419753, + "grad_norm": 0.4808140695095062, + "learning_rate": 0.00015346553729455765, + "loss": 0.2616, + "step": 3172 + }, + { + "epoch": 0.6421776968225056, + "grad_norm": 0.28160232305526733, + "learning_rate": 0.00015343865531222638, + "loss": 0.2363, + "step": 3173 + }, + { + "epoch": 0.6423800850030358, + "grad_norm": 0.307965487241745, + "learning_rate": 0.00015341176792353265, + "loss": 0.2445, + "step": 3174 + }, + { + "epoch": 0.6425824731835661, + "grad_norm": 0.2636444568634033, + "learning_rate": 0.00015338487513119668, + "loss": 0.2316, + "step": 3175 + }, + { + "epoch": 0.6427848613640963, + "grad_norm": 0.2725645899772644, + "learning_rate": 0.00015335797693793923, + "loss": 0.2147, + "step": 3176 + }, + { + "epoch": 0.6429872495446266, + "grad_norm": 0.3533676564693451, + "learning_rate": 0.00015333107334648154, + "loss": 0.2648, + "step": 3177 + }, + { + "epoch": 0.6431896377251568, + "grad_norm": 0.275126188993454, + "learning_rate": 0.00015330416435954547, + "loss": 0.2223, + "step": 3178 + }, + { + "epoch": 0.6433920259056871, + "grad_norm": 0.27795132994651794, + "learning_rate": 0.00015327724997985334, + "loss": 0.2137, + "step": 3179 + }, + { + "epoch": 0.6435944140862173, + "grad_norm": 0.2937318682670593, + "learning_rate": 0.00015325033021012808, + "loss": 0.2425, + "step": 3180 + }, + { + "epoch": 0.6437968022667476, + "grad_norm": 0.43675699830055237, + "learning_rate": 0.0001532234050530932, + "loss": 0.2199, + "step": 3181 + }, + { + "epoch": 0.6439991904472778, + "grad_norm": 0.3387317359447479, + "learning_rate": 0.0001531964745114726, + "loss": 0.2509, + "step": 3182 + }, + { + "epoch": 0.6442015786278081, + "grad_norm": 0.3405567407608032, + "learning_rate": 0.00015316953858799095, + "loss": 0.2375, + "step": 3183 + }, + { + "epoch": 0.6444039668083384, + "grad_norm": 0.28323695063591003, + "learning_rate": 0.00015314259728537325, + "loss": 0.2151, + "step": 3184 + }, + { + "epoch": 0.6446063549888686, + "grad_norm": 0.26877549290657043, + "learning_rate": 0.0001531156506063452, + "loss": 0.2309, + "step": 3185 + }, + { + "epoch": 0.644808743169399, + "grad_norm": 0.3151189088821411, + "learning_rate": 0.00015308869855363294, + "loss": 0.2441, + "step": 3186 + }, + { + "epoch": 0.6450111313499292, + "grad_norm": 0.3761087656021118, + "learning_rate": 0.00015306174112996324, + "loss": 0.2475, + "step": 3187 + }, + { + "epoch": 0.6452135195304595, + "grad_norm": 0.3229560852050781, + "learning_rate": 0.00015303477833806332, + "loss": 0.2677, + "step": 3188 + }, + { + "epoch": 0.6454159077109897, + "grad_norm": 0.2946084439754486, + "learning_rate": 0.000153007810180661, + "loss": 0.2017, + "step": 3189 + }, + { + "epoch": 0.64561829589152, + "grad_norm": 0.25200504064559937, + "learning_rate": 0.00015298083666048467, + "loss": 0.2215, + "step": 3190 + }, + { + "epoch": 0.6458206840720502, + "grad_norm": 0.3906277120113373, + "learning_rate": 0.0001529538577802632, + "loss": 0.332, + "step": 3191 + }, + { + "epoch": 0.6460230722525805, + "grad_norm": 0.39015209674835205, + "learning_rate": 0.0001529268735427261, + "loss": 0.2405, + "step": 3192 + }, + { + "epoch": 0.6462254604331107, + "grad_norm": 0.31589704751968384, + "learning_rate": 0.0001528998839506032, + "loss": 0.2484, + "step": 3193 + }, + { + "epoch": 0.646427848613641, + "grad_norm": 0.27482402324676514, + "learning_rate": 0.0001528728890066252, + "loss": 0.1976, + "step": 3194 + }, + { + "epoch": 0.6466302367941712, + "grad_norm": 0.3176118731498718, + "learning_rate": 0.000152845888713523, + "loss": 0.2141, + "step": 3195 + }, + { + "epoch": 0.6468326249747015, + "grad_norm": 0.240059494972229, + "learning_rate": 0.00015281888307402833, + "loss": 0.222, + "step": 3196 + }, + { + "epoch": 0.6470350131552317, + "grad_norm": 0.36541762948036194, + "learning_rate": 0.00015279187209087328, + "loss": 0.2182, + "step": 3197 + }, + { + "epoch": 0.647237401335762, + "grad_norm": 0.4202319383621216, + "learning_rate": 0.00015276485576679055, + "loss": 0.2354, + "step": 3198 + }, + { + "epoch": 0.6474397895162922, + "grad_norm": 0.29542794823646545, + "learning_rate": 0.00015273783410451336, + "loss": 0.2539, + "step": 3199 + }, + { + "epoch": 0.6476421776968225, + "grad_norm": 0.30133146047592163, + "learning_rate": 0.0001527108071067755, + "loss": 0.2162, + "step": 3200 + }, + { + "epoch": 0.6476421776968225, + "eval_loss": 0.26807427406311035, + "eval_runtime": 0.7402, + "eval_samples_per_second": 6.755, + "eval_steps_per_second": 1.351, + "step": 3200 + }, + { + "epoch": 0.6478445658773527, + "grad_norm": 0.2947642505168915, + "learning_rate": 0.00015268377477631128, + "loss": 0.2246, + "step": 3201 + }, + { + "epoch": 0.648046954057883, + "grad_norm": 0.2888360917568207, + "learning_rate": 0.0001526567371158555, + "loss": 0.2265, + "step": 3202 + }, + { + "epoch": 0.6482493422384132, + "grad_norm": 0.3799479305744171, + "learning_rate": 0.00015262969412814357, + "loss": 0.2374, + "step": 3203 + }, + { + "epoch": 0.6484517304189436, + "grad_norm": 0.27309316396713257, + "learning_rate": 0.0001526026458159115, + "loss": 0.2327, + "step": 3204 + }, + { + "epoch": 0.6486541185994738, + "grad_norm": 0.3100753426551819, + "learning_rate": 0.00015257559218189562, + "loss": 0.2542, + "step": 3205 + }, + { + "epoch": 0.6488565067800041, + "grad_norm": 0.3332715928554535, + "learning_rate": 0.000152548533228833, + "loss": 0.2389, + "step": 3206 + }, + { + "epoch": 0.6490588949605343, + "grad_norm": 0.33358773589134216, + "learning_rate": 0.0001525214689594612, + "loss": 0.2338, + "step": 3207 + }, + { + "epoch": 0.6492612831410646, + "grad_norm": 0.3007495701313019, + "learning_rate": 0.00015249439937651825, + "loss": 0.2366, + "step": 3208 + }, + { + "epoch": 0.6494636713215948, + "grad_norm": 0.28332725167274475, + "learning_rate": 0.00015246732448274275, + "loss": 0.2582, + "step": 3209 + }, + { + "epoch": 0.6496660595021251, + "grad_norm": 0.36708885431289673, + "learning_rate": 0.00015244024428087393, + "loss": 0.263, + "step": 3210 + }, + { + "epoch": 0.6498684476826553, + "grad_norm": 0.2604656517505646, + "learning_rate": 0.00015241315877365143, + "loss": 0.2179, + "step": 3211 + }, + { + "epoch": 0.6500708358631856, + "grad_norm": 0.41370195150375366, + "learning_rate": 0.00015238606796381553, + "loss": 0.2509, + "step": 3212 + }, + { + "epoch": 0.6502732240437158, + "grad_norm": 0.3503737151622772, + "learning_rate": 0.0001523589718541069, + "loss": 0.26, + "step": 3213 + }, + { + "epoch": 0.6504756122242461, + "grad_norm": 0.32839810848236084, + "learning_rate": 0.00015233187044726693, + "loss": 0.2243, + "step": 3214 + }, + { + "epoch": 0.6506780004047764, + "grad_norm": 0.45140987634658813, + "learning_rate": 0.00015230476374603738, + "loss": 0.2298, + "step": 3215 + }, + { + "epoch": 0.6508803885853066, + "grad_norm": 0.27483177185058594, + "learning_rate": 0.00015227765175316072, + "loss": 0.2455, + "step": 3216 + }, + { + "epoch": 0.6510827767658369, + "grad_norm": 0.2992781400680542, + "learning_rate": 0.0001522505344713798, + "loss": 0.2466, + "step": 3217 + }, + { + "epoch": 0.6512851649463671, + "grad_norm": 0.5257993340492249, + "learning_rate": 0.00015222341190343803, + "loss": 0.2274, + "step": 3218 + }, + { + "epoch": 0.6514875531268974, + "grad_norm": 0.3546026051044464, + "learning_rate": 0.00015219628405207942, + "loss": 0.2477, + "step": 3219 + }, + { + "epoch": 0.6516899413074276, + "grad_norm": 0.3486909866333008, + "learning_rate": 0.00015216915092004847, + "loss": 0.2557, + "step": 3220 + }, + { + "epoch": 0.6518923294879579, + "grad_norm": 0.3145160377025604, + "learning_rate": 0.00015214201251009023, + "loss": 0.2306, + "step": 3221 + }, + { + "epoch": 0.6520947176684881, + "grad_norm": 0.34794342517852783, + "learning_rate": 0.00015211486882495029, + "loss": 0.2659, + "step": 3222 + }, + { + "epoch": 0.6522971058490185, + "grad_norm": 0.34079018235206604, + "learning_rate": 0.00015208771986737477, + "loss": 0.2957, + "step": 3223 + }, + { + "epoch": 0.6524994940295487, + "grad_norm": 0.3173547089099884, + "learning_rate": 0.0001520605656401103, + "loss": 0.2304, + "step": 3224 + }, + { + "epoch": 0.652701882210079, + "grad_norm": 0.2905762791633606, + "learning_rate": 0.00015203340614590406, + "loss": 0.238, + "step": 3225 + }, + { + "epoch": 0.6529042703906092, + "grad_norm": 0.30235806107521057, + "learning_rate": 0.00015200624138750376, + "loss": 0.2483, + "step": 3226 + }, + { + "epoch": 0.6531066585711395, + "grad_norm": 0.32202184200286865, + "learning_rate": 0.0001519790713676577, + "loss": 0.2373, + "step": 3227 + }, + { + "epoch": 0.6533090467516697, + "grad_norm": 0.2946752905845642, + "learning_rate": 0.00015195189608911455, + "loss": 0.2486, + "step": 3228 + }, + { + "epoch": 0.6535114349322, + "grad_norm": 0.3170306086540222, + "learning_rate": 0.0001519247155546237, + "loss": 0.226, + "step": 3229 + }, + { + "epoch": 0.6537138231127302, + "grad_norm": 0.3037535548210144, + "learning_rate": 0.00015189752976693498, + "loss": 0.2358, + "step": 3230 + }, + { + "epoch": 0.6539162112932605, + "grad_norm": 0.27636289596557617, + "learning_rate": 0.00015187033872879875, + "loss": 0.2617, + "step": 3231 + }, + { + "epoch": 0.6541185994737907, + "grad_norm": 0.3105868399143219, + "learning_rate": 0.0001518431424429659, + "loss": 0.2888, + "step": 3232 + }, + { + "epoch": 0.654320987654321, + "grad_norm": 0.29909271001815796, + "learning_rate": 0.0001518159409121879, + "loss": 0.2462, + "step": 3233 + }, + { + "epoch": 0.6545233758348512, + "grad_norm": 0.29715684056282043, + "learning_rate": 0.00015178873413921665, + "loss": 0.2476, + "step": 3234 + }, + { + "epoch": 0.6547257640153815, + "grad_norm": 0.31502577662467957, + "learning_rate": 0.00015176152212680478, + "loss": 0.2044, + "step": 3235 + }, + { + "epoch": 0.6549281521959117, + "grad_norm": 0.29666459560394287, + "learning_rate": 0.00015173430487770513, + "loss": 0.2457, + "step": 3236 + }, + { + "epoch": 0.655130540376442, + "grad_norm": 1.0712119340896606, + "learning_rate": 0.0001517070823946714, + "loss": 0.267, + "step": 3237 + }, + { + "epoch": 0.6553329285569722, + "grad_norm": 0.34478285908699036, + "learning_rate": 0.00015167985468045764, + "loss": 0.2671, + "step": 3238 + }, + { + "epoch": 0.6555353167375025, + "grad_norm": 0.3465102016925812, + "learning_rate": 0.00015165262173781846, + "loss": 0.248, + "step": 3239 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 0.3330729901790619, + "learning_rate": 0.00015162538356950899, + "loss": 0.2555, + "step": 3240 + }, + { + "epoch": 0.655940093098563, + "grad_norm": 0.30930474400520325, + "learning_rate": 0.00015159814017828488, + "loss": 0.2541, + "step": 3241 + }, + { + "epoch": 0.6561424812790932, + "grad_norm": 0.3146357834339142, + "learning_rate": 0.00015157089156690238, + "loss": 0.2385, + "step": 3242 + }, + { + "epoch": 0.6563448694596236, + "grad_norm": 0.43063199520111084, + "learning_rate": 0.00015154363773811822, + "loss": 0.2822, + "step": 3243 + }, + { + "epoch": 0.6565472576401539, + "grad_norm": 0.3448871970176697, + "learning_rate": 0.0001515163786946896, + "loss": 0.2568, + "step": 3244 + }, + { + "epoch": 0.6567496458206841, + "grad_norm": 0.3220970928668976, + "learning_rate": 0.00015148911443937436, + "loss": 0.1936, + "step": 3245 + }, + { + "epoch": 0.6569520340012144, + "grad_norm": 0.28015419840812683, + "learning_rate": 0.0001514618449749308, + "loss": 0.215, + "step": 3246 + }, + { + "epoch": 0.6571544221817446, + "grad_norm": 0.3737340569496155, + "learning_rate": 0.00015143457030411775, + "loss": 0.2332, + "step": 3247 + }, + { + "epoch": 0.6573568103622749, + "grad_norm": 0.35730138421058655, + "learning_rate": 0.00015140729042969453, + "loss": 0.1969, + "step": 3248 + }, + { + "epoch": 0.6575591985428051, + "grad_norm": 0.3678499162197113, + "learning_rate": 0.00015138000535442112, + "loss": 0.2669, + "step": 3249 + }, + { + "epoch": 0.6577615867233354, + "grad_norm": 0.35763099789619446, + "learning_rate": 0.00015135271508105787, + "loss": 0.2693, + "step": 3250 + }, + { + "epoch": 0.6577615867233354, + "eval_loss": 0.27355989813804626, + "eval_runtime": 0.7378, + "eval_samples_per_second": 6.777, + "eval_steps_per_second": 1.355, + "step": 3250 + }, + { + "epoch": 0.6579639749038656, + "grad_norm": 0.27716264128685, + "learning_rate": 0.00015132541961236577, + "loss": 0.1913, + "step": 3251 + }, + { + "epoch": 0.6581663630843959, + "grad_norm": 0.3249410390853882, + "learning_rate": 0.00015129811895110625, + "loss": 0.2515, + "step": 3252 + }, + { + "epoch": 0.6583687512649261, + "grad_norm": 0.3644621670246124, + "learning_rate": 0.0001512708131000413, + "loss": 0.2511, + "step": 3253 + }, + { + "epoch": 0.6585711394454564, + "grad_norm": 0.36926156282424927, + "learning_rate": 0.00015124350206193347, + "loss": 0.2611, + "step": 3254 + }, + { + "epoch": 0.6587735276259866, + "grad_norm": 0.3121117949485779, + "learning_rate": 0.0001512161858395458, + "loss": 0.2429, + "step": 3255 + }, + { + "epoch": 0.6589759158065169, + "grad_norm": 0.28802725672721863, + "learning_rate": 0.00015118886443564185, + "loss": 0.2379, + "step": 3256 + }, + { + "epoch": 0.6591783039870471, + "grad_norm": 0.4395899772644043, + "learning_rate": 0.00015116153785298573, + "loss": 0.2528, + "step": 3257 + }, + { + "epoch": 0.6593806921675774, + "grad_norm": 0.38329094648361206, + "learning_rate": 0.000151134206094342, + "loss": 0.2519, + "step": 3258 + }, + { + "epoch": 0.6595830803481076, + "grad_norm": 0.327964186668396, + "learning_rate": 0.00015110686916247588, + "loss": 0.232, + "step": 3259 + }, + { + "epoch": 0.659785468528638, + "grad_norm": 0.27446261048316956, + "learning_rate": 0.000151079527060153, + "loss": 0.269, + "step": 3260 + }, + { + "epoch": 0.6599878567091682, + "grad_norm": 0.4416070580482483, + "learning_rate": 0.0001510521797901395, + "loss": 0.2213, + "step": 3261 + }, + { + "epoch": 0.6601902448896985, + "grad_norm": 0.35151076316833496, + "learning_rate": 0.00015102482735520218, + "loss": 0.2215, + "step": 3262 + }, + { + "epoch": 0.6603926330702287, + "grad_norm": 0.43818771839141846, + "learning_rate": 0.0001509974697581082, + "loss": 0.2437, + "step": 3263 + }, + { + "epoch": 0.660595021250759, + "grad_norm": 0.3801747262477875, + "learning_rate": 0.00015097010700162536, + "loss": 0.2498, + "step": 3264 + }, + { + "epoch": 0.6607974094312892, + "grad_norm": 0.28544607758522034, + "learning_rate": 0.0001509427390885219, + "loss": 0.2418, + "step": 3265 + }, + { + "epoch": 0.6609997976118195, + "grad_norm": 0.30231353640556335, + "learning_rate": 0.00015091536602156663, + "loss": 0.2459, + "step": 3266 + }, + { + "epoch": 0.6612021857923497, + "grad_norm": 0.32911112904548645, + "learning_rate": 0.00015088798780352888, + "loss": 0.2412, + "step": 3267 + }, + { + "epoch": 0.66140457397288, + "grad_norm": 0.28970471024513245, + "learning_rate": 0.00015086060443717848, + "loss": 0.2153, + "step": 3268 + }, + { + "epoch": 0.6616069621534102, + "grad_norm": 0.3324395418167114, + "learning_rate": 0.00015083321592528583, + "loss": 0.2095, + "step": 3269 + }, + { + "epoch": 0.6618093503339405, + "grad_norm": 0.2921914756298065, + "learning_rate": 0.00015080582227062174, + "loss": 0.2404, + "step": 3270 + }, + { + "epoch": 0.6620117385144707, + "grad_norm": 0.28326934576034546, + "learning_rate": 0.00015077842347595768, + "loss": 0.2368, + "step": 3271 + }, + { + "epoch": 0.662214126695001, + "grad_norm": 0.3168807029724121, + "learning_rate": 0.00015075101954406555, + "loss": 0.2439, + "step": 3272 + }, + { + "epoch": 0.6624165148755312, + "grad_norm": 0.3238939940929413, + "learning_rate": 0.00015072361047771776, + "loss": 0.2817, + "step": 3273 + }, + { + "epoch": 0.6626189030560615, + "grad_norm": 0.3251747786998749, + "learning_rate": 0.00015069619627968732, + "loss": 0.2511, + "step": 3274 + }, + { + "epoch": 0.6628212912365918, + "grad_norm": 0.37923121452331543, + "learning_rate": 0.00015066877695274768, + "loss": 0.2331, + "step": 3275 + }, + { + "epoch": 0.663023679417122, + "grad_norm": 0.37296178936958313, + "learning_rate": 0.00015064135249967287, + "loss": 0.2763, + "step": 3276 + }, + { + "epoch": 0.6632260675976523, + "grad_norm": 0.41110411286354065, + "learning_rate": 0.00015061392292323734, + "loss": 0.2665, + "step": 3277 + }, + { + "epoch": 0.6634284557781825, + "grad_norm": 0.3819164037704468, + "learning_rate": 0.00015058648822621618, + "loss": 0.2248, + "step": 3278 + }, + { + "epoch": 0.6636308439587129, + "grad_norm": 0.29319527745246887, + "learning_rate": 0.00015055904841138496, + "loss": 0.2383, + "step": 3279 + }, + { + "epoch": 0.6638332321392431, + "grad_norm": 0.4977886974811554, + "learning_rate": 0.00015053160348151975, + "loss": 0.2704, + "step": 3280 + }, + { + "epoch": 0.6640356203197734, + "grad_norm": 0.32568296790122986, + "learning_rate": 0.00015050415343939713, + "loss": 0.2232, + "step": 3281 + }, + { + "epoch": 0.6642380085003036, + "grad_norm": 0.336796373128891, + "learning_rate": 0.00015047669828779412, + "loss": 0.26, + "step": 3282 + }, + { + "epoch": 0.6644403966808339, + "grad_norm": 0.2726500630378723, + "learning_rate": 0.00015044923802948854, + "loss": 0.2414, + "step": 3283 + }, + { + "epoch": 0.6646427848613641, + "grad_norm": 0.28668951988220215, + "learning_rate": 0.00015042177266725833, + "loss": 0.2467, + "step": 3284 + }, + { + "epoch": 0.6648451730418944, + "grad_norm": 0.3118850886821747, + "learning_rate": 0.00015039430220388224, + "loss": 0.2444, + "step": 3285 + }, + { + "epoch": 0.6650475612224246, + "grad_norm": 0.2920859456062317, + "learning_rate": 0.00015036682664213943, + "loss": 0.2092, + "step": 3286 + }, + { + "epoch": 0.6652499494029549, + "grad_norm": 0.29595354199409485, + "learning_rate": 0.0001503393459848096, + "loss": 0.2176, + "step": 3287 + }, + { + "epoch": 0.6654523375834851, + "grad_norm": 0.3060019910335541, + "learning_rate": 0.00015031186023467297, + "loss": 0.2565, + "step": 3288 + }, + { + "epoch": 0.6656547257640154, + "grad_norm": 0.24143262207508087, + "learning_rate": 0.00015028436939451022, + "loss": 0.2045, + "step": 3289 + }, + { + "epoch": 0.6658571139445456, + "grad_norm": 0.27763256430625916, + "learning_rate": 0.0001502568734671026, + "loss": 0.2291, + "step": 3290 + }, + { + "epoch": 0.6660595021250759, + "grad_norm": 0.34247729182243347, + "learning_rate": 0.0001502293724552319, + "loss": 0.2684, + "step": 3291 + }, + { + "epoch": 0.6662618903056061, + "grad_norm": 0.27955156564712524, + "learning_rate": 0.0001502018663616803, + "loss": 0.2002, + "step": 3292 + }, + { + "epoch": 0.6664642784861364, + "grad_norm": 0.31482434272766113, + "learning_rate": 0.00015017435518923064, + "loss": 0.2126, + "step": 3293 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.3705318868160248, + "learning_rate": 0.0001501468389406662, + "loss": 0.2389, + "step": 3294 + }, + { + "epoch": 0.6668690548471969, + "grad_norm": 0.38991278409957886, + "learning_rate": 0.0001501193176187708, + "loss": 0.2238, + "step": 3295 + }, + { + "epoch": 0.6670714430277271, + "grad_norm": 0.39363741874694824, + "learning_rate": 0.00015009179122632872, + "loss": 0.2273, + "step": 3296 + }, + { + "epoch": 0.6672738312082575, + "grad_norm": 0.27503731846809387, + "learning_rate": 0.00015006425976612478, + "loss": 0.2356, + "step": 3297 + }, + { + "epoch": 0.6674762193887877, + "grad_norm": 0.3818816840648651, + "learning_rate": 0.00015003672324094439, + "loss": 0.2442, + "step": 3298 + }, + { + "epoch": 0.667678607569318, + "grad_norm": 0.2479441612958908, + "learning_rate": 0.00015000918165357336, + "loss": 0.2077, + "step": 3299 + }, + { + "epoch": 0.6678809957498482, + "grad_norm": 0.3392878770828247, + "learning_rate": 0.00014998163500679808, + "loss": 0.2185, + "step": 3300 + }, + { + "epoch": 0.6678809957498482, + "eval_loss": 0.2743344008922577, + "eval_runtime": 0.739, + "eval_samples_per_second": 6.766, + "eval_steps_per_second": 1.353, + "step": 3300 + }, + { + "epoch": 0.6680833839303785, + "grad_norm": 0.4281541705131531, + "learning_rate": 0.0001499540833034054, + "loss": 0.2606, + "step": 3301 + }, + { + "epoch": 0.6682857721109087, + "grad_norm": 0.3157845139503479, + "learning_rate": 0.00014992652654618275, + "loss": 0.2454, + "step": 3302 + }, + { + "epoch": 0.668488160291439, + "grad_norm": 0.35366958379745483, + "learning_rate": 0.00014989896473791803, + "loss": 0.2188, + "step": 3303 + }, + { + "epoch": 0.6686905484719692, + "grad_norm": 0.28584495186805725, + "learning_rate": 0.0001498713978813996, + "loss": 0.2106, + "step": 3304 + }, + { + "epoch": 0.6688929366524995, + "grad_norm": 0.32046595215797424, + "learning_rate": 0.0001498438259794165, + "loss": 0.2289, + "step": 3305 + }, + { + "epoch": 0.6690953248330298, + "grad_norm": 0.2838146388530731, + "learning_rate": 0.00014981624903475803, + "loss": 0.2355, + "step": 3306 + }, + { + "epoch": 0.66929771301356, + "grad_norm": 0.34235918521881104, + "learning_rate": 0.00014978866705021423, + "loss": 0.282, + "step": 3307 + }, + { + "epoch": 0.6695001011940903, + "grad_norm": 0.2716708183288574, + "learning_rate": 0.0001497610800285755, + "loss": 0.2327, + "step": 3308 + }, + { + "epoch": 0.6697024893746205, + "grad_norm": 0.30932801961898804, + "learning_rate": 0.00014973348797263283, + "loss": 0.2531, + "step": 3309 + }, + { + "epoch": 0.6699048775551508, + "grad_norm": 0.3900741636753082, + "learning_rate": 0.0001497058908851777, + "loss": 0.2124, + "step": 3310 + }, + { + "epoch": 0.670107265735681, + "grad_norm": 0.3228439688682556, + "learning_rate": 0.0001496782887690021, + "loss": 0.2598, + "step": 3311 + }, + { + "epoch": 0.6703096539162113, + "grad_norm": 0.2500062882900238, + "learning_rate": 0.0001496506816268985, + "loss": 0.2102, + "step": 3312 + }, + { + "epoch": 0.6705120420967415, + "grad_norm": 0.2975091338157654, + "learning_rate": 0.0001496230694616599, + "loss": 0.2676, + "step": 3313 + }, + { + "epoch": 0.6707144302772718, + "grad_norm": 0.3403729200363159, + "learning_rate": 0.00014959545227607982, + "loss": 0.2706, + "step": 3314 + }, + { + "epoch": 0.670916818457802, + "grad_norm": 0.2906215786933899, + "learning_rate": 0.0001495678300729523, + "loss": 0.2226, + "step": 3315 + }, + { + "epoch": 0.6711192066383324, + "grad_norm": 0.3046065866947174, + "learning_rate": 0.00014954020285507183, + "loss": 0.2482, + "step": 3316 + }, + { + "epoch": 0.6713215948188626, + "grad_norm": 0.3264645040035248, + "learning_rate": 0.0001495125706252335, + "loss": 0.2251, + "step": 3317 + }, + { + "epoch": 0.6715239829993929, + "grad_norm": 0.2815634608268738, + "learning_rate": 0.00014948493338623275, + "loss": 0.2253, + "step": 3318 + }, + { + "epoch": 0.6717263711799231, + "grad_norm": 0.2859968841075897, + "learning_rate": 0.00014945729114086568, + "loss": 0.2151, + "step": 3319 + }, + { + "epoch": 0.6719287593604534, + "grad_norm": 0.3038029968738556, + "learning_rate": 0.0001494296438919289, + "loss": 0.2243, + "step": 3320 + }, + { + "epoch": 0.6721311475409836, + "grad_norm": 0.3037087917327881, + "learning_rate": 0.00014940199164221936, + "loss": 0.2564, + "step": 3321 + }, + { + "epoch": 0.6723335357215139, + "grad_norm": 0.30161625146865845, + "learning_rate": 0.00014937433439453466, + "loss": 0.2691, + "step": 3322 + }, + { + "epoch": 0.6725359239020441, + "grad_norm": 0.37410643696784973, + "learning_rate": 0.0001493466721516729, + "loss": 0.244, + "step": 3323 + }, + { + "epoch": 0.6727383120825744, + "grad_norm": 0.4140486717224121, + "learning_rate": 0.00014931900491643266, + "loss": 0.305, + "step": 3324 + }, + { + "epoch": 0.6729407002631046, + "grad_norm": 0.355758935213089, + "learning_rate": 0.00014929133269161296, + "loss": 0.2452, + "step": 3325 + }, + { + "epoch": 0.6731430884436349, + "grad_norm": 0.32681307196617126, + "learning_rate": 0.00014926365548001346, + "loss": 0.2507, + "step": 3326 + }, + { + "epoch": 0.6733454766241651, + "grad_norm": 0.3159436583518982, + "learning_rate": 0.00014923597328443422, + "loss": 0.2556, + "step": 3327 + }, + { + "epoch": 0.6735478648046954, + "grad_norm": 0.32382968068122864, + "learning_rate": 0.00014920828610767584, + "loss": 0.2504, + "step": 3328 + }, + { + "epoch": 0.6737502529852256, + "grad_norm": 0.3211047649383545, + "learning_rate": 0.0001491805939525394, + "loss": 0.2735, + "step": 3329 + }, + { + "epoch": 0.6739526411657559, + "grad_norm": 0.4596484303474426, + "learning_rate": 0.0001491528968218265, + "loss": 0.2799, + "step": 3330 + }, + { + "epoch": 0.6741550293462861, + "grad_norm": 0.347531259059906, + "learning_rate": 0.00014912519471833922, + "loss": 0.3237, + "step": 3331 + }, + { + "epoch": 0.6743574175268164, + "grad_norm": 0.7053202986717224, + "learning_rate": 0.00014909748764488026, + "loss": 0.2516, + "step": 3332 + }, + { + "epoch": 0.6745598057073466, + "grad_norm": 0.2955659329891205, + "learning_rate": 0.00014906977560425264, + "loss": 0.2788, + "step": 3333 + }, + { + "epoch": 0.674762193887877, + "grad_norm": 0.32625612616539, + "learning_rate": 0.00014904205859926002, + "loss": 0.2353, + "step": 3334 + }, + { + "epoch": 0.6749645820684073, + "grad_norm": 0.28343072533607483, + "learning_rate": 0.00014901433663270649, + "loss": 0.2202, + "step": 3335 + }, + { + "epoch": 0.6751669702489375, + "grad_norm": 0.3389117121696472, + "learning_rate": 0.0001489866097073967, + "loss": 0.211, + "step": 3336 + }, + { + "epoch": 0.6753693584294678, + "grad_norm": 0.3158564865589142, + "learning_rate": 0.00014895887782613576, + "loss": 0.2403, + "step": 3337 + }, + { + "epoch": 0.675571746609998, + "grad_norm": 0.33958199620246887, + "learning_rate": 0.00014893114099172924, + "loss": 0.2384, + "step": 3338 + }, + { + "epoch": 0.6757741347905283, + "grad_norm": 0.2601412236690521, + "learning_rate": 0.00014890339920698334, + "loss": 0.2303, + "step": 3339 + }, + { + "epoch": 0.6759765229710585, + "grad_norm": 0.253071129322052, + "learning_rate": 0.00014887565247470464, + "loss": 0.235, + "step": 3340 + }, + { + "epoch": 0.6761789111515888, + "grad_norm": 0.3102857172489166, + "learning_rate": 0.00014884790079770026, + "loss": 0.2194, + "step": 3341 + }, + { + "epoch": 0.676381299332119, + "grad_norm": 0.2947596311569214, + "learning_rate": 0.00014882014417877783, + "loss": 0.2323, + "step": 3342 + }, + { + "epoch": 0.6765836875126493, + "grad_norm": 0.30757614970207214, + "learning_rate": 0.0001487923826207455, + "loss": 0.2188, + "step": 3343 + }, + { + "epoch": 0.6767860756931795, + "grad_norm": 0.27366071939468384, + "learning_rate": 0.00014876461612641184, + "loss": 0.2439, + "step": 3344 + }, + { + "epoch": 0.6769884638737098, + "grad_norm": 0.28723663091659546, + "learning_rate": 0.000148736844698586, + "loss": 0.2536, + "step": 3345 + }, + { + "epoch": 0.67719085205424, + "grad_norm": 0.25499916076660156, + "learning_rate": 0.00014870906834007762, + "loss": 0.2133, + "step": 3346 + }, + { + "epoch": 0.6773932402347703, + "grad_norm": 0.3012092113494873, + "learning_rate": 0.0001486812870536968, + "loss": 0.2519, + "step": 3347 + }, + { + "epoch": 0.6775956284153005, + "grad_norm": 0.35855376720428467, + "learning_rate": 0.00014865350084225415, + "loss": 0.3027, + "step": 3348 + }, + { + "epoch": 0.6777980165958308, + "grad_norm": 0.2527807652950287, + "learning_rate": 0.00014862570970856082, + "loss": 0.1778, + "step": 3349 + }, + { + "epoch": 0.678000404776361, + "grad_norm": 0.36591532826423645, + "learning_rate": 0.0001485979136554284, + "loss": 0.2506, + "step": 3350 + }, + { + "epoch": 0.678000404776361, + "eval_loss": 0.27230167388916016, + "eval_runtime": 0.7416, + "eval_samples_per_second": 6.743, + "eval_steps_per_second": 1.349, + "step": 3350 + }, + { + "epoch": 0.6782027929568913, + "grad_norm": 0.38087227940559387, + "learning_rate": 0.000148570112685669, + "loss": 0.2004, + "step": 3351 + }, + { + "epoch": 0.6784051811374215, + "grad_norm": 0.3207904100418091, + "learning_rate": 0.00014854230680209525, + "loss": 0.234, + "step": 3352 + }, + { + "epoch": 0.6786075693179519, + "grad_norm": 0.2659710943698883, + "learning_rate": 0.00014851449600752025, + "loss": 0.2075, + "step": 3353 + }, + { + "epoch": 0.678809957498482, + "grad_norm": 0.2757861614227295, + "learning_rate": 0.0001484866803047576, + "loss": 0.2163, + "step": 3354 + }, + { + "epoch": 0.6790123456790124, + "grad_norm": 0.32871562242507935, + "learning_rate": 0.0001484588596966214, + "loss": 0.2626, + "step": 3355 + }, + { + "epoch": 0.6792147338595426, + "grad_norm": 0.29668277502059937, + "learning_rate": 0.0001484310341859262, + "loss": 0.2489, + "step": 3356 + }, + { + "epoch": 0.6794171220400729, + "grad_norm": 0.2997402548789978, + "learning_rate": 0.0001484032037754872, + "loss": 0.2553, + "step": 3357 + }, + { + "epoch": 0.6796195102206031, + "grad_norm": 0.3057543933391571, + "learning_rate": 0.00014837536846811994, + "loss": 0.2624, + "step": 3358 + }, + { + "epoch": 0.6798218984011334, + "grad_norm": 0.310094952583313, + "learning_rate": 0.00014834752826664045, + "loss": 0.2447, + "step": 3359 + }, + { + "epoch": 0.6800242865816636, + "grad_norm": 0.317460834980011, + "learning_rate": 0.00014831968317386538, + "loss": 0.2225, + "step": 3360 + }, + { + "epoch": 0.6802266747621939, + "grad_norm": 0.2607171833515167, + "learning_rate": 0.0001482918331926118, + "loss": 0.2148, + "step": 3361 + }, + { + "epoch": 0.6804290629427241, + "grad_norm": 0.2784850001335144, + "learning_rate": 0.00014826397832569721, + "loss": 0.2148, + "step": 3362 + }, + { + "epoch": 0.6806314511232544, + "grad_norm": 0.3371258080005646, + "learning_rate": 0.00014823611857593972, + "loss": 0.2474, + "step": 3363 + }, + { + "epoch": 0.6808338393037846, + "grad_norm": 0.33104196190834045, + "learning_rate": 0.00014820825394615793, + "loss": 0.2485, + "step": 3364 + }, + { + "epoch": 0.6810362274843149, + "grad_norm": 0.3230952024459839, + "learning_rate": 0.00014818038443917083, + "loss": 0.2664, + "step": 3365 + }, + { + "epoch": 0.6812386156648452, + "grad_norm": 0.7129462957382202, + "learning_rate": 0.00014815251005779797, + "loss": 0.2436, + "step": 3366 + }, + { + "epoch": 0.6814410038453754, + "grad_norm": 0.3523089289665222, + "learning_rate": 0.00014812463080485943, + "loss": 0.2529, + "step": 3367 + }, + { + "epoch": 0.6816433920259057, + "grad_norm": 0.2964981198310852, + "learning_rate": 0.0001480967466831757, + "loss": 0.2468, + "step": 3368 + }, + { + "epoch": 0.6818457802064359, + "grad_norm": 0.3880394697189331, + "learning_rate": 0.0001480688576955678, + "loss": 0.2625, + "step": 3369 + }, + { + "epoch": 0.6820481683869662, + "grad_norm": 0.3858512341976166, + "learning_rate": 0.00014804096384485728, + "loss": 0.2777, + "step": 3370 + }, + { + "epoch": 0.6822505565674964, + "grad_norm": 0.323681116104126, + "learning_rate": 0.00014801306513386614, + "loss": 0.2733, + "step": 3371 + }, + { + "epoch": 0.6824529447480268, + "grad_norm": 0.309332937002182, + "learning_rate": 0.0001479851615654168, + "loss": 0.2427, + "step": 3372 + }, + { + "epoch": 0.682655332928557, + "grad_norm": 0.3008730411529541, + "learning_rate": 0.00014795725314233237, + "loss": 0.2485, + "step": 3373 + }, + { + "epoch": 0.6828577211090873, + "grad_norm": 0.34555160999298096, + "learning_rate": 0.0001479293398674363, + "loss": 0.2254, + "step": 3374 + }, + { + "epoch": 0.6830601092896175, + "grad_norm": 0.339863121509552, + "learning_rate": 0.0001479014217435525, + "loss": 0.2535, + "step": 3375 + }, + { + "epoch": 0.6832624974701478, + "grad_norm": 0.43282923102378845, + "learning_rate": 0.00014787349877350546, + "loss": 0.2536, + "step": 3376 + }, + { + "epoch": 0.683464885650678, + "grad_norm": 0.304993599653244, + "learning_rate": 0.00014784557096012016, + "loss": 0.1946, + "step": 3377 + }, + { + "epoch": 0.6836672738312083, + "grad_norm": 0.29952695965766907, + "learning_rate": 0.00014781763830622202, + "loss": 0.2514, + "step": 3378 + }, + { + "epoch": 0.6838696620117385, + "grad_norm": 0.4388863444328308, + "learning_rate": 0.00014778970081463699, + "loss": 0.247, + "step": 3379 + }, + { + "epoch": 0.6840720501922688, + "grad_norm": 0.3819953501224518, + "learning_rate": 0.0001477617584881915, + "loss": 0.2696, + "step": 3380 + }, + { + "epoch": 0.684274438372799, + "grad_norm": 0.2509651184082031, + "learning_rate": 0.00014773381132971241, + "loss": 0.2177, + "step": 3381 + }, + { + "epoch": 0.6844768265533293, + "grad_norm": 0.3718028962612152, + "learning_rate": 0.00014770585934202715, + "loss": 0.2678, + "step": 3382 + }, + { + "epoch": 0.6846792147338595, + "grad_norm": 0.3376442790031433, + "learning_rate": 0.00014767790252796366, + "loss": 0.251, + "step": 3383 + }, + { + "epoch": 0.6848816029143898, + "grad_norm": 0.28158965706825256, + "learning_rate": 0.00014764994089035027, + "loss": 0.2431, + "step": 3384 + }, + { + "epoch": 0.68508399109492, + "grad_norm": 0.23821482062339783, + "learning_rate": 0.00014762197443201583, + "loss": 0.2131, + "step": 3385 + }, + { + "epoch": 0.6852863792754503, + "grad_norm": 0.29754817485809326, + "learning_rate": 0.00014759400315578967, + "loss": 0.2601, + "step": 3386 + }, + { + "epoch": 0.6854887674559805, + "grad_norm": 0.27643054723739624, + "learning_rate": 0.0001475660270645017, + "loss": 0.2396, + "step": 3387 + }, + { + "epoch": 0.6856911556365108, + "grad_norm": 0.3064822554588318, + "learning_rate": 0.00014753804616098225, + "loss": 0.2026, + "step": 3388 + }, + { + "epoch": 0.685893543817041, + "grad_norm": 0.3296985626220703, + "learning_rate": 0.00014751006044806203, + "loss": 0.2439, + "step": 3389 + }, + { + "epoch": 0.6860959319975714, + "grad_norm": 0.3113783597946167, + "learning_rate": 0.00014748206992857245, + "loss": 0.2038, + "step": 3390 + }, + { + "epoch": 0.6862983201781016, + "grad_norm": 0.32076701521873474, + "learning_rate": 0.00014745407460534525, + "loss": 0.2497, + "step": 3391 + }, + { + "epoch": 0.6865007083586319, + "grad_norm": 0.290251225233078, + "learning_rate": 0.0001474260744812127, + "loss": 0.2094, + "step": 3392 + }, + { + "epoch": 0.6867030965391621, + "grad_norm": 0.33005815744400024, + "learning_rate": 0.0001473980695590076, + "loss": 0.1986, + "step": 3393 + }, + { + "epoch": 0.6869054847196924, + "grad_norm": 0.40082716941833496, + "learning_rate": 0.00014737005984156318, + "loss": 0.2553, + "step": 3394 + }, + { + "epoch": 0.6871078729002226, + "grad_norm": 0.29623132944107056, + "learning_rate": 0.00014734204533171311, + "loss": 0.2455, + "step": 3395 + }, + { + "epoch": 0.6873102610807529, + "grad_norm": 0.3471490144729614, + "learning_rate": 0.00014731402603229167, + "loss": 0.2865, + "step": 3396 + }, + { + "epoch": 0.6875126492612832, + "grad_norm": 0.4119548499584198, + "learning_rate": 0.00014728600194613355, + "loss": 0.2478, + "step": 3397 + }, + { + "epoch": 0.6877150374418134, + "grad_norm": 0.35273849964141846, + "learning_rate": 0.00014725797307607388, + "loss": 0.2914, + "step": 3398 + }, + { + "epoch": 0.6879174256223437, + "grad_norm": 0.33130013942718506, + "learning_rate": 0.0001472299394249484, + "loss": 0.2617, + "step": 3399 + }, + { + "epoch": 0.6881198138028739, + "grad_norm": 0.2648128569126129, + "learning_rate": 0.0001472019009955932, + "loss": 0.2046, + "step": 3400 + }, + { + "epoch": 0.6881198138028739, + "eval_loss": 0.26742979884147644, + "eval_runtime": 0.7415, + "eval_samples_per_second": 6.743, + "eval_steps_per_second": 1.349, + "step": 3400 + }, + { + "epoch": 0.6883222019834042, + "grad_norm": 0.26487231254577637, + "learning_rate": 0.00014717385779084493, + "loss": 0.2381, + "step": 3401 + }, + { + "epoch": 0.6885245901639344, + "grad_norm": 0.3537691831588745, + "learning_rate": 0.00014714580981354077, + "loss": 0.2545, + "step": 3402 + }, + { + "epoch": 0.6887269783444647, + "grad_norm": 0.27148503065109253, + "learning_rate": 0.00014711775706651822, + "loss": 0.2377, + "step": 3403 + }, + { + "epoch": 0.6889293665249949, + "grad_norm": 0.30252805352211, + "learning_rate": 0.00014708969955261545, + "loss": 0.2388, + "step": 3404 + }, + { + "epoch": 0.6891317547055252, + "grad_norm": 0.26469656825065613, + "learning_rate": 0.00014706163727467097, + "loss": 0.2158, + "step": 3405 + }, + { + "epoch": 0.6893341428860554, + "grad_norm": 0.25582897663116455, + "learning_rate": 0.00014703357023552384, + "loss": 0.1943, + "step": 3406 + }, + { + "epoch": 0.6895365310665857, + "grad_norm": 0.22913604974746704, + "learning_rate": 0.0001470054984380136, + "loss": 0.2137, + "step": 3407 + }, + { + "epoch": 0.689738919247116, + "grad_norm": 0.3478175103664398, + "learning_rate": 0.00014697742188498025, + "loss": 0.2524, + "step": 3408 + }, + { + "epoch": 0.6899413074276463, + "grad_norm": 0.3166928291320801, + "learning_rate": 0.00014694934057926426, + "loss": 0.2543, + "step": 3409 + }, + { + "epoch": 0.6901436956081765, + "grad_norm": 0.36977559328079224, + "learning_rate": 0.00014692125452370663, + "loss": 0.2227, + "step": 3410 + }, + { + "epoch": 0.6903460837887068, + "grad_norm": 0.24233773350715637, + "learning_rate": 0.00014689316372114883, + "loss": 0.2042, + "step": 3411 + }, + { + "epoch": 0.690548471969237, + "grad_norm": 0.3313998878002167, + "learning_rate": 0.00014686506817443274, + "loss": 0.2213, + "step": 3412 + }, + { + "epoch": 0.6907508601497673, + "grad_norm": 0.3995082378387451, + "learning_rate": 0.00014683696788640082, + "loss": 0.259, + "step": 3413 + }, + { + "epoch": 0.6909532483302975, + "grad_norm": 0.30752405524253845, + "learning_rate": 0.00014680886285989595, + "loss": 0.28, + "step": 3414 + }, + { + "epoch": 0.6911556365108278, + "grad_norm": 0.3397967219352722, + "learning_rate": 0.00014678075309776148, + "loss": 0.2488, + "step": 3415 + }, + { + "epoch": 0.691358024691358, + "grad_norm": 0.26995396614074707, + "learning_rate": 0.00014675263860284128, + "loss": 0.2255, + "step": 3416 + }, + { + "epoch": 0.6915604128718883, + "grad_norm": 0.2898835241794586, + "learning_rate": 0.00014672451937797968, + "loss": 0.2517, + "step": 3417 + }, + { + "epoch": 0.6917628010524185, + "grad_norm": 0.30538830161094666, + "learning_rate": 0.00014669639542602147, + "loss": 0.2225, + "step": 3418 + }, + { + "epoch": 0.6919651892329488, + "grad_norm": 0.2842797040939331, + "learning_rate": 0.00014666826674981196, + "loss": 0.2615, + "step": 3419 + }, + { + "epoch": 0.692167577413479, + "grad_norm": 0.36719465255737305, + "learning_rate": 0.0001466401333521969, + "loss": 0.218, + "step": 3420 + }, + { + "epoch": 0.6923699655940093, + "grad_norm": 0.33825409412384033, + "learning_rate": 0.00014661199523602255, + "loss": 0.2335, + "step": 3421 + }, + { + "epoch": 0.6925723537745395, + "grad_norm": 0.32189881801605225, + "learning_rate": 0.0001465838524041356, + "loss": 0.261, + "step": 3422 + }, + { + "epoch": 0.6927747419550698, + "grad_norm": 0.3095569610595703, + "learning_rate": 0.0001465557048593833, + "loss": 0.2225, + "step": 3423 + }, + { + "epoch": 0.6929771301356, + "grad_norm": 0.3033314645290375, + "learning_rate": 0.00014652755260461325, + "loss": 0.2272, + "step": 3424 + }, + { + "epoch": 0.6931795183161303, + "grad_norm": 0.2859857380390167, + "learning_rate": 0.00014649939564267362, + "loss": 0.2419, + "step": 3425 + }, + { + "epoch": 0.6933819064966606, + "grad_norm": 0.2597350776195526, + "learning_rate": 0.0001464712339764131, + "loss": 0.2393, + "step": 3426 + }, + { + "epoch": 0.6935842946771908, + "grad_norm": 0.305396169424057, + "learning_rate": 0.0001464430676086807, + "loss": 0.2382, + "step": 3427 + }, + { + "epoch": 0.6937866828577212, + "grad_norm": 0.3529883921146393, + "learning_rate": 0.0001464148965423261, + "loss": 0.2664, + "step": 3428 + }, + { + "epoch": 0.6939890710382514, + "grad_norm": 0.3151325285434723, + "learning_rate": 0.00014638672078019926, + "loss": 0.2202, + "step": 3429 + }, + { + "epoch": 0.6941914592187817, + "grad_norm": 0.27725449204444885, + "learning_rate": 0.00014635854032515072, + "loss": 0.2431, + "step": 3430 + }, + { + "epoch": 0.6943938473993119, + "grad_norm": 0.2587081491947174, + "learning_rate": 0.00014633035518003153, + "loss": 0.2224, + "step": 3431 + }, + { + "epoch": 0.6945962355798422, + "grad_norm": 0.26528117060661316, + "learning_rate": 0.00014630216534769312, + "loss": 0.2067, + "step": 3432 + }, + { + "epoch": 0.6947986237603724, + "grad_norm": 0.28663206100463867, + "learning_rate": 0.00014627397083098747, + "loss": 0.2205, + "step": 3433 + }, + { + "epoch": 0.6950010119409027, + "grad_norm": 0.2610286474227905, + "learning_rate": 0.00014624577163276702, + "loss": 0.2594, + "step": 3434 + }, + { + "epoch": 0.6952034001214329, + "grad_norm": 0.48873916268348694, + "learning_rate": 0.00014621756775588462, + "loss": 0.2415, + "step": 3435 + }, + { + "epoch": 0.6954057883019632, + "grad_norm": 0.27237123250961304, + "learning_rate": 0.00014618935920319368, + "loss": 0.2283, + "step": 3436 + }, + { + "epoch": 0.6956081764824934, + "grad_norm": 0.3596338629722595, + "learning_rate": 0.00014616114597754804, + "loss": 0.2363, + "step": 3437 + }, + { + "epoch": 0.6958105646630237, + "grad_norm": 0.30172982811927795, + "learning_rate": 0.00014613292808180202, + "loss": 0.2657, + "step": 3438 + }, + { + "epoch": 0.6960129528435539, + "grad_norm": 0.2471524178981781, + "learning_rate": 0.00014610470551881037, + "loss": 0.2093, + "step": 3439 + }, + { + "epoch": 0.6962153410240842, + "grad_norm": 0.3113197088241577, + "learning_rate": 0.00014607647829142844, + "loss": 0.2763, + "step": 3440 + }, + { + "epoch": 0.6964177292046144, + "grad_norm": 0.614896833896637, + "learning_rate": 0.00014604824640251186, + "loss": 0.2017, + "step": 3441 + }, + { + "epoch": 0.6966201173851447, + "grad_norm": 0.27417445182800293, + "learning_rate": 0.00014602000985491692, + "loss": 0.2402, + "step": 3442 + }, + { + "epoch": 0.6968225055656749, + "grad_norm": 0.29607483744621277, + "learning_rate": 0.00014599176865150027, + "loss": 0.263, + "step": 3443 + }, + { + "epoch": 0.6970248937462052, + "grad_norm": 0.348371684551239, + "learning_rate": 0.00014596352279511903, + "loss": 0.2236, + "step": 3444 + }, + { + "epoch": 0.6972272819267354, + "grad_norm": 0.2907358407974243, + "learning_rate": 0.00014593527228863083, + "loss": 0.213, + "step": 3445 + }, + { + "epoch": 0.6974296701072658, + "grad_norm": 0.3474874496459961, + "learning_rate": 0.00014590701713489382, + "loss": 0.2123, + "step": 3446 + }, + { + "epoch": 0.697632058287796, + "grad_norm": 0.2992868721485138, + "learning_rate": 0.00014587875733676646, + "loss": 0.2418, + "step": 3447 + }, + { + "epoch": 0.6978344464683263, + "grad_norm": 0.37498465180397034, + "learning_rate": 0.0001458504928971079, + "loss": 0.2517, + "step": 3448 + }, + { + "epoch": 0.6980368346488565, + "grad_norm": 0.24434925615787506, + "learning_rate": 0.0001458222238187775, + "loss": 0.2309, + "step": 3449 + }, + { + "epoch": 0.6982392228293868, + "grad_norm": 0.3073577880859375, + "learning_rate": 0.00014579395010463537, + "loss": 0.2517, + "step": 3450 + }, + { + "epoch": 0.6982392228293868, + "eval_loss": 0.26842841506004333, + "eval_runtime": 0.7378, + "eval_samples_per_second": 6.777, + "eval_steps_per_second": 1.355, + "step": 3450 + }, + { + "epoch": 0.698441611009917, + "grad_norm": 0.3026072382926941, + "learning_rate": 0.00014576567175754183, + "loss": 0.2191, + "step": 3451 + }, + { + "epoch": 0.6986439991904473, + "grad_norm": 0.3033480644226074, + "learning_rate": 0.00014573738878035785, + "loss": 0.2746, + "step": 3452 + }, + { + "epoch": 0.6988463873709775, + "grad_norm": 0.32554948329925537, + "learning_rate": 0.0001457091011759448, + "loss": 0.2847, + "step": 3453 + }, + { + "epoch": 0.6990487755515078, + "grad_norm": 0.33661654591560364, + "learning_rate": 0.0001456808089471645, + "loss": 0.2413, + "step": 3454 + }, + { + "epoch": 0.699251163732038, + "grad_norm": 0.24390999972820282, + "learning_rate": 0.00014565251209687927, + "loss": 0.2142, + "step": 3455 + }, + { + "epoch": 0.6994535519125683, + "grad_norm": 0.3766001760959625, + "learning_rate": 0.00014562421062795192, + "loss": 0.2578, + "step": 3456 + }, + { + "epoch": 0.6996559400930986, + "grad_norm": 0.31514355540275574, + "learning_rate": 0.00014559590454324564, + "loss": 0.223, + "step": 3457 + }, + { + "epoch": 0.6998583282736288, + "grad_norm": 0.43481573462486267, + "learning_rate": 0.00014556759384562416, + "loss": 0.2558, + "step": 3458 + }, + { + "epoch": 0.7000607164541591, + "grad_norm": 0.2606533169746399, + "learning_rate": 0.00014553927853795169, + "loss": 0.207, + "step": 3459 + }, + { + "epoch": 0.7002631046346893, + "grad_norm": 0.3021109402179718, + "learning_rate": 0.00014551095862309286, + "loss": 0.2596, + "step": 3460 + }, + { + "epoch": 0.7004654928152196, + "grad_norm": 0.4843895733356476, + "learning_rate": 0.0001454826341039128, + "loss": 0.1994, + "step": 3461 + }, + { + "epoch": 0.7006678809957498, + "grad_norm": 0.313245564699173, + "learning_rate": 0.00014545430498327702, + "loss": 0.2487, + "step": 3462 + }, + { + "epoch": 0.7008702691762801, + "grad_norm": 0.24608662724494934, + "learning_rate": 0.00014542597126405163, + "loss": 0.219, + "step": 3463 + }, + { + "epoch": 0.7010726573568103, + "grad_norm": 0.27048546075820923, + "learning_rate": 0.00014539763294910311, + "loss": 0.2236, + "step": 3464 + }, + { + "epoch": 0.7012750455373407, + "grad_norm": 0.3017890453338623, + "learning_rate": 0.00014536929004129844, + "loss": 0.2483, + "step": 3465 + }, + { + "epoch": 0.7014774337178709, + "grad_norm": 0.40976661443710327, + "learning_rate": 0.00014534094254350506, + "loss": 0.2344, + "step": 3466 + }, + { + "epoch": 0.7016798218984012, + "grad_norm": 0.3057340979576111, + "learning_rate": 0.00014531259045859086, + "loss": 0.2256, + "step": 3467 + }, + { + "epoch": 0.7018822100789314, + "grad_norm": 0.37210866808891296, + "learning_rate": 0.0001452842337894242, + "loss": 0.254, + "step": 3468 + }, + { + "epoch": 0.7020845982594617, + "grad_norm": 0.2670847177505493, + "learning_rate": 0.00014525587253887395, + "loss": 0.246, + "step": 3469 + }, + { + "epoch": 0.7022869864399919, + "grad_norm": 0.5012633800506592, + "learning_rate": 0.0001452275067098094, + "loss": 0.2463, + "step": 3470 + }, + { + "epoch": 0.7024893746205222, + "grad_norm": 0.38355302810668945, + "learning_rate": 0.00014519913630510028, + "loss": 0.2273, + "step": 3471 + }, + { + "epoch": 0.7026917628010524, + "grad_norm": 0.34492430090904236, + "learning_rate": 0.00014517076132761686, + "loss": 0.2533, + "step": 3472 + }, + { + "epoch": 0.7028941509815827, + "grad_norm": 0.4136013984680176, + "learning_rate": 0.0001451423817802297, + "loss": 0.2704, + "step": 3473 + }, + { + "epoch": 0.7030965391621129, + "grad_norm": 0.3448164761066437, + "learning_rate": 0.00014511399766581006, + "loss": 0.2584, + "step": 3474 + }, + { + "epoch": 0.7032989273426432, + "grad_norm": 0.287451833486557, + "learning_rate": 0.00014508560898722952, + "loss": 0.2235, + "step": 3475 + }, + { + "epoch": 0.7035013155231734, + "grad_norm": 0.3065091669559479, + "learning_rate": 0.0001450572157473601, + "loss": 0.2619, + "step": 3476 + }, + { + "epoch": 0.7037037037037037, + "grad_norm": 0.2735513746738434, + "learning_rate": 0.00014502881794907442, + "loss": 0.2376, + "step": 3477 + }, + { + "epoch": 0.7039060918842339, + "grad_norm": 0.3324407637119293, + "learning_rate": 0.0001450004155952454, + "loss": 0.2468, + "step": 3478 + }, + { + "epoch": 0.7041084800647642, + "grad_norm": 0.3772326707839966, + "learning_rate": 0.00014497200868874652, + "loss": 0.241, + "step": 3479 + }, + { + "epoch": 0.7043108682452944, + "grad_norm": 0.2763902246952057, + "learning_rate": 0.00014494359723245167, + "loss": 0.2414, + "step": 3480 + }, + { + "epoch": 0.7045132564258247, + "grad_norm": 0.3471759259700775, + "learning_rate": 0.00014491518122923528, + "loss": 0.2279, + "step": 3481 + }, + { + "epoch": 0.7047156446063549, + "grad_norm": 0.3083358108997345, + "learning_rate": 0.00014488676068197208, + "loss": 0.2776, + "step": 3482 + }, + { + "epoch": 0.7049180327868853, + "grad_norm": 0.28121310472488403, + "learning_rate": 0.00014485833559353748, + "loss": 0.225, + "step": 3483 + }, + { + "epoch": 0.7051204209674155, + "grad_norm": 0.3246133625507355, + "learning_rate": 0.00014482990596680718, + "loss": 0.2256, + "step": 3484 + }, + { + "epoch": 0.7053228091479458, + "grad_norm": 0.29464036226272583, + "learning_rate": 0.00014480147180465734, + "loss": 0.2081, + "step": 3485 + }, + { + "epoch": 0.7055251973284761, + "grad_norm": 0.31715965270996094, + "learning_rate": 0.00014477303310996473, + "loss": 0.254, + "step": 3486 + }, + { + "epoch": 0.7057275855090063, + "grad_norm": 0.27292385697364807, + "learning_rate": 0.0001447445898856064, + "loss": 0.2361, + "step": 3487 + }, + { + "epoch": 0.7059299736895366, + "grad_norm": 0.2801492512226105, + "learning_rate": 0.00014471614213445996, + "loss": 0.2489, + "step": 3488 + }, + { + "epoch": 0.7061323618700668, + "grad_norm": 0.44114720821380615, + "learning_rate": 0.0001446876898594035, + "loss": 0.2458, + "step": 3489 + }, + { + "epoch": 0.7063347500505971, + "grad_norm": 0.33379796147346497, + "learning_rate": 0.00014465923306331544, + "loss": 0.2793, + "step": 3490 + }, + { + "epoch": 0.7065371382311273, + "grad_norm": 0.3142834007740021, + "learning_rate": 0.0001446307717490748, + "loss": 0.2649, + "step": 3491 + }, + { + "epoch": 0.7067395264116576, + "grad_norm": 0.31089064478874207, + "learning_rate": 0.00014460230591956097, + "loss": 0.2507, + "step": 3492 + }, + { + "epoch": 0.7069419145921878, + "grad_norm": 0.2820049524307251, + "learning_rate": 0.00014457383557765386, + "loss": 0.2474, + "step": 3493 + }, + { + "epoch": 0.7071443027727181, + "grad_norm": 0.2603655457496643, + "learning_rate": 0.00014454536072623373, + "loss": 0.2381, + "step": 3494 + }, + { + "epoch": 0.7073466909532483, + "grad_norm": 0.33057910203933716, + "learning_rate": 0.00014451688136818145, + "loss": 0.2445, + "step": 3495 + }, + { + "epoch": 0.7075490791337786, + "grad_norm": 0.30307960510253906, + "learning_rate": 0.0001444883975063782, + "loss": 0.2287, + "step": 3496 + }, + { + "epoch": 0.7077514673143088, + "grad_norm": 0.40663906931877136, + "learning_rate": 0.0001444599091437057, + "loss": 0.2353, + "step": 3497 + }, + { + "epoch": 0.7079538554948391, + "grad_norm": 0.30998459458351135, + "learning_rate": 0.0001444314162830461, + "loss": 0.2435, + "step": 3498 + }, + { + "epoch": 0.7081562436753693, + "grad_norm": 0.328166663646698, + "learning_rate": 0.00014440291892728205, + "loss": 0.2454, + "step": 3499 + }, + { + "epoch": 0.7083586318558996, + "grad_norm": 0.297428160905838, + "learning_rate": 0.00014437441707929657, + "loss": 0.2609, + "step": 3500 + }, + { + "epoch": 0.7083586318558996, + "eval_loss": 0.26719534397125244, + "eval_runtime": 0.7389, + "eval_samples_per_second": 6.766, + "eval_steps_per_second": 1.353, + "step": 3500 + }, + { + "epoch": 0.7085610200364298, + "grad_norm": 0.3447844982147217, + "learning_rate": 0.00014434591074197317, + "loss": 0.2263, + "step": 3501 + }, + { + "epoch": 0.7087634082169602, + "grad_norm": 0.36139580607414246, + "learning_rate": 0.00014431739991819584, + "loss": 0.2489, + "step": 3502 + }, + { + "epoch": 0.7089657963974904, + "grad_norm": 0.2836248576641083, + "learning_rate": 0.000144288884610849, + "loss": 0.225, + "step": 3503 + }, + { + "epoch": 0.7091681845780207, + "grad_norm": 0.36919328570365906, + "learning_rate": 0.00014426036482281752, + "loss": 0.2653, + "step": 3504 + }, + { + "epoch": 0.7093705727585509, + "grad_norm": 0.27661412954330444, + "learning_rate": 0.00014423184055698676, + "loss": 0.221, + "step": 3505 + }, + { + "epoch": 0.7095729609390812, + "grad_norm": 0.30864477157592773, + "learning_rate": 0.0001442033118162425, + "loss": 0.2465, + "step": 3506 + }, + { + "epoch": 0.7097753491196114, + "grad_norm": 0.2647888958454132, + "learning_rate": 0.00014417477860347098, + "loss": 0.2402, + "step": 3507 + }, + { + "epoch": 0.7099777373001417, + "grad_norm": 0.24299830198287964, + "learning_rate": 0.00014414624092155885, + "loss": 0.1997, + "step": 3508 + }, + { + "epoch": 0.7101801254806719, + "grad_norm": 0.323573499917984, + "learning_rate": 0.00014411769877339332, + "loss": 0.2546, + "step": 3509 + }, + { + "epoch": 0.7103825136612022, + "grad_norm": 0.2725816071033478, + "learning_rate": 0.0001440891521618619, + "loss": 0.2336, + "step": 3510 + }, + { + "epoch": 0.7105849018417324, + "grad_norm": 0.3446320593357086, + "learning_rate": 0.00014406060108985275, + "loss": 0.2503, + "step": 3511 + }, + { + "epoch": 0.7107872900222627, + "grad_norm": 0.35565873980522156, + "learning_rate": 0.00014403204556025427, + "loss": 0.283, + "step": 3512 + }, + { + "epoch": 0.7109896782027929, + "grad_norm": 0.27790892124176025, + "learning_rate": 0.00014400348557595544, + "loss": 0.2213, + "step": 3513 + }, + { + "epoch": 0.7111920663833232, + "grad_norm": 0.29456159472465515, + "learning_rate": 0.0001439749211398457, + "loss": 0.2316, + "step": 3514 + }, + { + "epoch": 0.7113944545638534, + "grad_norm": 0.24280983209609985, + "learning_rate": 0.00014394635225481477, + "loss": 0.2218, + "step": 3515 + }, + { + "epoch": 0.7115968427443837, + "grad_norm": 0.3166976571083069, + "learning_rate": 0.00014391777892375313, + "loss": 0.1995, + "step": 3516 + }, + { + "epoch": 0.711799230924914, + "grad_norm": 0.24177271127700806, + "learning_rate": 0.00014388920114955143, + "loss": 0.187, + "step": 3517 + }, + { + "epoch": 0.7120016191054442, + "grad_norm": 0.2750643193721771, + "learning_rate": 0.00014386061893510087, + "loss": 0.2276, + "step": 3518 + }, + { + "epoch": 0.7122040072859745, + "grad_norm": 0.3380866050720215, + "learning_rate": 0.0001438320322832931, + "loss": 0.2652, + "step": 3519 + }, + { + "epoch": 0.7124063954665047, + "grad_norm": 0.30895286798477173, + "learning_rate": 0.00014380344119702023, + "loss": 0.2374, + "step": 3520 + }, + { + "epoch": 0.7126087836470351, + "grad_norm": 0.3229091763496399, + "learning_rate": 0.0001437748456791748, + "loss": 0.2416, + "step": 3521 + }, + { + "epoch": 0.7128111718275653, + "grad_norm": 0.2924569845199585, + "learning_rate": 0.00014374624573264982, + "loss": 0.2296, + "step": 3522 + }, + { + "epoch": 0.7130135600080956, + "grad_norm": 0.28520187735557556, + "learning_rate": 0.00014371764136033872, + "loss": 0.2299, + "step": 3523 + }, + { + "epoch": 0.7132159481886258, + "grad_norm": 0.284915953874588, + "learning_rate": 0.0001436890325651354, + "loss": 0.2233, + "step": 3524 + }, + { + "epoch": 0.7134183363691561, + "grad_norm": 0.3498363792896271, + "learning_rate": 0.00014366041934993416, + "loss": 0.2379, + "step": 3525 + }, + { + "epoch": 0.7136207245496863, + "grad_norm": 0.29052790999412537, + "learning_rate": 0.00014363180171762983, + "loss": 0.2445, + "step": 3526 + }, + { + "epoch": 0.7138231127302166, + "grad_norm": 0.4334908127784729, + "learning_rate": 0.00014360317967111765, + "loss": 0.2351, + "step": 3527 + }, + { + "epoch": 0.7140255009107468, + "grad_norm": 0.2792114317417145, + "learning_rate": 0.00014357455321329328, + "loss": 0.2175, + "step": 3528 + }, + { + "epoch": 0.7142278890912771, + "grad_norm": 0.3908980190753937, + "learning_rate": 0.0001435459223470528, + "loss": 0.279, + "step": 3529 + }, + { + "epoch": 0.7144302772718073, + "grad_norm": 0.2973538339138031, + "learning_rate": 0.0001435172870752928, + "loss": 0.2501, + "step": 3530 + }, + { + "epoch": 0.7146326654523376, + "grad_norm": 0.2654437720775604, + "learning_rate": 0.00014348864740091038, + "loss": 0.2043, + "step": 3531 + }, + { + "epoch": 0.7148350536328678, + "grad_norm": 0.2809726297855377, + "learning_rate": 0.0001434600033268029, + "loss": 0.2059, + "step": 3532 + }, + { + "epoch": 0.7150374418133981, + "grad_norm": 0.27874892950057983, + "learning_rate": 0.00014343135485586828, + "loss": 0.2401, + "step": 3533 + }, + { + "epoch": 0.7152398299939283, + "grad_norm": 0.3021167814731598, + "learning_rate": 0.00014340270199100495, + "loss": 0.2552, + "step": 3534 + }, + { + "epoch": 0.7154422181744586, + "grad_norm": 0.3264015316963196, + "learning_rate": 0.0001433740447351116, + "loss": 0.2306, + "step": 3535 + }, + { + "epoch": 0.7156446063549888, + "grad_norm": 0.26608943939208984, + "learning_rate": 0.00014334538309108757, + "loss": 0.2053, + "step": 3536 + }, + { + "epoch": 0.7158469945355191, + "grad_norm": 0.3866048753261566, + "learning_rate": 0.00014331671706183246, + "loss": 0.2176, + "step": 3537 + }, + { + "epoch": 0.7160493827160493, + "grad_norm": 0.27158334851264954, + "learning_rate": 0.00014328804665024645, + "loss": 0.2598, + "step": 3538 + }, + { + "epoch": 0.7162517708965797, + "grad_norm": 0.4619157016277313, + "learning_rate": 0.0001432593718592301, + "loss": 0.2526, + "step": 3539 + }, + { + "epoch": 0.7164541590771099, + "grad_norm": 0.32520925998687744, + "learning_rate": 0.00014323069269168444, + "loss": 0.2662, + "step": 3540 + }, + { + "epoch": 0.7166565472576402, + "grad_norm": 0.27976465225219727, + "learning_rate": 0.00014320200915051085, + "loss": 0.2724, + "step": 3541 + }, + { + "epoch": 0.7168589354381704, + "grad_norm": 0.2563546299934387, + "learning_rate": 0.00014317332123861133, + "loss": 0.2409, + "step": 3542 + }, + { + "epoch": 0.7170613236187007, + "grad_norm": 0.2647092044353485, + "learning_rate": 0.00014314462895888817, + "loss": 0.209, + "step": 3543 + }, + { + "epoch": 0.7172637117992309, + "grad_norm": 0.26110538840293884, + "learning_rate": 0.00014311593231424415, + "loss": 0.222, + "step": 3544 + }, + { + "epoch": 0.7174660999797612, + "grad_norm": 0.35597339272499084, + "learning_rate": 0.00014308723130758254, + "loss": 0.2434, + "step": 3545 + }, + { + "epoch": 0.7176684881602914, + "grad_norm": 0.2322133183479309, + "learning_rate": 0.00014305852594180692, + "loss": 0.1974, + "step": 3546 + }, + { + "epoch": 0.7178708763408217, + "grad_norm": 0.2905055582523346, + "learning_rate": 0.0001430298162198215, + "loss": 0.207, + "step": 3547 + }, + { + "epoch": 0.718073264521352, + "grad_norm": 0.35906949639320374, + "learning_rate": 0.00014300110214453078, + "loss": 0.2312, + "step": 3548 + }, + { + "epoch": 0.7182756527018822, + "grad_norm": 0.31365084648132324, + "learning_rate": 0.00014297238371883974, + "loss": 0.1912, + "step": 3549 + }, + { + "epoch": 0.7184780408824125, + "grad_norm": 0.36804234981536865, + "learning_rate": 0.00014294366094565384, + "loss": 0.2453, + "step": 3550 + }, + { + "epoch": 0.7184780408824125, + "eval_loss": 0.2697356641292572, + "eval_runtime": 0.7383, + "eval_samples_per_second": 6.772, + "eval_steps_per_second": 1.354, + "step": 3550 + }, + { + "epoch": 0.7186804290629427, + "grad_norm": 0.32128703594207764, + "learning_rate": 0.0001429149338278789, + "loss": 0.1886, + "step": 3551 + }, + { + "epoch": 0.718882817243473, + "grad_norm": 0.332903116941452, + "learning_rate": 0.00014288620236842128, + "loss": 0.2355, + "step": 3552 + }, + { + "epoch": 0.7190852054240032, + "grad_norm": 0.283093124628067, + "learning_rate": 0.00014285746657018768, + "loss": 0.2728, + "step": 3553 + }, + { + "epoch": 0.7192875936045335, + "grad_norm": 0.24236519634723663, + "learning_rate": 0.00014282872643608534, + "loss": 0.1791, + "step": 3554 + }, + { + "epoch": 0.7194899817850637, + "grad_norm": 0.4784996211528778, + "learning_rate": 0.00014279998196902182, + "loss": 0.2489, + "step": 3555 + }, + { + "epoch": 0.719692369965594, + "grad_norm": 0.26929202675819397, + "learning_rate": 0.00014277123317190524, + "loss": 0.215, + "step": 3556 + }, + { + "epoch": 0.7198947581461242, + "grad_norm": 0.3588574528694153, + "learning_rate": 0.0001427424800476441, + "loss": 0.2819, + "step": 3557 + }, + { + "epoch": 0.7200971463266546, + "grad_norm": 0.29234176874160767, + "learning_rate": 0.00014271372259914728, + "loss": 0.2522, + "step": 3558 + }, + { + "epoch": 0.7202995345071848, + "grad_norm": 0.3432823121547699, + "learning_rate": 0.00014268496082932422, + "loss": 0.2103, + "step": 3559 + }, + { + "epoch": 0.7205019226877151, + "grad_norm": 0.3589431941509247, + "learning_rate": 0.0001426561947410847, + "loss": 0.2251, + "step": 3560 + }, + { + "epoch": 0.7207043108682453, + "grad_norm": 0.2709294855594635, + "learning_rate": 0.00014262742433733902, + "loss": 0.2039, + "step": 3561 + }, + { + "epoch": 0.7209066990487756, + "grad_norm": 0.32245975732803345, + "learning_rate": 0.0001425986496209978, + "loss": 0.2192, + "step": 3562 + }, + { + "epoch": 0.7211090872293058, + "grad_norm": 0.3126862347126007, + "learning_rate": 0.0001425698705949722, + "loss": 0.2463, + "step": 3563 + }, + { + "epoch": 0.7213114754098361, + "grad_norm": 0.28890764713287354, + "learning_rate": 0.00014254108726217374, + "loss": 0.2187, + "step": 3564 + }, + { + "epoch": 0.7215138635903663, + "grad_norm": 0.3311489522457123, + "learning_rate": 0.0001425122996255145, + "loss": 0.2877, + "step": 3565 + }, + { + "epoch": 0.7217162517708966, + "grad_norm": 0.26622334122657776, + "learning_rate": 0.00014248350768790685, + "loss": 0.2306, + "step": 3566 + }, + { + "epoch": 0.7219186399514268, + "grad_norm": 0.27433136105537415, + "learning_rate": 0.00014245471145226364, + "loss": 0.2277, + "step": 3567 + }, + { + "epoch": 0.7221210281319571, + "grad_norm": 0.2935827970504761, + "learning_rate": 0.00014242591092149823, + "loss": 0.2446, + "step": 3568 + }, + { + "epoch": 0.7223234163124873, + "grad_norm": 0.2975723445415497, + "learning_rate": 0.00014239710609852428, + "loss": 0.2466, + "step": 3569 + }, + { + "epoch": 0.7225258044930176, + "grad_norm": 0.3258729577064514, + "learning_rate": 0.000142368296986256, + "loss": 0.2191, + "step": 3570 + }, + { + "epoch": 0.7227281926735478, + "grad_norm": 0.3339604139328003, + "learning_rate": 0.00014233948358760803, + "loss": 0.1956, + "step": 3571 + }, + { + "epoch": 0.7229305808540781, + "grad_norm": 0.39879077672958374, + "learning_rate": 0.0001423106659054954, + "loss": 0.2573, + "step": 3572 + }, + { + "epoch": 0.7231329690346083, + "grad_norm": 0.31401699781417847, + "learning_rate": 0.0001422818439428335, + "loss": 0.2251, + "step": 3573 + }, + { + "epoch": 0.7233353572151386, + "grad_norm": 0.26941850781440735, + "learning_rate": 0.0001422530177025383, + "loss": 0.216, + "step": 3574 + }, + { + "epoch": 0.7235377453956688, + "grad_norm": 0.27729305624961853, + "learning_rate": 0.00014222418718752615, + "loss": 0.1955, + "step": 3575 + }, + { + "epoch": 0.7237401335761992, + "grad_norm": 0.3241328299045563, + "learning_rate": 0.00014219535240071377, + "loss": 0.2437, + "step": 3576 + }, + { + "epoch": 0.7239425217567295, + "grad_norm": 0.2551616430282593, + "learning_rate": 0.0001421665133450184, + "loss": 0.2118, + "step": 3577 + }, + { + "epoch": 0.7241449099372597, + "grad_norm": 0.27195173501968384, + "learning_rate": 0.00014213767002335765, + "loss": 0.2311, + "step": 3578 + }, + { + "epoch": 0.72434729811779, + "grad_norm": 0.29431605339050293, + "learning_rate": 0.0001421088224386496, + "loss": 0.2416, + "step": 3579 + }, + { + "epoch": 0.7245496862983202, + "grad_norm": 0.42031076550483704, + "learning_rate": 0.00014207997059381274, + "loss": 0.2509, + "step": 3580 + }, + { + "epoch": 0.7247520744788505, + "grad_norm": 0.2811602056026459, + "learning_rate": 0.00014205111449176597, + "loss": 0.2103, + "step": 3581 + }, + { + "epoch": 0.7249544626593807, + "grad_norm": 0.6383354067802429, + "learning_rate": 0.00014202225413542871, + "loss": 0.2508, + "step": 3582 + }, + { + "epoch": 0.725156850839911, + "grad_norm": 0.28183814883232117, + "learning_rate": 0.0001419933895277207, + "loss": 0.2053, + "step": 3583 + }, + { + "epoch": 0.7253592390204412, + "grad_norm": 0.3368355631828308, + "learning_rate": 0.00014196452067156216, + "loss": 0.283, + "step": 3584 + }, + { + "epoch": 0.7255616272009715, + "grad_norm": 0.24328923225402832, + "learning_rate": 0.00014193564756987374, + "loss": 0.2156, + "step": 3585 + }, + { + "epoch": 0.7257640153815017, + "grad_norm": 0.28014034032821655, + "learning_rate": 0.00014190677022557654, + "loss": 0.2245, + "step": 3586 + }, + { + "epoch": 0.725966403562032, + "grad_norm": 0.3064032793045044, + "learning_rate": 0.00014187788864159206, + "loss": 0.2699, + "step": 3587 + }, + { + "epoch": 0.7261687917425622, + "grad_norm": 0.325112909078598, + "learning_rate": 0.0001418490028208422, + "loss": 0.2325, + "step": 3588 + }, + { + "epoch": 0.7263711799230925, + "grad_norm": 0.32190853357315063, + "learning_rate": 0.00014182011276624938, + "loss": 0.2499, + "step": 3589 + }, + { + "epoch": 0.7265735681036227, + "grad_norm": 0.299363911151886, + "learning_rate": 0.00014179121848073632, + "loss": 0.2269, + "step": 3590 + }, + { + "epoch": 0.726775956284153, + "grad_norm": 0.2915518879890442, + "learning_rate": 0.00014176231996722633, + "loss": 0.2687, + "step": 3591 + }, + { + "epoch": 0.7269783444646832, + "grad_norm": 0.4215262234210968, + "learning_rate": 0.000141733417228643, + "loss": 0.2414, + "step": 3592 + }, + { + "epoch": 0.7271807326452135, + "grad_norm": 0.28340229392051697, + "learning_rate": 0.0001417045102679104, + "loss": 0.2472, + "step": 3593 + }, + { + "epoch": 0.7273831208257437, + "grad_norm": 0.2818836569786072, + "learning_rate": 0.00014167559908795306, + "loss": 0.2392, + "step": 3594 + }, + { + "epoch": 0.7275855090062741, + "grad_norm": 0.3229272663593292, + "learning_rate": 0.00014164668369169591, + "loss": 0.2371, + "step": 3595 + }, + { + "epoch": 0.7277878971868043, + "grad_norm": 0.3302474319934845, + "learning_rate": 0.0001416177640820643, + "loss": 0.2704, + "step": 3596 + }, + { + "epoch": 0.7279902853673346, + "grad_norm": 0.5139620900154114, + "learning_rate": 0.000141588840261984, + "loss": 0.1976, + "step": 3597 + }, + { + "epoch": 0.7281926735478648, + "grad_norm": 0.30498552322387695, + "learning_rate": 0.00014155991223438122, + "loss": 0.2457, + "step": 3598 + }, + { + "epoch": 0.7283950617283951, + "grad_norm": 0.35138848423957825, + "learning_rate": 0.00014153098000218263, + "loss": 0.2284, + "step": 3599 + }, + { + "epoch": 0.7285974499089253, + "grad_norm": 0.281194806098938, + "learning_rate": 0.00014150204356831524, + "loss": 0.2158, + "step": 3600 + }, + { + "epoch": 0.7285974499089253, + "eval_loss": 0.26394587755203247, + "eval_runtime": 0.7386, + "eval_samples_per_second": 6.77, + "eval_steps_per_second": 1.354, + "step": 3600 + }, + { + "epoch": 0.7287998380894556, + "grad_norm": 0.2820914685726166, + "learning_rate": 0.00014147310293570657, + "loss": 0.2492, + "step": 3601 + }, + { + "epoch": 0.7290022262699858, + "grad_norm": 0.24649406969547272, + "learning_rate": 0.00014144415810728452, + "loss": 0.2352, + "step": 3602 + }, + { + "epoch": 0.7292046144505161, + "grad_norm": 0.2653089761734009, + "learning_rate": 0.00014141520908597741, + "loss": 0.2308, + "step": 3603 + }, + { + "epoch": 0.7294070026310463, + "grad_norm": 0.25622621178627014, + "learning_rate": 0.000141386255874714, + "loss": 0.2085, + "step": 3604 + }, + { + "epoch": 0.7296093908115766, + "grad_norm": 0.2600463926792145, + "learning_rate": 0.0001413572984764235, + "loss": 0.2411, + "step": 3605 + }, + { + "epoch": 0.7298117789921068, + "grad_norm": 0.36637434363365173, + "learning_rate": 0.0001413283368940355, + "loss": 0.2619, + "step": 3606 + }, + { + "epoch": 0.7300141671726371, + "grad_norm": 0.3341212868690491, + "learning_rate": 0.00014129937113048003, + "loss": 0.2627, + "step": 3607 + }, + { + "epoch": 0.7302165553531674, + "grad_norm": 0.2946942150592804, + "learning_rate": 0.00014127040118868753, + "loss": 0.2332, + "step": 3608 + }, + { + "epoch": 0.7304189435336976, + "grad_norm": 0.33799755573272705, + "learning_rate": 0.0001412414270715889, + "loss": 0.2814, + "step": 3609 + }, + { + "epoch": 0.7306213317142279, + "grad_norm": 0.4021568298339844, + "learning_rate": 0.00014121244878211538, + "loss": 0.2477, + "step": 3610 + }, + { + "epoch": 0.7308237198947581, + "grad_norm": 0.8903542160987854, + "learning_rate": 0.00014118346632319877, + "loss": 0.2525, + "step": 3611 + }, + { + "epoch": 0.7310261080752884, + "grad_norm": 0.27222204208374023, + "learning_rate": 0.00014115447969777114, + "loss": 0.2341, + "step": 3612 + }, + { + "epoch": 0.7312284962558186, + "grad_norm": 0.47628235816955566, + "learning_rate": 0.0001411254889087651, + "loss": 0.2444, + "step": 3613 + }, + { + "epoch": 0.731430884436349, + "grad_norm": 0.290464848279953, + "learning_rate": 0.0001410964939591136, + "loss": 0.2337, + "step": 3614 + }, + { + "epoch": 0.7316332726168792, + "grad_norm": 0.3501092791557312, + "learning_rate": 0.00014106749485175008, + "loss": 0.2542, + "step": 3615 + }, + { + "epoch": 0.7318356607974095, + "grad_norm": 0.3015660047531128, + "learning_rate": 0.00014103849158960834, + "loss": 0.2356, + "step": 3616 + }, + { + "epoch": 0.7320380489779397, + "grad_norm": 0.4557827413082123, + "learning_rate": 0.00014100948417562265, + "loss": 0.2674, + "step": 3617 + }, + { + "epoch": 0.73224043715847, + "grad_norm": 0.448643296957016, + "learning_rate": 0.00014098047261272765, + "loss": 0.2324, + "step": 3618 + }, + { + "epoch": 0.7324428253390002, + "grad_norm": 0.3245983123779297, + "learning_rate": 0.00014095145690385842, + "loss": 0.2622, + "step": 3619 + }, + { + "epoch": 0.7326452135195305, + "grad_norm": 0.3534657955169678, + "learning_rate": 0.00014092243705195046, + "loss": 0.2672, + "step": 3620 + }, + { + "epoch": 0.7328476017000607, + "grad_norm": 0.5434038043022156, + "learning_rate": 0.00014089341305993975, + "loss": 0.2324, + "step": 3621 + }, + { + "epoch": 0.733049989880591, + "grad_norm": 0.32712897658348083, + "learning_rate": 0.0001408643849307626, + "loss": 0.2647, + "step": 3622 + }, + { + "epoch": 0.7332523780611212, + "grad_norm": 0.38458430767059326, + "learning_rate": 0.00014083535266735576, + "loss": 0.2628, + "step": 3623 + }, + { + "epoch": 0.7334547662416515, + "grad_norm": 0.30817049741744995, + "learning_rate": 0.0001408063162726564, + "loss": 0.2344, + "step": 3624 + }, + { + "epoch": 0.7336571544221817, + "grad_norm": 0.5152879953384399, + "learning_rate": 0.00014077727574960213, + "loss": 0.2419, + "step": 3625 + }, + { + "epoch": 0.733859542602712, + "grad_norm": 0.3348987400531769, + "learning_rate": 0.000140748231101131, + "loss": 0.2133, + "step": 3626 + }, + { + "epoch": 0.7340619307832422, + "grad_norm": 0.28777918219566345, + "learning_rate": 0.0001407191823301814, + "loss": 0.2384, + "step": 3627 + }, + { + "epoch": 0.7342643189637725, + "grad_norm": 0.26365089416503906, + "learning_rate": 0.0001406901294396922, + "loss": 0.2235, + "step": 3628 + }, + { + "epoch": 0.7344667071443027, + "grad_norm": 0.2960914671421051, + "learning_rate": 0.00014066107243260268, + "loss": 0.2433, + "step": 3629 + }, + { + "epoch": 0.734669095324833, + "grad_norm": 0.2921745479106903, + "learning_rate": 0.00014063201131185246, + "loss": 0.2398, + "step": 3630 + }, + { + "epoch": 0.7348714835053632, + "grad_norm": 0.36642351746559143, + "learning_rate": 0.0001406029460803817, + "loss": 0.2413, + "step": 3631 + }, + { + "epoch": 0.7350738716858936, + "grad_norm": 0.23936298489570618, + "learning_rate": 0.0001405738767411309, + "loss": 0.2219, + "step": 3632 + }, + { + "epoch": 0.7352762598664238, + "grad_norm": 0.26252833008766174, + "learning_rate": 0.000140544803297041, + "loss": 0.2292, + "step": 3633 + }, + { + "epoch": 0.7354786480469541, + "grad_norm": 0.3257715702056885, + "learning_rate": 0.0001405157257510533, + "loss": 0.2147, + "step": 3634 + }, + { + "epoch": 0.7356810362274843, + "grad_norm": 0.27792125940322876, + "learning_rate": 0.00014048664410610962, + "loss": 0.2064, + "step": 3635 + }, + { + "epoch": 0.7358834244080146, + "grad_norm": 0.4515530467033386, + "learning_rate": 0.0001404575583651521, + "loss": 0.2262, + "step": 3636 + }, + { + "epoch": 0.7360858125885448, + "grad_norm": 0.3586551547050476, + "learning_rate": 0.00014042846853112335, + "loss": 0.2715, + "step": 3637 + }, + { + "epoch": 0.7362882007690751, + "grad_norm": 0.2839229702949524, + "learning_rate": 0.00014039937460696636, + "loss": 0.2308, + "step": 3638 + }, + { + "epoch": 0.7364905889496054, + "grad_norm": 0.2356303334236145, + "learning_rate": 0.0001403702765956246, + "loss": 0.1926, + "step": 3639 + }, + { + "epoch": 0.7366929771301356, + "grad_norm": 0.26308801770210266, + "learning_rate": 0.0001403411745000418, + "loss": 0.2178, + "step": 3640 + }, + { + "epoch": 0.7368953653106659, + "grad_norm": 0.3480371832847595, + "learning_rate": 0.00014031206832316225, + "loss": 0.2719, + "step": 3641 + }, + { + "epoch": 0.7370977534911961, + "grad_norm": 0.28486573696136475, + "learning_rate": 0.00014028295806793064, + "loss": 0.2486, + "step": 3642 + }, + { + "epoch": 0.7373001416717264, + "grad_norm": 0.30231842398643494, + "learning_rate": 0.000140253843737292, + "loss": 0.2635, + "step": 3643 + }, + { + "epoch": 0.7375025298522566, + "grad_norm": 0.31522294878959656, + "learning_rate": 0.00014022472533419187, + "loss": 0.2559, + "step": 3644 + }, + { + "epoch": 0.7377049180327869, + "grad_norm": 0.25341594219207764, + "learning_rate": 0.00014019560286157606, + "loss": 0.2218, + "step": 3645 + }, + { + "epoch": 0.7379073062133171, + "grad_norm": 0.3260419964790344, + "learning_rate": 0.00014016647632239093, + "loss": 0.2633, + "step": 3646 + }, + { + "epoch": 0.7381096943938474, + "grad_norm": 0.3874565362930298, + "learning_rate": 0.0001401373457195832, + "loss": 0.2387, + "step": 3647 + }, + { + "epoch": 0.7383120825743776, + "grad_norm": 0.288016676902771, + "learning_rate": 0.00014010821105609996, + "loss": 0.2298, + "step": 3648 + }, + { + "epoch": 0.738514470754908, + "grad_norm": 0.2668969929218292, + "learning_rate": 0.0001400790723348888, + "loss": 0.2076, + "step": 3649 + }, + { + "epoch": 0.7387168589354381, + "grad_norm": 0.33233851194381714, + "learning_rate": 0.00014004992955889766, + "loss": 0.2395, + "step": 3650 + }, + { + "epoch": 0.7387168589354381, + "eval_loss": 0.2731722295284271, + "eval_runtime": 0.74, + "eval_samples_per_second": 6.757, + "eval_steps_per_second": 1.351, + "step": 3650 + }, + { + "epoch": 0.7389192471159685, + "grad_norm": 0.43040069937705994, + "learning_rate": 0.00014002078273107487, + "loss": 0.3069, + "step": 3651 + }, + { + "epoch": 0.7391216352964987, + "grad_norm": 0.2473803013563156, + "learning_rate": 0.0001399916318543692, + "loss": 0.2209, + "step": 3652 + }, + { + "epoch": 0.739324023477029, + "grad_norm": 0.27324000000953674, + "learning_rate": 0.00013996247693172985, + "loss": 0.2068, + "step": 3653 + }, + { + "epoch": 0.7395264116575592, + "grad_norm": 0.275651216506958, + "learning_rate": 0.00013993331796610642, + "loss": 0.2329, + "step": 3654 + }, + { + "epoch": 0.7397287998380895, + "grad_norm": 0.30674195289611816, + "learning_rate": 0.0001399041549604489, + "loss": 0.2697, + "step": 3655 + }, + { + "epoch": 0.7399311880186197, + "grad_norm": 0.29092442989349365, + "learning_rate": 0.0001398749879177077, + "loss": 0.216, + "step": 3656 + }, + { + "epoch": 0.74013357619915, + "grad_norm": 0.29733288288116455, + "learning_rate": 0.0001398458168408336, + "loss": 0.2313, + "step": 3657 + }, + { + "epoch": 0.7403359643796802, + "grad_norm": 0.2697985768318176, + "learning_rate": 0.00013981664173277783, + "loss": 0.2458, + "step": 3658 + }, + { + "epoch": 0.7405383525602105, + "grad_norm": 0.3197477161884308, + "learning_rate": 0.00013978746259649209, + "loss": 0.2327, + "step": 3659 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.3050107955932617, + "learning_rate": 0.00013975827943492835, + "loss": 0.2245, + "step": 3660 + }, + { + "epoch": 0.740943128921271, + "grad_norm": 0.30876925587654114, + "learning_rate": 0.0001397290922510391, + "loss": 0.2203, + "step": 3661 + }, + { + "epoch": 0.7411455171018012, + "grad_norm": 0.2831538915634155, + "learning_rate": 0.00013969990104777713, + "loss": 0.2244, + "step": 3662 + }, + { + "epoch": 0.7413479052823315, + "grad_norm": 0.27325353026390076, + "learning_rate": 0.00013967070582809575, + "loss": 0.2191, + "step": 3663 + }, + { + "epoch": 0.7415502934628617, + "grad_norm": 0.26076215505599976, + "learning_rate": 0.0001396415065949486, + "loss": 0.2338, + "step": 3664 + }, + { + "epoch": 0.741752681643392, + "grad_norm": 0.39337509870529175, + "learning_rate": 0.0001396123033512898, + "loss": 0.2139, + "step": 3665 + }, + { + "epoch": 0.7419550698239222, + "grad_norm": 0.4706534743309021, + "learning_rate": 0.0001395830961000738, + "loss": 0.2194, + "step": 3666 + }, + { + "epoch": 0.7421574580044525, + "grad_norm": 0.3701160252094269, + "learning_rate": 0.00013955388484425543, + "loss": 0.2305, + "step": 3667 + }, + { + "epoch": 0.7423598461849829, + "grad_norm": 0.31697842478752136, + "learning_rate": 0.00013952466958679004, + "loss": 0.2156, + "step": 3668 + }, + { + "epoch": 0.742562234365513, + "grad_norm": 0.2615698277950287, + "learning_rate": 0.0001394954503306333, + "loss": 0.2608, + "step": 3669 + }, + { + "epoch": 0.7427646225460434, + "grad_norm": 0.25155559182167053, + "learning_rate": 0.00013946622707874135, + "loss": 0.2329, + "step": 3670 + }, + { + "epoch": 0.7429670107265736, + "grad_norm": 0.3719973862171173, + "learning_rate": 0.00013943699983407062, + "loss": 0.2478, + "step": 3671 + }, + { + "epoch": 0.7431693989071039, + "grad_norm": 0.2693372964859009, + "learning_rate": 0.00013940776859957808, + "loss": 0.2092, + "step": 3672 + }, + { + "epoch": 0.7433717870876341, + "grad_norm": 0.33275237679481506, + "learning_rate": 0.00013937853337822102, + "loss": 0.2544, + "step": 3673 + }, + { + "epoch": 0.7435741752681644, + "grad_norm": 0.30477502942085266, + "learning_rate": 0.00013934929417295714, + "loss": 0.2512, + "step": 3674 + }, + { + "epoch": 0.7437765634486946, + "grad_norm": 0.3011702299118042, + "learning_rate": 0.00013932005098674457, + "loss": 0.2213, + "step": 3675 + }, + { + "epoch": 0.7439789516292249, + "grad_norm": 0.3222098648548126, + "learning_rate": 0.00013929080382254182, + "loss": 0.2227, + "step": 3676 + }, + { + "epoch": 0.7441813398097551, + "grad_norm": 0.2595398724079132, + "learning_rate": 0.0001392615526833078, + "loss": 0.2112, + "step": 3677 + }, + { + "epoch": 0.7443837279902854, + "grad_norm": 0.3210145831108093, + "learning_rate": 0.00013923229757200185, + "loss": 0.2448, + "step": 3678 + }, + { + "epoch": 0.7445861161708156, + "grad_norm": 0.4244385063648224, + "learning_rate": 0.0001392030384915837, + "loss": 0.2458, + "step": 3679 + }, + { + "epoch": 0.7447885043513459, + "grad_norm": 0.3760313391685486, + "learning_rate": 0.00013917377544501344, + "loss": 0.228, + "step": 3680 + }, + { + "epoch": 0.7449908925318761, + "grad_norm": 0.25254809856414795, + "learning_rate": 0.00013914450843525167, + "loss": 0.2259, + "step": 3681 + }, + { + "epoch": 0.7451932807124064, + "grad_norm": 0.3769349157810211, + "learning_rate": 0.00013911523746525922, + "loss": 0.2457, + "step": 3682 + }, + { + "epoch": 0.7453956688929366, + "grad_norm": 0.35421350598335266, + "learning_rate": 0.00013908596253799752, + "loss": 0.2095, + "step": 3683 + }, + { + "epoch": 0.7455980570734669, + "grad_norm": 0.33458277583122253, + "learning_rate": 0.00013905668365642827, + "loss": 0.264, + "step": 3684 + }, + { + "epoch": 0.7458004452539971, + "grad_norm": 0.31940433382987976, + "learning_rate": 0.00013902740082351355, + "loss": 0.2836, + "step": 3685 + }, + { + "epoch": 0.7460028334345274, + "grad_norm": 0.29059967398643494, + "learning_rate": 0.00013899811404221595, + "loss": 0.2041, + "step": 3686 + }, + { + "epoch": 0.7462052216150576, + "grad_norm": 0.2859112024307251, + "learning_rate": 0.00013896882331549835, + "loss": 0.2211, + "step": 3687 + }, + { + "epoch": 0.746407609795588, + "grad_norm": 0.2609440088272095, + "learning_rate": 0.0001389395286463241, + "loss": 0.2189, + "step": 3688 + }, + { + "epoch": 0.7466099979761182, + "grad_norm": 0.2980991005897522, + "learning_rate": 0.00013891023003765693, + "loss": 0.2549, + "step": 3689 + }, + { + "epoch": 0.7468123861566485, + "grad_norm": 0.28233975172042847, + "learning_rate": 0.00013888092749246098, + "loss": 0.2378, + "step": 3690 + }, + { + "epoch": 0.7470147743371787, + "grad_norm": 0.41510292887687683, + "learning_rate": 0.00013885162101370075, + "loss": 0.2489, + "step": 3691 + }, + { + "epoch": 0.747217162517709, + "grad_norm": 0.28230106830596924, + "learning_rate": 0.00013882231060434116, + "loss": 0.193, + "step": 3692 + }, + { + "epoch": 0.7474195506982392, + "grad_norm": 0.5089678764343262, + "learning_rate": 0.00013879299626734756, + "loss": 0.2595, + "step": 3693 + }, + { + "epoch": 0.7476219388787695, + "grad_norm": 0.36559078097343445, + "learning_rate": 0.00013876367800568564, + "loss": 0.2262, + "step": 3694 + }, + { + "epoch": 0.7478243270592997, + "grad_norm": 0.3150606155395508, + "learning_rate": 0.00013873435582232156, + "loss": 0.2326, + "step": 3695 + }, + { + "epoch": 0.74802671523983, + "grad_norm": 0.3053276836872101, + "learning_rate": 0.00013870502972022173, + "loss": 0.2593, + "step": 3696 + }, + { + "epoch": 0.7482291034203602, + "grad_norm": 0.3074091374874115, + "learning_rate": 0.00013867569970235316, + "loss": 0.2233, + "step": 3697 + }, + { + "epoch": 0.7484314916008905, + "grad_norm": 0.3457677364349365, + "learning_rate": 0.0001386463657716831, + "loss": 0.2811, + "step": 3698 + }, + { + "epoch": 0.7486338797814208, + "grad_norm": 0.28982600569725037, + "learning_rate": 0.00013861702793117924, + "loss": 0.2437, + "step": 3699 + }, + { + "epoch": 0.748836267961951, + "grad_norm": 0.2695489823818207, + "learning_rate": 0.00013858768618380972, + "loss": 0.2155, + "step": 3700 + }, + { + "epoch": 0.748836267961951, + "eval_loss": 0.2748314142227173, + "eval_runtime": 0.7396, + "eval_samples_per_second": 6.761, + "eval_steps_per_second": 1.352, + "step": 3700 + }, + { + "epoch": 0.7490386561424813, + "grad_norm": 0.26668769121170044, + "learning_rate": 0.00013855834053254302, + "loss": 0.2064, + "step": 3701 + }, + { + "epoch": 0.7492410443230115, + "grad_norm": 0.2459854632616043, + "learning_rate": 0.000138528990980348, + "loss": 0.2249, + "step": 3702 + }, + { + "epoch": 0.7494434325035418, + "grad_norm": 0.3013794720172882, + "learning_rate": 0.00013849963753019394, + "loss": 0.2528, + "step": 3703 + }, + { + "epoch": 0.749645820684072, + "grad_norm": 0.3634145259857178, + "learning_rate": 0.00013847028018505056, + "loss": 0.2639, + "step": 3704 + }, + { + "epoch": 0.7498482088646024, + "grad_norm": 0.2606116831302643, + "learning_rate": 0.00013844091894788787, + "loss": 0.2357, + "step": 3705 + }, + { + "epoch": 0.7500505970451325, + "grad_norm": 0.2868957817554474, + "learning_rate": 0.00013841155382167636, + "loss": 0.2351, + "step": 3706 + }, + { + "epoch": 0.7502529852256629, + "grad_norm": 0.27671927213668823, + "learning_rate": 0.0001383821848093869, + "loss": 0.2715, + "step": 3707 + }, + { + "epoch": 0.7504553734061931, + "grad_norm": 0.2719424068927765, + "learning_rate": 0.0001383528119139907, + "loss": 0.2122, + "step": 3708 + }, + { + "epoch": 0.7506577615867234, + "grad_norm": 0.305531769990921, + "learning_rate": 0.00013832343513845943, + "loss": 0.2612, + "step": 3709 + }, + { + "epoch": 0.7508601497672536, + "grad_norm": 0.27149519324302673, + "learning_rate": 0.00013829405448576512, + "loss": 0.2141, + "step": 3710 + }, + { + "epoch": 0.7510625379477839, + "grad_norm": 0.30463123321533203, + "learning_rate": 0.00013826466995888018, + "loss": 0.2458, + "step": 3711 + }, + { + "epoch": 0.7512649261283141, + "grad_norm": 0.31632566452026367, + "learning_rate": 0.00013823528156077744, + "loss": 0.2942, + "step": 3712 + }, + { + "epoch": 0.7514673143088444, + "grad_norm": 0.25641223788261414, + "learning_rate": 0.00013820588929443014, + "loss": 0.2307, + "step": 3713 + }, + { + "epoch": 0.7516697024893746, + "grad_norm": 0.3119845688343048, + "learning_rate": 0.0001381764931628118, + "loss": 0.2242, + "step": 3714 + }, + { + "epoch": 0.7518720906699049, + "grad_norm": 0.3371204435825348, + "learning_rate": 0.00013814709316889648, + "loss": 0.228, + "step": 3715 + }, + { + "epoch": 0.7520744788504351, + "grad_norm": 0.26916682720184326, + "learning_rate": 0.00013811768931565855, + "loss": 0.2293, + "step": 3716 + }, + { + "epoch": 0.7522768670309654, + "grad_norm": 0.3246425986289978, + "learning_rate": 0.00013808828160607282, + "loss": 0.2447, + "step": 3717 + }, + { + "epoch": 0.7524792552114956, + "grad_norm": 0.2535354495048523, + "learning_rate": 0.00013805887004311436, + "loss": 0.2232, + "step": 3718 + }, + { + "epoch": 0.7526816433920259, + "grad_norm": 0.2773735225200653, + "learning_rate": 0.00013802945462975882, + "loss": 0.2405, + "step": 3719 + }, + { + "epoch": 0.7528840315725561, + "grad_norm": 0.2671958804130554, + "learning_rate": 0.00013800003536898207, + "loss": 0.2619, + "step": 3720 + }, + { + "epoch": 0.7530864197530864, + "grad_norm": 0.3313668668270111, + "learning_rate": 0.00013797061226376048, + "loss": 0.204, + "step": 3721 + }, + { + "epoch": 0.7532888079336166, + "grad_norm": 0.2732089161872864, + "learning_rate": 0.00013794118531707076, + "loss": 0.1993, + "step": 3722 + }, + { + "epoch": 0.7534911961141469, + "grad_norm": 0.3743329346179962, + "learning_rate": 0.00013791175453189, + "loss": 0.2622, + "step": 3723 + }, + { + "epoch": 0.7536935842946771, + "grad_norm": 0.5092394351959229, + "learning_rate": 0.00013788231991119577, + "loss": 0.2615, + "step": 3724 + }, + { + "epoch": 0.7538959724752075, + "grad_norm": 0.2902195453643799, + "learning_rate": 0.00013785288145796586, + "loss": 0.276, + "step": 3725 + }, + { + "epoch": 0.7540983606557377, + "grad_norm": 0.35427287220954895, + "learning_rate": 0.00013782343917517856, + "loss": 0.2512, + "step": 3726 + }, + { + "epoch": 0.754300748836268, + "grad_norm": 0.239785298705101, + "learning_rate": 0.00013779399306581262, + "loss": 0.2079, + "step": 3727 + }, + { + "epoch": 0.7545031370167982, + "grad_norm": 0.2598539888858795, + "learning_rate": 0.00013776454313284706, + "loss": 0.2382, + "step": 3728 + }, + { + "epoch": 0.7547055251973285, + "grad_norm": 0.2689754068851471, + "learning_rate": 0.00013773508937926123, + "loss": 0.2359, + "step": 3729 + }, + { + "epoch": 0.7549079133778588, + "grad_norm": 0.26831790804862976, + "learning_rate": 0.00013770563180803502, + "loss": 0.2067, + "step": 3730 + }, + { + "epoch": 0.755110301558389, + "grad_norm": 0.30765849351882935, + "learning_rate": 0.00013767617042214863, + "loss": 0.2504, + "step": 3731 + }, + { + "epoch": 0.7553126897389193, + "grad_norm": 0.33630865812301636, + "learning_rate": 0.00013764670522458262, + "loss": 0.2614, + "step": 3732 + }, + { + "epoch": 0.7555150779194495, + "grad_norm": 0.279212087392807, + "learning_rate": 0.00013761723621831803, + "loss": 0.2096, + "step": 3733 + }, + { + "epoch": 0.7557174660999798, + "grad_norm": 0.3570208251476288, + "learning_rate": 0.00013758776340633616, + "loss": 0.2428, + "step": 3734 + }, + { + "epoch": 0.75591985428051, + "grad_norm": 0.28167930245399475, + "learning_rate": 0.00013755828679161883, + "loss": 0.2107, + "step": 3735 + }, + { + "epoch": 0.7561222424610403, + "grad_norm": 0.2791215181350708, + "learning_rate": 0.00013752880637714812, + "loss": 0.2581, + "step": 3736 + }, + { + "epoch": 0.7563246306415705, + "grad_norm": 0.25869086384773254, + "learning_rate": 0.00013749932216590655, + "loss": 0.2606, + "step": 3737 + }, + { + "epoch": 0.7565270188221008, + "grad_norm": 0.32635021209716797, + "learning_rate": 0.00013746983416087707, + "loss": 0.2695, + "step": 3738 + }, + { + "epoch": 0.756729407002631, + "grad_norm": 0.3721697926521301, + "learning_rate": 0.00013744034236504293, + "loss": 0.2272, + "step": 3739 + }, + { + "epoch": 0.7569317951831613, + "grad_norm": 0.35051414370536804, + "learning_rate": 0.0001374108467813878, + "loss": 0.2694, + "step": 3740 + }, + { + "epoch": 0.7571341833636915, + "grad_norm": 0.3295314610004425, + "learning_rate": 0.0001373813474128957, + "loss": 0.2536, + "step": 3741 + }, + { + "epoch": 0.7573365715442218, + "grad_norm": 0.3035524785518646, + "learning_rate": 0.00013735184426255117, + "loss": 0.2624, + "step": 3742 + }, + { + "epoch": 0.757538959724752, + "grad_norm": 0.6171966791152954, + "learning_rate": 0.00013732233733333894, + "loss": 0.2573, + "step": 3743 + }, + { + "epoch": 0.7577413479052824, + "grad_norm": 0.519873857498169, + "learning_rate": 0.00013729282662824422, + "loss": 0.2199, + "step": 3744 + }, + { + "epoch": 0.7579437360858126, + "grad_norm": 0.3408195376396179, + "learning_rate": 0.00013726331215025266, + "loss": 0.2143, + "step": 3745 + }, + { + "epoch": 0.7581461242663429, + "grad_norm": 0.37395820021629333, + "learning_rate": 0.00013723379390235014, + "loss": 0.2184, + "step": 3746 + }, + { + "epoch": 0.7583485124468731, + "grad_norm": 0.40238866209983826, + "learning_rate": 0.00013720427188752306, + "loss": 0.218, + "step": 3747 + }, + { + "epoch": 0.7585509006274034, + "grad_norm": 0.2972874641418457, + "learning_rate": 0.0001371747461087581, + "loss": 0.2184, + "step": 3748 + }, + { + "epoch": 0.7587532888079336, + "grad_norm": 0.33098655939102173, + "learning_rate": 0.00013714521656904243, + "loss": 0.2227, + "step": 3749 + }, + { + "epoch": 0.7589556769884639, + "grad_norm": 0.4189983904361725, + "learning_rate": 0.00013711568327136347, + "loss": 0.2447, + "step": 3750 + }, + { + "epoch": 0.7589556769884639, + "eval_loss": 0.27769315242767334, + "eval_runtime": 0.7403, + "eval_samples_per_second": 6.754, + "eval_steps_per_second": 1.351, + "step": 3750 + }, + { + "epoch": 0.7591580651689941, + "grad_norm": 0.24924065172672272, + "learning_rate": 0.00013708614621870917, + "loss": 0.2007, + "step": 3751 + }, + { + "epoch": 0.7593604533495244, + "grad_norm": 0.33730489015579224, + "learning_rate": 0.0001370566054140677, + "loss": 0.2599, + "step": 3752 + }, + { + "epoch": 0.7595628415300546, + "grad_norm": 0.28673139214515686, + "learning_rate": 0.00013702706086042777, + "loss": 0.223, + "step": 3753 + }, + { + "epoch": 0.7597652297105849, + "grad_norm": 0.35743647813796997, + "learning_rate": 0.0001369975125607783, + "loss": 0.2624, + "step": 3754 + }, + { + "epoch": 0.7599676178911151, + "grad_norm": 0.3028452694416046, + "learning_rate": 0.00013696796051810873, + "loss": 0.2421, + "step": 3755 + }, + { + "epoch": 0.7601700060716454, + "grad_norm": 0.3120434582233429, + "learning_rate": 0.0001369384047354088, + "loss": 0.2582, + "step": 3756 + }, + { + "epoch": 0.7603723942521756, + "grad_norm": 0.38598236441612244, + "learning_rate": 0.0001369088452156687, + "loss": 0.2957, + "step": 3757 + }, + { + "epoch": 0.7605747824327059, + "grad_norm": 0.261727899312973, + "learning_rate": 0.0001368792819618789, + "loss": 0.2294, + "step": 3758 + }, + { + "epoch": 0.7607771706132362, + "grad_norm": 0.3340347409248352, + "learning_rate": 0.00013684971497703033, + "loss": 0.2494, + "step": 3759 + }, + { + "epoch": 0.7609795587937664, + "grad_norm": 0.32166656851768494, + "learning_rate": 0.00013682014426411428, + "loss": 0.2624, + "step": 3760 + }, + { + "epoch": 0.7611819469742968, + "grad_norm": 0.3166605830192566, + "learning_rate": 0.0001367905698261224, + "loss": 0.2406, + "step": 3761 + }, + { + "epoch": 0.761384335154827, + "grad_norm": 0.34740495681762695, + "learning_rate": 0.00013676099166604665, + "loss": 0.2369, + "step": 3762 + }, + { + "epoch": 0.7615867233353573, + "grad_norm": 0.3042324185371399, + "learning_rate": 0.0001367314097868795, + "loss": 0.1994, + "step": 3763 + }, + { + "epoch": 0.7617891115158875, + "grad_norm": 0.2825720012187958, + "learning_rate": 0.00013670182419161375, + "loss": 0.2239, + "step": 3764 + }, + { + "epoch": 0.7619914996964178, + "grad_norm": 0.29228103160858154, + "learning_rate": 0.0001366722348832425, + "loss": 0.2575, + "step": 3765 + }, + { + "epoch": 0.762193887876948, + "grad_norm": 0.31010910868644714, + "learning_rate": 0.00013664264186475934, + "loss": 0.2331, + "step": 3766 + }, + { + "epoch": 0.7623962760574783, + "grad_norm": 0.2906114459037781, + "learning_rate": 0.00013661304513915817, + "loss": 0.2519, + "step": 3767 + }, + { + "epoch": 0.7625986642380085, + "grad_norm": 0.3258095979690552, + "learning_rate": 0.00013658344470943328, + "loss": 0.2487, + "step": 3768 + }, + { + "epoch": 0.7628010524185388, + "grad_norm": 0.25708824396133423, + "learning_rate": 0.0001365538405785793, + "loss": 0.2267, + "step": 3769 + }, + { + "epoch": 0.763003440599069, + "grad_norm": 0.3476763367652893, + "learning_rate": 0.00013652423274959128, + "loss": 0.2114, + "step": 3770 + }, + { + "epoch": 0.7632058287795993, + "grad_norm": 0.34453055262565613, + "learning_rate": 0.00013649462122546465, + "loss": 0.211, + "step": 3771 + }, + { + "epoch": 0.7634082169601295, + "grad_norm": 0.24794785678386688, + "learning_rate": 0.00013646500600919515, + "loss": 0.2422, + "step": 3772 + }, + { + "epoch": 0.7636106051406598, + "grad_norm": 0.2998785376548767, + "learning_rate": 0.000136435387103779, + "loss": 0.2643, + "step": 3773 + }, + { + "epoch": 0.76381299332119, + "grad_norm": 0.33762747049331665, + "learning_rate": 0.00013640576451221268, + "loss": 0.2893, + "step": 3774 + }, + { + "epoch": 0.7640153815017203, + "grad_norm": 0.2480754256248474, + "learning_rate": 0.0001363761382374931, + "loss": 0.2511, + "step": 3775 + }, + { + "epoch": 0.7642177696822505, + "grad_norm": 0.3679254651069641, + "learning_rate": 0.0001363465082826176, + "loss": 0.2672, + "step": 3776 + }, + { + "epoch": 0.7644201578627808, + "grad_norm": 0.29588577151298523, + "learning_rate": 0.00013631687465058372, + "loss": 0.2382, + "step": 3777 + }, + { + "epoch": 0.764622546043311, + "grad_norm": 0.3305865228176117, + "learning_rate": 0.00013628723734438952, + "loss": 0.2683, + "step": 3778 + }, + { + "epoch": 0.7648249342238413, + "grad_norm": 0.2598002254962921, + "learning_rate": 0.00013625759636703343, + "loss": 0.1883, + "step": 3779 + }, + { + "epoch": 0.7650273224043715, + "grad_norm": 0.27733346819877625, + "learning_rate": 0.00013622795172151417, + "loss": 0.2434, + "step": 3780 + }, + { + "epoch": 0.7652297105849019, + "grad_norm": 0.29522767663002014, + "learning_rate": 0.0001361983034108309, + "loss": 0.2324, + "step": 3781 + }, + { + "epoch": 0.7654320987654321, + "grad_norm": 0.34732332825660706, + "learning_rate": 0.0001361686514379831, + "loss": 0.249, + "step": 3782 + }, + { + "epoch": 0.7656344869459624, + "grad_norm": 0.28269949555397034, + "learning_rate": 0.00013613899580597067, + "loss": 0.2494, + "step": 3783 + }, + { + "epoch": 0.7658368751264926, + "grad_norm": 0.29983946681022644, + "learning_rate": 0.0001361093365177939, + "loss": 0.2453, + "step": 3784 + }, + { + "epoch": 0.7660392633070229, + "grad_norm": 0.26113417744636536, + "learning_rate": 0.0001360796735764533, + "loss": 0.2142, + "step": 3785 + }, + { + "epoch": 0.7662416514875531, + "grad_norm": 0.2883092761039734, + "learning_rate": 0.0001360500069849499, + "loss": 0.2115, + "step": 3786 + }, + { + "epoch": 0.7664440396680834, + "grad_norm": 0.35741910338401794, + "learning_rate": 0.00013602033674628506, + "loss": 0.2536, + "step": 3787 + }, + { + "epoch": 0.7666464278486136, + "grad_norm": 0.3507242798805237, + "learning_rate": 0.00013599066286346052, + "loss": 0.234, + "step": 3788 + }, + { + "epoch": 0.7668488160291439, + "grad_norm": 0.31215426325798035, + "learning_rate": 0.00013596098533947835, + "loss": 0.2234, + "step": 3789 + }, + { + "epoch": 0.7670512042096742, + "grad_norm": 0.2733488976955414, + "learning_rate": 0.00013593130417734103, + "loss": 0.229, + "step": 3790 + }, + { + "epoch": 0.7672535923902044, + "grad_norm": 0.3050664961338043, + "learning_rate": 0.00013590161938005136, + "loss": 0.2256, + "step": 3791 + }, + { + "epoch": 0.7674559805707347, + "grad_norm": 0.3089406490325928, + "learning_rate": 0.00013587193095061255, + "loss": 0.2541, + "step": 3792 + }, + { + "epoch": 0.7676583687512649, + "grad_norm": 0.2558094263076782, + "learning_rate": 0.00013584223889202818, + "loss": 0.2159, + "step": 3793 + }, + { + "epoch": 0.7678607569317952, + "grad_norm": 0.27690911293029785, + "learning_rate": 0.00013581254320730216, + "loss": 0.2479, + "step": 3794 + }, + { + "epoch": 0.7680631451123254, + "grad_norm": 0.28298115730285645, + "learning_rate": 0.00013578284389943884, + "loss": 0.2667, + "step": 3795 + }, + { + "epoch": 0.7682655332928557, + "grad_norm": 0.2989111542701721, + "learning_rate": 0.00013575314097144278, + "loss": 0.2413, + "step": 3796 + }, + { + "epoch": 0.7684679214733859, + "grad_norm": 0.4821934998035431, + "learning_rate": 0.00013572343442631908, + "loss": 0.2648, + "step": 3797 + }, + { + "epoch": 0.7686703096539163, + "grad_norm": 0.26796281337738037, + "learning_rate": 0.00013569372426707314, + "loss": 0.242, + "step": 3798 + }, + { + "epoch": 0.7688726978344465, + "grad_norm": 0.2872363328933716, + "learning_rate": 0.00013566401049671073, + "loss": 0.2224, + "step": 3799 + }, + { + "epoch": 0.7690750860149768, + "grad_norm": 0.2876395583152771, + "learning_rate": 0.0001356342931182379, + "loss": 0.2371, + "step": 3800 + }, + { + "epoch": 0.7690750860149768, + "eval_loss": 0.2702665328979492, + "eval_runtime": 0.7392, + "eval_samples_per_second": 6.764, + "eval_steps_per_second": 1.353, + "step": 3800 + }, + { + "epoch": 0.769277474195507, + "grad_norm": 0.4570426642894745, + "learning_rate": 0.00013560457213466123, + "loss": 0.2508, + "step": 3801 + }, + { + "epoch": 0.7694798623760373, + "grad_norm": 0.2703198492527008, + "learning_rate": 0.00013557484754898752, + "loss": 0.2449, + "step": 3802 + }, + { + "epoch": 0.7696822505565675, + "grad_norm": 0.3182757794857025, + "learning_rate": 0.00013554511936422406, + "loss": 0.221, + "step": 3803 + }, + { + "epoch": 0.7698846387370978, + "grad_norm": 0.30589956045150757, + "learning_rate": 0.00013551538758337835, + "loss": 0.2433, + "step": 3804 + }, + { + "epoch": 0.770087026917628, + "grad_norm": 0.2872970402240753, + "learning_rate": 0.00013548565220945842, + "loss": 0.2522, + "step": 3805 + }, + { + "epoch": 0.7702894150981583, + "grad_norm": 0.442940890789032, + "learning_rate": 0.00013545591324547255, + "loss": 0.2421, + "step": 3806 + }, + { + "epoch": 0.7704918032786885, + "grad_norm": 0.3310321867465973, + "learning_rate": 0.0001354261706944294, + "loss": 0.2519, + "step": 3807 + }, + { + "epoch": 0.7706941914592188, + "grad_norm": 0.3431413173675537, + "learning_rate": 0.00013539642455933802, + "loss": 0.2626, + "step": 3808 + }, + { + "epoch": 0.770896579639749, + "grad_norm": 0.31048741936683655, + "learning_rate": 0.0001353666748432078, + "loss": 0.2482, + "step": 3809 + }, + { + "epoch": 0.7710989678202793, + "grad_norm": 0.31904760003089905, + "learning_rate": 0.00013533692154904853, + "loss": 0.2378, + "step": 3810 + }, + { + "epoch": 0.7713013560008095, + "grad_norm": 0.2913106083869934, + "learning_rate": 0.00013530716467987034, + "loss": 0.2479, + "step": 3811 + }, + { + "epoch": 0.7715037441813398, + "grad_norm": 0.3222017288208008, + "learning_rate": 0.00013527740423868368, + "loss": 0.2563, + "step": 3812 + }, + { + "epoch": 0.77170613236187, + "grad_norm": 0.31831422448158264, + "learning_rate": 0.00013524764022849944, + "loss": 0.2458, + "step": 3813 + }, + { + "epoch": 0.7719085205424003, + "grad_norm": 0.29415562748908997, + "learning_rate": 0.00013521787265232877, + "loss": 0.2653, + "step": 3814 + }, + { + "epoch": 0.7721109087229305, + "grad_norm": 0.35008999705314636, + "learning_rate": 0.0001351881015131833, + "loss": 0.2528, + "step": 3815 + }, + { + "epoch": 0.7723132969034608, + "grad_norm": 0.2886951267719269, + "learning_rate": 0.00013515832681407496, + "loss": 0.237, + "step": 3816 + }, + { + "epoch": 0.772515685083991, + "grad_norm": 0.259790301322937, + "learning_rate": 0.00013512854855801605, + "loss": 0.2272, + "step": 3817 + }, + { + "epoch": 0.7727180732645214, + "grad_norm": 0.2906879782676697, + "learning_rate": 0.00013509876674801916, + "loss": 0.2251, + "step": 3818 + }, + { + "epoch": 0.7729204614450516, + "grad_norm": 0.3026115298271179, + "learning_rate": 0.00013506898138709734, + "loss": 0.2541, + "step": 3819 + }, + { + "epoch": 0.7731228496255819, + "grad_norm": 0.34342706203460693, + "learning_rate": 0.00013503919247826395, + "loss": 0.2424, + "step": 3820 + }, + { + "epoch": 0.7733252378061122, + "grad_norm": 0.317668080329895, + "learning_rate": 0.00013500940002453274, + "loss": 0.2323, + "step": 3821 + }, + { + "epoch": 0.7735276259866424, + "grad_norm": 0.2953903079032898, + "learning_rate": 0.00013497960402891778, + "loss": 0.2357, + "step": 3822 + }, + { + "epoch": 0.7737300141671727, + "grad_norm": 0.3209197223186493, + "learning_rate": 0.00013494980449443354, + "loss": 0.2586, + "step": 3823 + }, + { + "epoch": 0.7739324023477029, + "grad_norm": 0.32878735661506653, + "learning_rate": 0.00013492000142409477, + "loss": 0.2189, + "step": 3824 + }, + { + "epoch": 0.7741347905282332, + "grad_norm": 0.28302595019340515, + "learning_rate": 0.0001348901948209167, + "loss": 0.2361, + "step": 3825 + }, + { + "epoch": 0.7743371787087634, + "grad_norm": 0.311894029378891, + "learning_rate": 0.0001348603846879148, + "loss": 0.1971, + "step": 3826 + }, + { + "epoch": 0.7745395668892937, + "grad_norm": 0.29674622416496277, + "learning_rate": 0.00013483057102810494, + "loss": 0.2246, + "step": 3827 + }, + { + "epoch": 0.7747419550698239, + "grad_norm": 0.2627718150615692, + "learning_rate": 0.00013480075384450342, + "loss": 0.2092, + "step": 3828 + }, + { + "epoch": 0.7749443432503542, + "grad_norm": 0.26441818475723267, + "learning_rate": 0.00013477093314012676, + "loss": 0.2176, + "step": 3829 + }, + { + "epoch": 0.7751467314308844, + "grad_norm": 0.3652019500732422, + "learning_rate": 0.00013474110891799194, + "loss": 0.2678, + "step": 3830 + }, + { + "epoch": 0.7753491196114147, + "grad_norm": 0.30413907766342163, + "learning_rate": 0.00013471128118111624, + "loss": 0.2403, + "step": 3831 + }, + { + "epoch": 0.7755515077919449, + "grad_norm": 0.5906927585601807, + "learning_rate": 0.00013468144993251734, + "loss": 0.2491, + "step": 3832 + }, + { + "epoch": 0.7757538959724752, + "grad_norm": 0.38794979453086853, + "learning_rate": 0.00013465161517521324, + "loss": 0.2314, + "step": 3833 + }, + { + "epoch": 0.7759562841530054, + "grad_norm": 0.3258013129234314, + "learning_rate": 0.00013462177691222235, + "loss": 0.2641, + "step": 3834 + }, + { + "epoch": 0.7761586723335357, + "grad_norm": 0.2821758985519409, + "learning_rate": 0.0001345919351465633, + "loss": 0.2221, + "step": 3835 + }, + { + "epoch": 0.776361060514066, + "grad_norm": 0.27629354596138, + "learning_rate": 0.00013456208988125526, + "loss": 0.213, + "step": 3836 + }, + { + "epoch": 0.7765634486945963, + "grad_norm": 0.29150551557540894, + "learning_rate": 0.0001345322411193176, + "loss": 0.2341, + "step": 3837 + }, + { + "epoch": 0.7767658368751265, + "grad_norm": 0.31135258078575134, + "learning_rate": 0.00013450238886377014, + "loss": 0.2542, + "step": 3838 + }, + { + "epoch": 0.7769682250556568, + "grad_norm": 0.3928423225879669, + "learning_rate": 0.00013447253311763303, + "loss": 0.2261, + "step": 3839 + }, + { + "epoch": 0.777170613236187, + "grad_norm": 0.35403716564178467, + "learning_rate": 0.0001344426738839267, + "loss": 0.2361, + "step": 3840 + }, + { + "epoch": 0.7773730014167173, + "grad_norm": 0.2817465662956238, + "learning_rate": 0.00013441281116567203, + "loss": 0.2367, + "step": 3841 + }, + { + "epoch": 0.7775753895972475, + "grad_norm": 0.3627435266971588, + "learning_rate": 0.0001343829449658902, + "loss": 0.2702, + "step": 3842 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.35481467843055725, + "learning_rate": 0.00013435307528760282, + "loss": 0.2577, + "step": 3843 + }, + { + "epoch": 0.777980165958308, + "grad_norm": 0.5557239055633545, + "learning_rate": 0.00013432320213383172, + "loss": 0.2507, + "step": 3844 + }, + { + "epoch": 0.7781825541388383, + "grad_norm": 0.40870675444602966, + "learning_rate": 0.00013429332550759916, + "loss": 0.2328, + "step": 3845 + }, + { + "epoch": 0.7783849423193685, + "grad_norm": 0.37118127942085266, + "learning_rate": 0.0001342634454119278, + "loss": 0.2839, + "step": 3846 + }, + { + "epoch": 0.7785873304998988, + "grad_norm": 0.34032946825027466, + "learning_rate": 0.00013423356184984054, + "loss": 0.2595, + "step": 3847 + }, + { + "epoch": 0.778789718680429, + "grad_norm": 0.25894200801849365, + "learning_rate": 0.00013420367482436067, + "loss": 0.2174, + "step": 3848 + }, + { + "epoch": 0.7789921068609593, + "grad_norm": 0.24875733256340027, + "learning_rate": 0.00013417378433851188, + "loss": 0.2056, + "step": 3849 + }, + { + "epoch": 0.7791944950414896, + "grad_norm": 0.2893766760826111, + "learning_rate": 0.00013414389039531822, + "loss": 0.2305, + "step": 3850 + }, + { + "epoch": 0.7791944950414896, + "eval_loss": 0.2665500342845917, + "eval_runtime": 0.7438, + "eval_samples_per_second": 6.722, + "eval_steps_per_second": 1.344, + "step": 3850 + }, + { + "epoch": 0.7793968832220198, + "grad_norm": 0.3141258955001831, + "learning_rate": 0.00013411399299780396, + "loss": 0.2666, + "step": 3851 + }, + { + "epoch": 0.7795992714025501, + "grad_norm": 0.2987823188304901, + "learning_rate": 0.00013408409214899384, + "loss": 0.2598, + "step": 3852 + }, + { + "epoch": 0.7798016595830803, + "grad_norm": 0.37383440136909485, + "learning_rate": 0.00013405418785191294, + "loss": 0.2609, + "step": 3853 + }, + { + "epoch": 0.7800040477636107, + "grad_norm": 0.324432909488678, + "learning_rate": 0.0001340242801095866, + "loss": 0.2469, + "step": 3854 + }, + { + "epoch": 0.7802064359441409, + "grad_norm": 0.3773249387741089, + "learning_rate": 0.00013399436892504065, + "loss": 0.2378, + "step": 3855 + }, + { + "epoch": 0.7804088241246712, + "grad_norm": 0.37897977232933044, + "learning_rate": 0.00013396445430130115, + "loss": 0.2207, + "step": 3856 + }, + { + "epoch": 0.7806112123052014, + "grad_norm": 0.3486320972442627, + "learning_rate": 0.00013393453624139455, + "loss": 0.2591, + "step": 3857 + }, + { + "epoch": 0.7808136004857317, + "grad_norm": 0.2976657450199127, + "learning_rate": 0.00013390461474834762, + "loss": 0.2193, + "step": 3858 + }, + { + "epoch": 0.7810159886662619, + "grad_norm": 0.2994661033153534, + "learning_rate": 0.00013387468982518753, + "loss": 0.2529, + "step": 3859 + }, + { + "epoch": 0.7812183768467922, + "grad_norm": 0.3973534405231476, + "learning_rate": 0.0001338447614749418, + "loss": 0.2323, + "step": 3860 + }, + { + "epoch": 0.7814207650273224, + "grad_norm": 0.2849239706993103, + "learning_rate": 0.0001338148297006382, + "loss": 0.2434, + "step": 3861 + }, + { + "epoch": 0.7816231532078527, + "grad_norm": 0.2878144085407257, + "learning_rate": 0.00013378489450530497, + "loss": 0.2419, + "step": 3862 + }, + { + "epoch": 0.7818255413883829, + "grad_norm": 0.32753539085388184, + "learning_rate": 0.0001337549558919706, + "loss": 0.2772, + "step": 3863 + }, + { + "epoch": 0.7820279295689132, + "grad_norm": 0.2736656367778778, + "learning_rate": 0.00013372501386366397, + "loss": 0.224, + "step": 3864 + }, + { + "epoch": 0.7822303177494434, + "grad_norm": 0.29261481761932373, + "learning_rate": 0.00013369506842341431, + "loss": 0.261, + "step": 3865 + }, + { + "epoch": 0.7824327059299737, + "grad_norm": 0.3649926781654358, + "learning_rate": 0.0001336651195742512, + "loss": 0.2538, + "step": 3866 + }, + { + "epoch": 0.7826350941105039, + "grad_norm": 0.30018988251686096, + "learning_rate": 0.00013363516731920453, + "loss": 0.2475, + "step": 3867 + }, + { + "epoch": 0.7828374822910342, + "grad_norm": 0.27778393030166626, + "learning_rate": 0.00013360521166130458, + "loss": 0.259, + "step": 3868 + }, + { + "epoch": 0.7830398704715644, + "grad_norm": 0.2849215865135193, + "learning_rate": 0.0001335752526035819, + "loss": 0.23, + "step": 3869 + }, + { + "epoch": 0.7832422586520947, + "grad_norm": 0.27813011407852173, + "learning_rate": 0.00013354529014906747, + "loss": 0.2454, + "step": 3870 + }, + { + "epoch": 0.7834446468326249, + "grad_norm": 0.30883848667144775, + "learning_rate": 0.00013351532430079256, + "loss": 0.2458, + "step": 3871 + }, + { + "epoch": 0.7836470350131552, + "grad_norm": 0.2674468159675598, + "learning_rate": 0.00013348535506178884, + "loss": 0.2559, + "step": 3872 + }, + { + "epoch": 0.7838494231936854, + "grad_norm": 0.26857122778892517, + "learning_rate": 0.00013345538243508825, + "loss": 0.2429, + "step": 3873 + }, + { + "epoch": 0.7840518113742158, + "grad_norm": 0.27286744117736816, + "learning_rate": 0.0001334254064237231, + "loss": 0.219, + "step": 3874 + }, + { + "epoch": 0.784254199554746, + "grad_norm": 0.27423974871635437, + "learning_rate": 0.00013339542703072604, + "loss": 0.2557, + "step": 3875 + }, + { + "epoch": 0.7844565877352763, + "grad_norm": 0.2912842333316803, + "learning_rate": 0.00013336544425913012, + "loss": 0.2284, + "step": 3876 + }, + { + "epoch": 0.7846589759158065, + "grad_norm": 0.31245914101600647, + "learning_rate": 0.0001333354581119686, + "loss": 0.2634, + "step": 3877 + }, + { + "epoch": 0.7848613640963368, + "grad_norm": 0.2883394658565521, + "learning_rate": 0.00013330546859227524, + "loss": 0.2106, + "step": 3878 + }, + { + "epoch": 0.785063752276867, + "grad_norm": 0.2798727750778198, + "learning_rate": 0.00013327547570308402, + "loss": 0.2401, + "step": 3879 + }, + { + "epoch": 0.7852661404573973, + "grad_norm": 0.2785923182964325, + "learning_rate": 0.00013324547944742934, + "loss": 0.259, + "step": 3880 + }, + { + "epoch": 0.7854685286379276, + "grad_norm": 0.26384279131889343, + "learning_rate": 0.0001332154798283459, + "loss": 0.2411, + "step": 3881 + }, + { + "epoch": 0.7856709168184578, + "grad_norm": 0.3410887122154236, + "learning_rate": 0.00013318547684886873, + "loss": 0.2398, + "step": 3882 + }, + { + "epoch": 0.7858733049989881, + "grad_norm": 0.2712464928627014, + "learning_rate": 0.0001331554705120332, + "loss": 0.2588, + "step": 3883 + }, + { + "epoch": 0.7860756931795183, + "grad_norm": 0.3876480460166931, + "learning_rate": 0.0001331254608208751, + "loss": 0.2451, + "step": 3884 + }, + { + "epoch": 0.7862780813600486, + "grad_norm": 0.26584160327911377, + "learning_rate": 0.00013309544777843045, + "loss": 0.244, + "step": 3885 + }, + { + "epoch": 0.7864804695405788, + "grad_norm": 0.3508015275001526, + "learning_rate": 0.00013306543138773567, + "loss": 0.2187, + "step": 3886 + }, + { + "epoch": 0.7866828577211091, + "grad_norm": 0.2587803602218628, + "learning_rate": 0.00013303541165182747, + "loss": 0.2203, + "step": 3887 + }, + { + "epoch": 0.7868852459016393, + "grad_norm": 0.24990132451057434, + "learning_rate": 0.00013300538857374296, + "loss": 0.2234, + "step": 3888 + }, + { + "epoch": 0.7870876340821696, + "grad_norm": 0.2898756265640259, + "learning_rate": 0.00013297536215651956, + "loss": 0.2462, + "step": 3889 + }, + { + "epoch": 0.7872900222626998, + "grad_norm": 0.2641238868236542, + "learning_rate": 0.000132945332403195, + "loss": 0.1869, + "step": 3890 + }, + { + "epoch": 0.7874924104432302, + "grad_norm": 0.28255870938301086, + "learning_rate": 0.00013291529931680742, + "loss": 0.2374, + "step": 3891 + }, + { + "epoch": 0.7876947986237604, + "grad_norm": 0.31957346200942993, + "learning_rate": 0.00013288526290039523, + "loss": 0.2374, + "step": 3892 + }, + { + "epoch": 0.7878971868042907, + "grad_norm": 0.30044397711753845, + "learning_rate": 0.0001328552231569972, + "loss": 0.2218, + "step": 3893 + }, + { + "epoch": 0.7880995749848209, + "grad_norm": 0.3349805176258087, + "learning_rate": 0.00013282518008965244, + "loss": 0.2455, + "step": 3894 + }, + { + "epoch": 0.7883019631653512, + "grad_norm": 0.29345056414604187, + "learning_rate": 0.0001327951337014004, + "loss": 0.2344, + "step": 3895 + }, + { + "epoch": 0.7885043513458814, + "grad_norm": 0.2971920669078827, + "learning_rate": 0.00013276508399528083, + "loss": 0.2442, + "step": 3896 + }, + { + "epoch": 0.7887067395264117, + "grad_norm": 0.3356497585773468, + "learning_rate": 0.00013273503097433387, + "loss": 0.2679, + "step": 3897 + }, + { + "epoch": 0.7889091277069419, + "grad_norm": 0.3730000853538513, + "learning_rate": 0.00013270497464159994, + "loss": 0.2319, + "step": 3898 + }, + { + "epoch": 0.7891115158874722, + "grad_norm": 0.2450721710920334, + "learning_rate": 0.00013267491500011986, + "loss": 0.2304, + "step": 3899 + }, + { + "epoch": 0.7893139040680024, + "grad_norm": 0.32902464270591736, + "learning_rate": 0.00013264485205293473, + "loss": 0.2454, + "step": 3900 + }, + { + "epoch": 0.7893139040680024, + "eval_loss": 0.2717682421207428, + "eval_runtime": 0.7399, + "eval_samples_per_second": 6.758, + "eval_steps_per_second": 1.352, + "step": 3900 + }, + { + "epoch": 0.7895162922485327, + "grad_norm": 0.2387569099664688, + "learning_rate": 0.000132614785803086, + "loss": 0.191, + "step": 3901 + }, + { + "epoch": 0.7897186804290629, + "grad_norm": 0.28501030802726746, + "learning_rate": 0.00013258471625361552, + "loss": 0.2262, + "step": 3902 + }, + { + "epoch": 0.7899210686095932, + "grad_norm": 0.673722505569458, + "learning_rate": 0.0001325546434075653, + "loss": 0.2345, + "step": 3903 + }, + { + "epoch": 0.7901234567901234, + "grad_norm": 0.3353344798088074, + "learning_rate": 0.00013252456726797786, + "loss": 0.2099, + "step": 3904 + }, + { + "epoch": 0.7903258449706537, + "grad_norm": 0.34284526109695435, + "learning_rate": 0.00013249448783789598, + "loss": 0.2502, + "step": 3905 + }, + { + "epoch": 0.7905282331511839, + "grad_norm": 0.33169737458229065, + "learning_rate": 0.0001324644051203628, + "loss": 0.2368, + "step": 3906 + }, + { + "epoch": 0.7907306213317142, + "grad_norm": 0.30647405982017517, + "learning_rate": 0.00013243431911842175, + "loss": 0.2523, + "step": 3907 + }, + { + "epoch": 0.7909330095122444, + "grad_norm": 0.29891833662986755, + "learning_rate": 0.0001324042298351166, + "loss": 0.2269, + "step": 3908 + }, + { + "epoch": 0.7911353976927747, + "grad_norm": 0.29664790630340576, + "learning_rate": 0.0001323741372734915, + "loss": 0.249, + "step": 3909 + }, + { + "epoch": 0.7913377858733049, + "grad_norm": 0.253517210483551, + "learning_rate": 0.0001323440414365909, + "loss": 0.1896, + "step": 3910 + }, + { + "epoch": 0.7915401740538353, + "grad_norm": 0.3175423741340637, + "learning_rate": 0.00013231394232745959, + "loss": 0.2201, + "step": 3911 + }, + { + "epoch": 0.7917425622343656, + "grad_norm": 0.2625952363014221, + "learning_rate": 0.0001322838399491426, + "loss": 0.232, + "step": 3912 + }, + { + "epoch": 0.7919449504148958, + "grad_norm": 0.2627769708633423, + "learning_rate": 0.00013225373430468545, + "loss": 0.2129, + "step": 3913 + }, + { + "epoch": 0.7921473385954261, + "grad_norm": 0.3566378653049469, + "learning_rate": 0.00013222362539713393, + "loss": 0.2348, + "step": 3914 + }, + { + "epoch": 0.7923497267759563, + "grad_norm": 0.2556769549846649, + "learning_rate": 0.0001321935132295341, + "loss": 0.1982, + "step": 3915 + }, + { + "epoch": 0.7925521149564866, + "grad_norm": 0.30709758400917053, + "learning_rate": 0.00013216339780493242, + "loss": 0.2415, + "step": 3916 + }, + { + "epoch": 0.7927545031370168, + "grad_norm": 0.32757511734962463, + "learning_rate": 0.00013213327912637562, + "loss": 0.2345, + "step": 3917 + }, + { + "epoch": 0.7929568913175471, + "grad_norm": 0.25708115100860596, + "learning_rate": 0.0001321031571969108, + "loss": 0.2151, + "step": 3918 + }, + { + "epoch": 0.7931592794980773, + "grad_norm": 0.2545197010040283, + "learning_rate": 0.0001320730320195854, + "loss": 0.2267, + "step": 3919 + }, + { + "epoch": 0.7933616676786076, + "grad_norm": 0.3093016743659973, + "learning_rate": 0.00013204290359744716, + "loss": 0.241, + "step": 3920 + }, + { + "epoch": 0.7935640558591378, + "grad_norm": 0.3456737697124481, + "learning_rate": 0.00013201277193354414, + "loss": 0.2537, + "step": 3921 + }, + { + "epoch": 0.7937664440396681, + "grad_norm": 0.29381608963012695, + "learning_rate": 0.00013198263703092478, + "loss": 0.2613, + "step": 3922 + }, + { + "epoch": 0.7939688322201983, + "grad_norm": 0.4102049171924591, + "learning_rate": 0.0001319524988926378, + "loss": 0.2701, + "step": 3923 + }, + { + "epoch": 0.7941712204007286, + "grad_norm": 0.28350502252578735, + "learning_rate": 0.00013192235752173222, + "loss": 0.245, + "step": 3924 + }, + { + "epoch": 0.7943736085812588, + "grad_norm": 0.3443673253059387, + "learning_rate": 0.0001318922129212575, + "loss": 0.2672, + "step": 3925 + }, + { + "epoch": 0.7945759967617891, + "grad_norm": 0.3078950345516205, + "learning_rate": 0.0001318620650942633, + "loss": 0.2396, + "step": 3926 + }, + { + "epoch": 0.7947783849423193, + "grad_norm": 0.33737850189208984, + "learning_rate": 0.0001318319140437997, + "loss": 0.2675, + "step": 3927 + }, + { + "epoch": 0.7949807731228496, + "grad_norm": 0.2881615161895752, + "learning_rate": 0.000131801759772917, + "loss": 0.2233, + "step": 3928 + }, + { + "epoch": 0.7951831613033798, + "grad_norm": 0.28353455662727356, + "learning_rate": 0.00013177160228466597, + "loss": 0.2557, + "step": 3929 + }, + { + "epoch": 0.7953855494839102, + "grad_norm": 0.40621811151504517, + "learning_rate": 0.0001317414415820976, + "loss": 0.2427, + "step": 3930 + }, + { + "epoch": 0.7955879376644404, + "grad_norm": 0.2879510223865509, + "learning_rate": 0.00013171127766826323, + "loss": 0.2544, + "step": 3931 + }, + { + "epoch": 0.7957903258449707, + "grad_norm": 0.2822043001651764, + "learning_rate": 0.00013168111054621452, + "loss": 0.2318, + "step": 3932 + }, + { + "epoch": 0.7959927140255009, + "grad_norm": 0.28354987502098083, + "learning_rate": 0.00013165094021900346, + "loss": 0.2519, + "step": 3933 + }, + { + "epoch": 0.7961951022060312, + "grad_norm": 0.3343326151371002, + "learning_rate": 0.0001316207666896824, + "loss": 0.2214, + "step": 3934 + }, + { + "epoch": 0.7963974903865614, + "grad_norm": 0.29361197352409363, + "learning_rate": 0.00013159058996130396, + "loss": 0.2384, + "step": 3935 + }, + { + "epoch": 0.7965998785670917, + "grad_norm": 0.2821219563484192, + "learning_rate": 0.00013156041003692108, + "loss": 0.2219, + "step": 3936 + }, + { + "epoch": 0.7968022667476219, + "grad_norm": 0.3084186315536499, + "learning_rate": 0.0001315302269195871, + "loss": 0.2611, + "step": 3937 + }, + { + "epoch": 0.7970046549281522, + "grad_norm": 0.2837059795856476, + "learning_rate": 0.00013150004061235557, + "loss": 0.2157, + "step": 3938 + }, + { + "epoch": 0.7972070431086824, + "grad_norm": 0.32660189270973206, + "learning_rate": 0.0001314698511182805, + "loss": 0.2524, + "step": 3939 + }, + { + "epoch": 0.7974094312892127, + "grad_norm": 0.27194464206695557, + "learning_rate": 0.00013143965844041608, + "loss": 0.2147, + "step": 3940 + }, + { + "epoch": 0.797611819469743, + "grad_norm": 0.26827967166900635, + "learning_rate": 0.00013140946258181693, + "loss": 0.2114, + "step": 3941 + }, + { + "epoch": 0.7978142076502732, + "grad_norm": 0.4834533631801605, + "learning_rate": 0.0001313792635455379, + "loss": 0.2276, + "step": 3942 + }, + { + "epoch": 0.7980165958308035, + "grad_norm": 0.2604771852493286, + "learning_rate": 0.00013134906133463424, + "loss": 0.2364, + "step": 3943 + }, + { + "epoch": 0.7982189840113337, + "grad_norm": 0.2622084319591522, + "learning_rate": 0.0001313188559521615, + "loss": 0.2335, + "step": 3944 + }, + { + "epoch": 0.798421372191864, + "grad_norm": 0.3305802047252655, + "learning_rate": 0.00013128864740117558, + "loss": 0.2401, + "step": 3945 + }, + { + "epoch": 0.7986237603723942, + "grad_norm": 0.2993631064891815, + "learning_rate": 0.0001312584356847326, + "loss": 0.2432, + "step": 3946 + }, + { + "epoch": 0.7988261485529246, + "grad_norm": 0.26266202330589294, + "learning_rate": 0.0001312282208058891, + "loss": 0.2166, + "step": 3947 + }, + { + "epoch": 0.7990285367334548, + "grad_norm": 0.26602888107299805, + "learning_rate": 0.00013119800276770188, + "loss": 0.2462, + "step": 3948 + }, + { + "epoch": 0.7992309249139851, + "grad_norm": 0.4510941803455353, + "learning_rate": 0.00013116778157322805, + "loss": 0.2508, + "step": 3949 + }, + { + "epoch": 0.7994333130945153, + "grad_norm": 0.26324382424354553, + "learning_rate": 0.0001311375572255252, + "loss": 0.1988, + "step": 3950 + }, + { + "epoch": 0.7994333130945153, + "eval_loss": 0.26543259620666504, + "eval_runtime": 0.7381, + "eval_samples_per_second": 6.774, + "eval_steps_per_second": 1.355, + "step": 3950 + }, + { + "epoch": 0.7996357012750456, + "grad_norm": 0.560707688331604, + "learning_rate": 0.00013110732972765102, + "loss": 0.2846, + "step": 3951 + }, + { + "epoch": 0.7998380894555758, + "grad_norm": 0.33031123876571655, + "learning_rate": 0.00013107709908266357, + "loss": 0.2273, + "step": 3952 + }, + { + "epoch": 0.8000404776361061, + "grad_norm": 0.34044280648231506, + "learning_rate": 0.00013104686529362137, + "loss": 0.2757, + "step": 3953 + }, + { + "epoch": 0.8002428658166363, + "grad_norm": 0.30630120635032654, + "learning_rate": 0.00013101662836358308, + "loss": 0.1898, + "step": 3954 + }, + { + "epoch": 0.8004452539971666, + "grad_norm": 0.3719131648540497, + "learning_rate": 0.00013098638829560778, + "loss": 0.2554, + "step": 3955 + }, + { + "epoch": 0.8006476421776968, + "grad_norm": 0.3335683047771454, + "learning_rate": 0.00013095614509275487, + "loss": 0.249, + "step": 3956 + }, + { + "epoch": 0.8008500303582271, + "grad_norm": 0.37623754143714905, + "learning_rate": 0.00013092589875808404, + "loss": 0.2631, + "step": 3957 + }, + { + "epoch": 0.8010524185387573, + "grad_norm": 0.2632478177547455, + "learning_rate": 0.00013089564929465522, + "loss": 0.2345, + "step": 3958 + }, + { + "epoch": 0.8012548067192876, + "grad_norm": 0.36044973134994507, + "learning_rate": 0.00013086539670552883, + "loss": 0.2686, + "step": 3959 + }, + { + "epoch": 0.8014571948998178, + "grad_norm": 0.32119905948638916, + "learning_rate": 0.00013083514099376545, + "loss": 0.2469, + "step": 3960 + }, + { + "epoch": 0.8016595830803481, + "grad_norm": 0.2759612500667572, + "learning_rate": 0.00013080488216242608, + "loss": 0.2274, + "step": 3961 + }, + { + "epoch": 0.8018619712608783, + "grad_norm": 0.3435122072696686, + "learning_rate": 0.00013077462021457195, + "loss": 0.247, + "step": 3962 + }, + { + "epoch": 0.8020643594414086, + "grad_norm": 0.29719454050064087, + "learning_rate": 0.00013074435515326467, + "loss": 0.2415, + "step": 3963 + }, + { + "epoch": 0.8022667476219388, + "grad_norm": 0.3413975238800049, + "learning_rate": 0.00013071408698156614, + "loss": 0.2576, + "step": 3964 + }, + { + "epoch": 0.8024691358024691, + "grad_norm": 0.33543604612350464, + "learning_rate": 0.00013068381570253856, + "loss": 0.2259, + "step": 3965 + }, + { + "epoch": 0.8026715239829993, + "grad_norm": 0.2922574579715729, + "learning_rate": 0.00013065354131924445, + "loss": 0.246, + "step": 3966 + }, + { + "epoch": 0.8028739121635297, + "grad_norm": 0.26383689045906067, + "learning_rate": 0.00013062326383474668, + "loss": 0.2267, + "step": 3967 + }, + { + "epoch": 0.8030763003440599, + "grad_norm": 0.3454423248767853, + "learning_rate": 0.0001305929832521084, + "loss": 0.2488, + "step": 3968 + }, + { + "epoch": 0.8032786885245902, + "grad_norm": 0.32194650173187256, + "learning_rate": 0.0001305626995743931, + "loss": 0.2327, + "step": 3969 + }, + { + "epoch": 0.8034810767051204, + "grad_norm": 0.40299713611602783, + "learning_rate": 0.00013053241280466452, + "loss": 0.2356, + "step": 3970 + }, + { + "epoch": 0.8036834648856507, + "grad_norm": 0.29903537034988403, + "learning_rate": 0.0001305021229459868, + "loss": 0.248, + "step": 3971 + }, + { + "epoch": 0.803885853066181, + "grad_norm": 0.43103647232055664, + "learning_rate": 0.00013047183000142437, + "loss": 0.2178, + "step": 3972 + }, + { + "epoch": 0.8040882412467112, + "grad_norm": 0.40093111991882324, + "learning_rate": 0.0001304415339740419, + "loss": 0.2566, + "step": 3973 + }, + { + "epoch": 0.8042906294272415, + "grad_norm": 0.287648469209671, + "learning_rate": 0.00013041123486690442, + "loss": 0.2448, + "step": 3974 + }, + { + "epoch": 0.8044930176077717, + "grad_norm": 0.2900941073894501, + "learning_rate": 0.0001303809326830773, + "loss": 0.2246, + "step": 3975 + }, + { + "epoch": 0.804695405788302, + "grad_norm": 0.31337985396385193, + "learning_rate": 0.00013035062742562618, + "loss": 0.2169, + "step": 3976 + }, + { + "epoch": 0.8048977939688322, + "grad_norm": 0.27474328875541687, + "learning_rate": 0.00013032031909761705, + "loss": 0.2235, + "step": 3977 + }, + { + "epoch": 0.8051001821493625, + "grad_norm": 0.30365967750549316, + "learning_rate": 0.0001302900077021162, + "loss": 0.2536, + "step": 3978 + }, + { + "epoch": 0.8053025703298927, + "grad_norm": 0.3188045024871826, + "learning_rate": 0.00013025969324219022, + "loss": 0.2359, + "step": 3979 + }, + { + "epoch": 0.805504958510423, + "grad_norm": 0.3143835961818695, + "learning_rate": 0.00013022937572090596, + "loss": 0.2544, + "step": 3980 + }, + { + "epoch": 0.8057073466909532, + "grad_norm": 0.2658599317073822, + "learning_rate": 0.00013019905514133063, + "loss": 0.2183, + "step": 3981 + }, + { + "epoch": 0.8059097348714835, + "grad_norm": 0.32992562651634216, + "learning_rate": 0.0001301687315065318, + "loss": 0.2483, + "step": 3982 + }, + { + "epoch": 0.8061121230520137, + "grad_norm": 0.30123093724250793, + "learning_rate": 0.0001301384048195773, + "loss": 0.219, + "step": 3983 + }, + { + "epoch": 0.806314511232544, + "grad_norm": 0.25195813179016113, + "learning_rate": 0.0001301080750835352, + "loss": 0.2223, + "step": 3984 + }, + { + "epoch": 0.8065168994130743, + "grad_norm": 0.2840067148208618, + "learning_rate": 0.000130077742301474, + "loss": 0.2412, + "step": 3985 + }, + { + "epoch": 0.8067192875936046, + "grad_norm": 0.26293548941612244, + "learning_rate": 0.00013004740647646246, + "loss": 0.2125, + "step": 3986 + }, + { + "epoch": 0.8069216757741348, + "grad_norm": 0.2873854339122772, + "learning_rate": 0.00013001706761156957, + "loss": 0.2239, + "step": 3987 + }, + { + "epoch": 0.8071240639546651, + "grad_norm": 0.3114651143550873, + "learning_rate": 0.00012998672570986477, + "loss": 0.2515, + "step": 3988 + }, + { + "epoch": 0.8073264521351953, + "grad_norm": 0.328300803899765, + "learning_rate": 0.00012995638077441772, + "loss": 0.2393, + "step": 3989 + }, + { + "epoch": 0.8075288403157256, + "grad_norm": 0.2859802842140198, + "learning_rate": 0.00012992603280829838, + "loss": 0.2563, + "step": 3990 + }, + { + "epoch": 0.8077312284962558, + "grad_norm": 0.26750272512435913, + "learning_rate": 0.00012989568181457704, + "loss": 0.2377, + "step": 3991 + }, + { + "epoch": 0.8079336166767861, + "grad_norm": 0.5690582394599915, + "learning_rate": 0.00012986532779632432, + "loss": 0.2893, + "step": 3992 + }, + { + "epoch": 0.8081360048573163, + "grad_norm": 0.26616957783699036, + "learning_rate": 0.00012983497075661111, + "loss": 0.2584, + "step": 3993 + }, + { + "epoch": 0.8083383930378466, + "grad_norm": 0.40735387802124023, + "learning_rate": 0.0001298046106985086, + "loss": 0.2282, + "step": 3994 + }, + { + "epoch": 0.8085407812183768, + "grad_norm": 0.26333582401275635, + "learning_rate": 0.00012977424762508833, + "loss": 0.2144, + "step": 3995 + }, + { + "epoch": 0.8087431693989071, + "grad_norm": 0.35248100757598877, + "learning_rate": 0.00012974388153942212, + "loss": 0.2685, + "step": 3996 + }, + { + "epoch": 0.8089455575794373, + "grad_norm": 0.3542243540287018, + "learning_rate": 0.00012971351244458202, + "loss": 0.2355, + "step": 3997 + }, + { + "epoch": 0.8091479457599676, + "grad_norm": 0.26675522327423096, + "learning_rate": 0.00012968314034364056, + "loss": 0.2498, + "step": 3998 + }, + { + "epoch": 0.8093503339404978, + "grad_norm": 0.33968645334243774, + "learning_rate": 0.00012965276523967042, + "loss": 0.2583, + "step": 3999 + }, + { + "epoch": 0.8095527221210281, + "grad_norm": 0.2958551049232483, + "learning_rate": 0.0001296223871357446, + "loss": 0.2385, + "step": 4000 + }, + { + "epoch": 0.8095527221210281, + "eval_loss": 0.26493722200393677, + "eval_runtime": 0.7357, + "eval_samples_per_second": 6.797, + "eval_steps_per_second": 1.359, + "step": 4000 + }, + { + "epoch": 0.8097551103015583, + "grad_norm": 0.2750597298145294, + "learning_rate": 0.00012959200603493648, + "loss": 0.2214, + "step": 4001 + }, + { + "epoch": 0.8099574984820886, + "grad_norm": 0.3002444803714752, + "learning_rate": 0.0001295616219403197, + "loss": 0.2292, + "step": 4002 + }, + { + "epoch": 0.810159886662619, + "grad_norm": 0.2769399583339691, + "learning_rate": 0.00012953123485496824, + "loss": 0.2599, + "step": 4003 + }, + { + "epoch": 0.8103622748431492, + "grad_norm": 0.2609650492668152, + "learning_rate": 0.00012950084478195625, + "loss": 0.2499, + "step": 4004 + }, + { + "epoch": 0.8105646630236795, + "grad_norm": 0.2414253205060959, + "learning_rate": 0.00012947045172435838, + "loss": 0.225, + "step": 4005 + }, + { + "epoch": 0.8107670512042097, + "grad_norm": 0.3793722093105316, + "learning_rate": 0.0001294400556852494, + "loss": 0.2173, + "step": 4006 + }, + { + "epoch": 0.81096943938474, + "grad_norm": 0.29010626673698425, + "learning_rate": 0.00012940965666770451, + "loss": 0.2417, + "step": 4007 + }, + { + "epoch": 0.8111718275652702, + "grad_norm": 0.2701972723007202, + "learning_rate": 0.00012937925467479912, + "loss": 0.2269, + "step": 4008 + }, + { + "epoch": 0.8113742157458005, + "grad_norm": 0.3228740990161896, + "learning_rate": 0.00012934884970960907, + "loss": 0.2531, + "step": 4009 + }, + { + "epoch": 0.8115766039263307, + "grad_norm": 0.3706228733062744, + "learning_rate": 0.0001293184417752103, + "loss": 0.236, + "step": 4010 + }, + { + "epoch": 0.811778992106861, + "grad_norm": 0.4635535776615143, + "learning_rate": 0.00012928803087467928, + "loss": 0.2882, + "step": 4011 + }, + { + "epoch": 0.8119813802873912, + "grad_norm": 0.32409602403640747, + "learning_rate": 0.00012925761701109258, + "loss": 0.2508, + "step": 4012 + }, + { + "epoch": 0.8121837684679215, + "grad_norm": 0.3405529856681824, + "learning_rate": 0.00012922720018752721, + "loss": 0.2153, + "step": 4013 + }, + { + "epoch": 0.8123861566484517, + "grad_norm": 0.3170727789402008, + "learning_rate": 0.0001291967804070604, + "loss": 0.2567, + "step": 4014 + }, + { + "epoch": 0.812588544828982, + "grad_norm": 0.2811383605003357, + "learning_rate": 0.0001291663576727697, + "loss": 0.2313, + "step": 4015 + }, + { + "epoch": 0.8127909330095122, + "grad_norm": 0.264926552772522, + "learning_rate": 0.00012913593198773295, + "loss": 0.238, + "step": 4016 + }, + { + "epoch": 0.8129933211900425, + "grad_norm": 0.2697588801383972, + "learning_rate": 0.00012910550335502836, + "loss": 0.2191, + "step": 4017 + }, + { + "epoch": 0.8131957093705727, + "grad_norm": 0.27033311128616333, + "learning_rate": 0.0001290750717777343, + "loss": 0.2307, + "step": 4018 + }, + { + "epoch": 0.813398097551103, + "grad_norm": 0.26735609769821167, + "learning_rate": 0.00012904463725892958, + "loss": 0.2433, + "step": 4019 + }, + { + "epoch": 0.8136004857316332, + "grad_norm": 0.3760308027267456, + "learning_rate": 0.00012901419980169322, + "loss": 0.2575, + "step": 4020 + }, + { + "epoch": 0.8138028739121635, + "grad_norm": 0.27508193254470825, + "learning_rate": 0.00012898375940910458, + "loss": 0.2081, + "step": 4021 + }, + { + "epoch": 0.8140052620926937, + "grad_norm": 0.2871641516685486, + "learning_rate": 0.0001289533160842433, + "loss": 0.2684, + "step": 4022 + }, + { + "epoch": 0.8142076502732241, + "grad_norm": 0.2664308547973633, + "learning_rate": 0.00012892286983018925, + "loss": 0.2353, + "step": 4023 + }, + { + "epoch": 0.8144100384537543, + "grad_norm": 0.3087598979473114, + "learning_rate": 0.00012889242065002273, + "loss": 0.2462, + "step": 4024 + }, + { + "epoch": 0.8146124266342846, + "grad_norm": 0.2692663073539734, + "learning_rate": 0.00012886196854682428, + "loss": 0.2344, + "step": 4025 + }, + { + "epoch": 0.8148148148148148, + "grad_norm": 0.29703715443611145, + "learning_rate": 0.0001288315135236747, + "loss": 0.2446, + "step": 4026 + }, + { + "epoch": 0.8150172029953451, + "grad_norm": 0.32801687717437744, + "learning_rate": 0.00012880105558365509, + "loss": 0.2691, + "step": 4027 + }, + { + "epoch": 0.8152195911758753, + "grad_norm": 0.2968493103981018, + "learning_rate": 0.0001287705947298469, + "loss": 0.2384, + "step": 4028 + }, + { + "epoch": 0.8154219793564056, + "grad_norm": 0.3002225160598755, + "learning_rate": 0.00012874013096533178, + "loss": 0.2361, + "step": 4029 + }, + { + "epoch": 0.8156243675369358, + "grad_norm": 0.3209449052810669, + "learning_rate": 0.0001287096642931918, + "loss": 0.2536, + "step": 4030 + }, + { + "epoch": 0.8158267557174661, + "grad_norm": 0.2750207781791687, + "learning_rate": 0.00012867919471650925, + "loss": 0.2123, + "step": 4031 + }, + { + "epoch": 0.8160291438979964, + "grad_norm": 0.46219751238822937, + "learning_rate": 0.00012864872223836667, + "loss": 0.2511, + "step": 4032 + }, + { + "epoch": 0.8162315320785266, + "grad_norm": 0.2897518575191498, + "learning_rate": 0.00012861824686184698, + "loss": 0.2251, + "step": 4033 + }, + { + "epoch": 0.8164339202590569, + "grad_norm": 0.23715439438819885, + "learning_rate": 0.00012858776859003338, + "loss": 0.2265, + "step": 4034 + }, + { + "epoch": 0.8166363084395871, + "grad_norm": 0.27081891894340515, + "learning_rate": 0.00012855728742600935, + "loss": 0.2126, + "step": 4035 + }, + { + "epoch": 0.8168386966201174, + "grad_norm": 0.2993212640285492, + "learning_rate": 0.0001285268033728586, + "loss": 0.2701, + "step": 4036 + }, + { + "epoch": 0.8170410848006476, + "grad_norm": 0.2464466542005539, + "learning_rate": 0.0001284963164336652, + "loss": 0.2014, + "step": 4037 + }, + { + "epoch": 0.8172434729811779, + "grad_norm": 0.27498987317085266, + "learning_rate": 0.00012846582661151353, + "loss": 0.247, + "step": 4038 + }, + { + "epoch": 0.8174458611617081, + "grad_norm": 0.31696516275405884, + "learning_rate": 0.0001284353339094882, + "loss": 0.2369, + "step": 4039 + }, + { + "epoch": 0.8176482493422385, + "grad_norm": 0.2576943635940552, + "learning_rate": 0.00012840483833067418, + "loss": 0.2366, + "step": 4040 + }, + { + "epoch": 0.8178506375227687, + "grad_norm": 0.4740954339504242, + "learning_rate": 0.00012837433987815663, + "loss": 0.2481, + "step": 4041 + }, + { + "epoch": 0.818053025703299, + "grad_norm": 0.27298733592033386, + "learning_rate": 0.00012834383855502113, + "loss": 0.2112, + "step": 4042 + }, + { + "epoch": 0.8182554138838292, + "grad_norm": 0.2754977345466614, + "learning_rate": 0.00012831333436435344, + "loss": 0.2545, + "step": 4043 + }, + { + "epoch": 0.8184578020643595, + "grad_norm": 0.379720538854599, + "learning_rate": 0.00012828282730923966, + "loss": 0.281, + "step": 4044 + }, + { + "epoch": 0.8186601902448897, + "grad_norm": 0.28790631890296936, + "learning_rate": 0.0001282523173927662, + "loss": 0.2221, + "step": 4045 + }, + { + "epoch": 0.81886257842542, + "grad_norm": 0.2704559862613678, + "learning_rate": 0.0001282218046180197, + "loss": 0.2507, + "step": 4046 + }, + { + "epoch": 0.8190649666059502, + "grad_norm": 0.29410356283187866, + "learning_rate": 0.00012819128898808714, + "loss": 0.2122, + "step": 4047 + }, + { + "epoch": 0.8192673547864805, + "grad_norm": 0.29373201727867126, + "learning_rate": 0.00012816077050605576, + "loss": 0.2455, + "step": 4048 + }, + { + "epoch": 0.8194697429670107, + "grad_norm": 0.27533572912216187, + "learning_rate": 0.0001281302491750131, + "loss": 0.209, + "step": 4049 + }, + { + "epoch": 0.819672131147541, + "grad_norm": 0.2924385070800781, + "learning_rate": 0.00012809972499804704, + "loss": 0.197, + "step": 4050 + }, + { + "epoch": 0.819672131147541, + "eval_loss": 0.27634698152542114, + "eval_runtime": 0.7401, + "eval_samples_per_second": 6.756, + "eval_steps_per_second": 1.351, + "step": 4050 + }, + { + "epoch": 0.8198745193280712, + "grad_norm": 0.5106258988380432, + "learning_rate": 0.00012806919797824564, + "loss": 0.2496, + "step": 4051 + }, + { + "epoch": 0.8200769075086015, + "grad_norm": 0.31579113006591797, + "learning_rate": 0.0001280386681186973, + "loss": 0.24, + "step": 4052 + }, + { + "epoch": 0.8202792956891317, + "grad_norm": 0.2940889000892639, + "learning_rate": 0.00012800813542249072, + "loss": 0.2201, + "step": 4053 + }, + { + "epoch": 0.820481683869662, + "grad_norm": 0.325055330991745, + "learning_rate": 0.0001279775998927149, + "loss": 0.2579, + "step": 4054 + }, + { + "epoch": 0.8206840720501922, + "grad_norm": 0.2981805205345154, + "learning_rate": 0.00012794706153245906, + "loss": 0.2451, + "step": 4055 + }, + { + "epoch": 0.8208864602307225, + "grad_norm": 0.280321329832077, + "learning_rate": 0.0001279165203448128, + "loss": 0.2306, + "step": 4056 + }, + { + "epoch": 0.8210888484112527, + "grad_norm": 0.336793452501297, + "learning_rate": 0.00012788597633286593, + "loss": 0.2391, + "step": 4057 + }, + { + "epoch": 0.821291236591783, + "grad_norm": 0.2743752896785736, + "learning_rate": 0.00012785542949970857, + "loss": 0.2433, + "step": 4058 + }, + { + "epoch": 0.8214936247723132, + "grad_norm": 0.270509272813797, + "learning_rate": 0.00012782487984843116, + "loss": 0.2349, + "step": 4059 + }, + { + "epoch": 0.8216960129528436, + "grad_norm": 0.4689193665981293, + "learning_rate": 0.00012779432738212437, + "loss": 0.2196, + "step": 4060 + }, + { + "epoch": 0.8218984011333738, + "grad_norm": 0.3306290805339813, + "learning_rate": 0.00012776377210387913, + "loss": 0.2352, + "step": 4061 + }, + { + "epoch": 0.8221007893139041, + "grad_norm": 0.28863513469696045, + "learning_rate": 0.0001277332140167868, + "loss": 0.2359, + "step": 4062 + }, + { + "epoch": 0.8223031774944344, + "grad_norm": 0.2727032005786896, + "learning_rate": 0.00012770265312393887, + "loss": 0.1916, + "step": 4063 + }, + { + "epoch": 0.8225055656749646, + "grad_norm": 0.2952982485294342, + "learning_rate": 0.00012767208942842715, + "loss": 0.2046, + "step": 4064 + }, + { + "epoch": 0.8227079538554949, + "grad_norm": 0.27022436261177063, + "learning_rate": 0.00012764152293334382, + "loss": 0.2266, + "step": 4065 + }, + { + "epoch": 0.8229103420360251, + "grad_norm": 0.35855549573898315, + "learning_rate": 0.00012761095364178124, + "loss": 0.2852, + "step": 4066 + }, + { + "epoch": 0.8231127302165554, + "grad_norm": 0.2672812044620514, + "learning_rate": 0.00012758038155683205, + "loss": 0.199, + "step": 4067 + }, + { + "epoch": 0.8233151183970856, + "grad_norm": 0.36959728598594666, + "learning_rate": 0.00012754980668158928, + "loss": 0.2756, + "step": 4068 + }, + { + "epoch": 0.8235175065776159, + "grad_norm": 0.27906864881515503, + "learning_rate": 0.00012751922901914616, + "loss": 0.2363, + "step": 4069 + }, + { + "epoch": 0.8237198947581461, + "grad_norm": 0.2816850244998932, + "learning_rate": 0.00012748864857259617, + "loss": 0.2152, + "step": 4070 + }, + { + "epoch": 0.8239222829386764, + "grad_norm": 0.27567291259765625, + "learning_rate": 0.00012745806534503315, + "loss": 0.2369, + "step": 4071 + }, + { + "epoch": 0.8241246711192066, + "grad_norm": 0.27985164523124695, + "learning_rate": 0.0001274274793395512, + "loss": 0.2626, + "step": 4072 + }, + { + "epoch": 0.8243270592997369, + "grad_norm": 0.26450982689857483, + "learning_rate": 0.00012739689055924473, + "loss": 0.2379, + "step": 4073 + }, + { + "epoch": 0.8245294474802671, + "grad_norm": 0.3406170904636383, + "learning_rate": 0.0001273662990072083, + "loss": 0.2737, + "step": 4074 + }, + { + "epoch": 0.8247318356607974, + "grad_norm": 0.31535694003105164, + "learning_rate": 0.0001273357046865369, + "loss": 0.2237, + "step": 4075 + }, + { + "epoch": 0.8249342238413276, + "grad_norm": 0.3160010576248169, + "learning_rate": 0.00012730510760032573, + "loss": 0.2622, + "step": 4076 + }, + { + "epoch": 0.825136612021858, + "grad_norm": 0.25679177045822144, + "learning_rate": 0.00012727450775167027, + "loss": 0.2174, + "step": 4077 + }, + { + "epoch": 0.8253390002023882, + "grad_norm": 0.2731407582759857, + "learning_rate": 0.00012724390514366632, + "loss": 0.2193, + "step": 4078 + }, + { + "epoch": 0.8255413883829185, + "grad_norm": 0.3082418143749237, + "learning_rate": 0.0001272132997794099, + "loss": 0.2555, + "step": 4079 + }, + { + "epoch": 0.8257437765634487, + "grad_norm": 0.2599988281726837, + "learning_rate": 0.00012718269166199736, + "loss": 0.236, + "step": 4080 + }, + { + "epoch": 0.825946164743979, + "grad_norm": 0.28256499767303467, + "learning_rate": 0.0001271520807945253, + "loss": 0.2459, + "step": 4081 + }, + { + "epoch": 0.8261485529245092, + "grad_norm": 0.32175830006599426, + "learning_rate": 0.00012712146718009062, + "loss": 0.2839, + "step": 4082 + }, + { + "epoch": 0.8263509411050395, + "grad_norm": 0.2378125935792923, + "learning_rate": 0.00012709085082179047, + "loss": 0.2149, + "step": 4083 + }, + { + "epoch": 0.8265533292855697, + "grad_norm": 0.3288223147392273, + "learning_rate": 0.00012706023172272228, + "loss": 0.2315, + "step": 4084 + }, + { + "epoch": 0.8267557174661, + "grad_norm": 0.28977319598197937, + "learning_rate": 0.00012702960988598378, + "loss": 0.2359, + "step": 4085 + }, + { + "epoch": 0.8269581056466302, + "grad_norm": 0.35099443793296814, + "learning_rate": 0.000126998985314673, + "loss": 0.2244, + "step": 4086 + }, + { + "epoch": 0.8271604938271605, + "grad_norm": 0.27095192670822144, + "learning_rate": 0.00012696835801188816, + "loss": 0.2226, + "step": 4087 + }, + { + "epoch": 0.8273628820076907, + "grad_norm": 0.27652791142463684, + "learning_rate": 0.00012693772798072784, + "loss": 0.206, + "step": 4088 + }, + { + "epoch": 0.827565270188221, + "grad_norm": 0.24089790880680084, + "learning_rate": 0.00012690709522429085, + "loss": 0.2188, + "step": 4089 + }, + { + "epoch": 0.8277676583687512, + "grad_norm": 0.32007068395614624, + "learning_rate": 0.0001268764597456763, + "loss": 0.244, + "step": 4090 + }, + { + "epoch": 0.8279700465492815, + "grad_norm": 0.2929043471813202, + "learning_rate": 0.00012684582154798356, + "loss": 0.2341, + "step": 4091 + }, + { + "epoch": 0.8281724347298117, + "grad_norm": 0.35205259919166565, + "learning_rate": 0.00012681518063431232, + "loss": 0.2495, + "step": 4092 + }, + { + "epoch": 0.828374822910342, + "grad_norm": 0.2678942382335663, + "learning_rate": 0.00012678453700776246, + "loss": 0.2477, + "step": 4093 + }, + { + "epoch": 0.8285772110908723, + "grad_norm": 0.3012922704219818, + "learning_rate": 0.00012675389067143416, + "loss": 0.2505, + "step": 4094 + }, + { + "epoch": 0.8287795992714025, + "grad_norm": 0.2550067901611328, + "learning_rate": 0.00012672324162842796, + "loss": 0.2433, + "step": 4095 + }, + { + "epoch": 0.8289819874519329, + "grad_norm": 0.26899462938308716, + "learning_rate": 0.00012669258988184457, + "loss": 0.2533, + "step": 4096 + }, + { + "epoch": 0.829184375632463, + "grad_norm": 0.335483193397522, + "learning_rate": 0.00012666193543478502, + "loss": 0.2151, + "step": 4097 + }, + { + "epoch": 0.8293867638129934, + "grad_norm": 0.29949456453323364, + "learning_rate": 0.00012663127829035058, + "loss": 0.2246, + "step": 4098 + }, + { + "epoch": 0.8295891519935236, + "grad_norm": 0.2744475305080414, + "learning_rate": 0.00012660061845164286, + "loss": 0.2368, + "step": 4099 + }, + { + "epoch": 0.8297915401740539, + "grad_norm": 0.28630226850509644, + "learning_rate": 0.0001265699559217637, + "loss": 0.2577, + "step": 4100 + }, + { + "epoch": 0.8297915401740539, + "eval_loss": 0.2671542763710022, + "eval_runtime": 0.7421, + "eval_samples_per_second": 6.738, + "eval_steps_per_second": 1.348, + "step": 4100 + }, + { + "epoch": 0.8299939283545841, + "grad_norm": 0.26792484521865845, + "learning_rate": 0.00012653929070381514, + "loss": 0.2567, + "step": 4101 + }, + { + "epoch": 0.8301963165351144, + "grad_norm": 0.3261709213256836, + "learning_rate": 0.00012650862280089967, + "loss": 0.2265, + "step": 4102 + }, + { + "epoch": 0.8303987047156446, + "grad_norm": 0.3107450306415558, + "learning_rate": 0.00012647795221611987, + "loss": 0.2699, + "step": 4103 + }, + { + "epoch": 0.8306010928961749, + "grad_norm": 0.3134928047657013, + "learning_rate": 0.00012644727895257872, + "loss": 0.2574, + "step": 4104 + }, + { + "epoch": 0.8308034810767051, + "grad_norm": 0.44662341475486755, + "learning_rate": 0.00012641660301337937, + "loss": 0.2677, + "step": 4105 + }, + { + "epoch": 0.8310058692572354, + "grad_norm": 0.3029615581035614, + "learning_rate": 0.00012638592440162533, + "loss": 0.2132, + "step": 4106 + }, + { + "epoch": 0.8312082574377656, + "grad_norm": 0.3081475496292114, + "learning_rate": 0.0001263552431204203, + "loss": 0.2333, + "step": 4107 + }, + { + "epoch": 0.8314106456182959, + "grad_norm": 0.2571415305137634, + "learning_rate": 0.0001263245591728683, + "loss": 0.2376, + "step": 4108 + }, + { + "epoch": 0.8316130337988261, + "grad_norm": 0.3064200282096863, + "learning_rate": 0.00012629387256207365, + "loss": 0.2657, + "step": 4109 + }, + { + "epoch": 0.8318154219793564, + "grad_norm": 0.34786108136177063, + "learning_rate": 0.00012626318329114089, + "loss": 0.2352, + "step": 4110 + }, + { + "epoch": 0.8320178101598866, + "grad_norm": 0.3301227390766144, + "learning_rate": 0.0001262324913631748, + "loss": 0.2761, + "step": 4111 + }, + { + "epoch": 0.8322201983404169, + "grad_norm": 0.2794135510921478, + "learning_rate": 0.00012620179678128051, + "loss": 0.2733, + "step": 4112 + }, + { + "epoch": 0.8324225865209471, + "grad_norm": 0.25935810804367065, + "learning_rate": 0.00012617109954856333, + "loss": 0.2474, + "step": 4113 + }, + { + "epoch": 0.8326249747014774, + "grad_norm": 0.34428855776786804, + "learning_rate": 0.00012614039966812892, + "loss": 0.2705, + "step": 4114 + }, + { + "epoch": 0.8328273628820076, + "grad_norm": 0.27258262038230896, + "learning_rate": 0.00012610969714308315, + "loss": 0.2574, + "step": 4115 + }, + { + "epoch": 0.833029751062538, + "grad_norm": 0.2827422320842743, + "learning_rate": 0.0001260789919765322, + "loss": 0.2316, + "step": 4116 + }, + { + "epoch": 0.8332321392430682, + "grad_norm": 0.41177263855934143, + "learning_rate": 0.00012604828417158248, + "loss": 0.2643, + "step": 4117 + }, + { + "epoch": 0.8334345274235985, + "grad_norm": 0.41825127601623535, + "learning_rate": 0.0001260175737313407, + "loss": 0.2359, + "step": 4118 + }, + { + "epoch": 0.8336369156041287, + "grad_norm": 0.2745526134967804, + "learning_rate": 0.0001259868606589138, + "loss": 0.2481, + "step": 4119 + }, + { + "epoch": 0.833839303784659, + "grad_norm": 0.330802321434021, + "learning_rate": 0.00012595614495740902, + "loss": 0.2572, + "step": 4120 + }, + { + "epoch": 0.8340416919651892, + "grad_norm": 0.2694607377052307, + "learning_rate": 0.00012592542662993384, + "loss": 0.2102, + "step": 4121 + }, + { + "epoch": 0.8342440801457195, + "grad_norm": 0.31097790598869324, + "learning_rate": 0.00012589470567959601, + "loss": 0.2288, + "step": 4122 + }, + { + "epoch": 0.8344464683262498, + "grad_norm": 0.30229610204696655, + "learning_rate": 0.0001258639821095036, + "loss": 0.2442, + "step": 4123 + }, + { + "epoch": 0.83464885650678, + "grad_norm": 0.28008151054382324, + "learning_rate": 0.00012583325592276486, + "loss": 0.249, + "step": 4124 + }, + { + "epoch": 0.8348512446873103, + "grad_norm": 0.3760617971420288, + "learning_rate": 0.00012580252712248832, + "loss": 0.2887, + "step": 4125 + }, + { + "epoch": 0.8350536328678405, + "grad_norm": 0.27306854724884033, + "learning_rate": 0.00012577179571178287, + "loss": 0.2536, + "step": 4126 + }, + { + "epoch": 0.8352560210483708, + "grad_norm": 0.3283601701259613, + "learning_rate": 0.0001257410616937575, + "loss": 0.253, + "step": 4127 + }, + { + "epoch": 0.835458409228901, + "grad_norm": 0.26647692918777466, + "learning_rate": 0.0001257103250715217, + "loss": 0.2347, + "step": 4128 + }, + { + "epoch": 0.8356607974094313, + "grad_norm": 0.31827718019485474, + "learning_rate": 0.00012567958584818492, + "loss": 0.2342, + "step": 4129 + }, + { + "epoch": 0.8358631855899615, + "grad_norm": 0.3126533031463623, + "learning_rate": 0.0001256488440268571, + "loss": 0.2565, + "step": 4130 + }, + { + "epoch": 0.8360655737704918, + "grad_norm": 0.2776491940021515, + "learning_rate": 0.00012561809961064837, + "loss": 0.2314, + "step": 4131 + }, + { + "epoch": 0.836267961951022, + "grad_norm": 0.25948649644851685, + "learning_rate": 0.00012558735260266915, + "loss": 0.224, + "step": 4132 + }, + { + "epoch": 0.8364703501315524, + "grad_norm": 0.3881494998931885, + "learning_rate": 0.00012555660300603004, + "loss": 0.2598, + "step": 4133 + }, + { + "epoch": 0.8366727383120826, + "grad_norm": 0.31026434898376465, + "learning_rate": 0.00012552585082384202, + "loss": 0.2294, + "step": 4134 + }, + { + "epoch": 0.8368751264926129, + "grad_norm": 0.2892588973045349, + "learning_rate": 0.00012549509605921626, + "loss": 0.2383, + "step": 4135 + }, + { + "epoch": 0.8370775146731431, + "grad_norm": 0.28856706619262695, + "learning_rate": 0.0001254643387152642, + "loss": 0.2234, + "step": 4136 + }, + { + "epoch": 0.8372799028536734, + "grad_norm": 0.32769984006881714, + "learning_rate": 0.0001254335787950975, + "loss": 0.28, + "step": 4137 + }, + { + "epoch": 0.8374822910342036, + "grad_norm": 0.34366121888160706, + "learning_rate": 0.0001254028163018282, + "loss": 0.2125, + "step": 4138 + }, + { + "epoch": 0.8376846792147339, + "grad_norm": 0.24694864451885223, + "learning_rate": 0.0001253720512385685, + "loss": 0.219, + "step": 4139 + }, + { + "epoch": 0.8378870673952641, + "grad_norm": 0.2664625644683838, + "learning_rate": 0.00012534128360843088, + "loss": 0.2167, + "step": 4140 + }, + { + "epoch": 0.8380894555757944, + "grad_norm": 0.27844277024269104, + "learning_rate": 0.0001253105134145281, + "loss": 0.2333, + "step": 4141 + }, + { + "epoch": 0.8382918437563246, + "grad_norm": 0.34461063146591187, + "learning_rate": 0.00012527974065997314, + "loss": 0.2255, + "step": 4142 + }, + { + "epoch": 0.8384942319368549, + "grad_norm": 0.37902191281318665, + "learning_rate": 0.00012524896534787927, + "loss": 0.2471, + "step": 4143 + }, + { + "epoch": 0.8386966201173851, + "grad_norm": 0.3178468942642212, + "learning_rate": 0.00012521818748136005, + "loss": 0.2689, + "step": 4144 + }, + { + "epoch": 0.8388990082979154, + "grad_norm": 0.37401625514030457, + "learning_rate": 0.0001251874070635292, + "loss": 0.2641, + "step": 4145 + }, + { + "epoch": 0.8391013964784456, + "grad_norm": 0.4163181781768799, + "learning_rate": 0.0001251566240975008, + "loss": 0.2422, + "step": 4146 + }, + { + "epoch": 0.8393037846589759, + "grad_norm": 0.30229562520980835, + "learning_rate": 0.00012512583858638915, + "loss": 0.2431, + "step": 4147 + }, + { + "epoch": 0.8395061728395061, + "grad_norm": 0.2687293291091919, + "learning_rate": 0.0001250950505333088, + "loss": 0.2104, + "step": 4148 + }, + { + "epoch": 0.8397085610200364, + "grad_norm": 0.3084995150566101, + "learning_rate": 0.00012506425994137453, + "loss": 0.2719, + "step": 4149 + }, + { + "epoch": 0.8399109492005666, + "grad_norm": 0.30836740136146545, + "learning_rate": 0.00012503346681370144, + "loss": 0.2236, + "step": 4150 + }, + { + "epoch": 0.8399109492005666, + "eval_loss": 0.2687932252883911, + "eval_runtime": 0.7396, + "eval_samples_per_second": 6.76, + "eval_steps_per_second": 1.352, + "step": 4150 + }, + { + "epoch": 0.840113337381097, + "grad_norm": 0.313474178314209, + "learning_rate": 0.00012500267115340489, + "loss": 0.229, + "step": 4151 + }, + { + "epoch": 0.8403157255616271, + "grad_norm": 0.25299134850502014, + "learning_rate": 0.0001249718729636004, + "loss": 0.2492, + "step": 4152 + }, + { + "epoch": 0.8405181137421575, + "grad_norm": 0.31801363825798035, + "learning_rate": 0.0001249410722474038, + "loss": 0.2544, + "step": 4153 + }, + { + "epoch": 0.8407205019226878, + "grad_norm": 0.3109778165817261, + "learning_rate": 0.00012491026900793127, + "loss": 0.238, + "step": 4154 + }, + { + "epoch": 0.840922890103218, + "grad_norm": 0.28446948528289795, + "learning_rate": 0.00012487946324829904, + "loss": 0.2125, + "step": 4155 + }, + { + "epoch": 0.8411252782837483, + "grad_norm": 0.3104979693889618, + "learning_rate": 0.0001248486549716238, + "loss": 0.2325, + "step": 4156 + }, + { + "epoch": 0.8413276664642785, + "grad_norm": 0.3089185953140259, + "learning_rate": 0.00012481784418102242, + "loss": 0.2224, + "step": 4157 + }, + { + "epoch": 0.8415300546448088, + "grad_norm": 0.28800642490386963, + "learning_rate": 0.00012478703087961192, + "loss": 0.2248, + "step": 4158 + }, + { + "epoch": 0.841732442825339, + "grad_norm": 0.26682931184768677, + "learning_rate": 0.00012475621507050975, + "loss": 0.2331, + "step": 4159 + }, + { + "epoch": 0.8419348310058693, + "grad_norm": 0.24975162744522095, + "learning_rate": 0.0001247253967568335, + "loss": 0.2743, + "step": 4160 + }, + { + "epoch": 0.8421372191863995, + "grad_norm": 0.2340884804725647, + "learning_rate": 0.00012469457594170105, + "loss": 0.2211, + "step": 4161 + }, + { + "epoch": 0.8423396073669298, + "grad_norm": 0.4196295738220215, + "learning_rate": 0.0001246637526282305, + "loss": 0.2465, + "step": 4162 + }, + { + "epoch": 0.84254199554746, + "grad_norm": 0.34679028391838074, + "learning_rate": 0.00012463292681954029, + "loss": 0.2195, + "step": 4163 + }, + { + "epoch": 0.8427443837279903, + "grad_norm": 0.4965565800666809, + "learning_rate": 0.000124602098518749, + "loss": 0.2472, + "step": 4164 + }, + { + "epoch": 0.8429467719085205, + "grad_norm": 0.31285765767097473, + "learning_rate": 0.00012457126772897554, + "loss": 0.2732, + "step": 4165 + }, + { + "epoch": 0.8431491600890508, + "grad_norm": 0.32587265968322754, + "learning_rate": 0.000124540434453339, + "loss": 0.2426, + "step": 4166 + }, + { + "epoch": 0.843351548269581, + "grad_norm": 0.30432063341140747, + "learning_rate": 0.00012450959869495884, + "loss": 0.2891, + "step": 4167 + }, + { + "epoch": 0.8435539364501113, + "grad_norm": 0.2635737359523773, + "learning_rate": 0.00012447876045695465, + "loss": 0.2311, + "step": 4168 + }, + { + "epoch": 0.8437563246306415, + "grad_norm": 0.28429943323135376, + "learning_rate": 0.00012444791974244632, + "loss": 0.2306, + "step": 4169 + }, + { + "epoch": 0.8439587128111719, + "grad_norm": 0.2997816503047943, + "learning_rate": 0.000124417076554554, + "loss": 0.2489, + "step": 4170 + }, + { + "epoch": 0.844161100991702, + "grad_norm": 0.258544385433197, + "learning_rate": 0.00012438623089639807, + "loss": 0.2333, + "step": 4171 + }, + { + "epoch": 0.8443634891722324, + "grad_norm": 0.2563791871070862, + "learning_rate": 0.0001243553827710992, + "loss": 0.2013, + "step": 4172 + }, + { + "epoch": 0.8445658773527626, + "grad_norm": 0.28656166791915894, + "learning_rate": 0.00012432453218177826, + "loss": 0.2551, + "step": 4173 + }, + { + "epoch": 0.8447682655332929, + "grad_norm": 0.2514500916004181, + "learning_rate": 0.0001242936791315564, + "loss": 0.2311, + "step": 4174 + }, + { + "epoch": 0.8449706537138231, + "grad_norm": 0.3577767014503479, + "learning_rate": 0.00012426282362355497, + "loss": 0.233, + "step": 4175 + }, + { + "epoch": 0.8451730418943534, + "grad_norm": 0.25653162598609924, + "learning_rate": 0.00012423196566089563, + "loss": 0.2361, + "step": 4176 + }, + { + "epoch": 0.8453754300748836, + "grad_norm": 0.26661503314971924, + "learning_rate": 0.00012420110524670027, + "loss": 0.2388, + "step": 4177 + }, + { + "epoch": 0.8455778182554139, + "grad_norm": 0.3121111989021301, + "learning_rate": 0.00012417024238409104, + "loss": 0.2512, + "step": 4178 + }, + { + "epoch": 0.8457802064359441, + "grad_norm": 0.24404673278331757, + "learning_rate": 0.0001241393770761903, + "loss": 0.23, + "step": 4179 + }, + { + "epoch": 0.8459825946164744, + "grad_norm": 0.26664167642593384, + "learning_rate": 0.00012410850932612067, + "loss": 0.2175, + "step": 4180 + }, + { + "epoch": 0.8461849827970046, + "grad_norm": 0.3187786340713501, + "learning_rate": 0.000124077639137005, + "loss": 0.2389, + "step": 4181 + }, + { + "epoch": 0.8463873709775349, + "grad_norm": 0.27171164751052856, + "learning_rate": 0.0001240467665119665, + "loss": 0.2225, + "step": 4182 + }, + { + "epoch": 0.8465897591580651, + "grad_norm": 0.27636775374412537, + "learning_rate": 0.00012401589145412848, + "loss": 0.2409, + "step": 4183 + }, + { + "epoch": 0.8467921473385954, + "grad_norm": 0.2631177008152008, + "learning_rate": 0.00012398501396661455, + "loss": 0.208, + "step": 4184 + }, + { + "epoch": 0.8469945355191257, + "grad_norm": 0.24153582751750946, + "learning_rate": 0.00012395413405254853, + "loss": 0.2238, + "step": 4185 + }, + { + "epoch": 0.8471969236996559, + "grad_norm": 0.2937990725040436, + "learning_rate": 0.0001239232517150546, + "loss": 0.2307, + "step": 4186 + }, + { + "epoch": 0.8473993118801862, + "grad_norm": 0.2699730694293976, + "learning_rate": 0.00012389236695725713, + "loss": 0.2187, + "step": 4187 + }, + { + "epoch": 0.8476017000607164, + "grad_norm": 0.31642740964889526, + "learning_rate": 0.00012386147978228062, + "loss": 0.2496, + "step": 4188 + }, + { + "epoch": 0.8478040882412468, + "grad_norm": 0.25987985730171204, + "learning_rate": 0.00012383059019325, + "loss": 0.221, + "step": 4189 + }, + { + "epoch": 0.848006476421777, + "grad_norm": 0.2971332371234894, + "learning_rate": 0.0001237996981932903, + "loss": 0.2033, + "step": 4190 + }, + { + "epoch": 0.8482088646023073, + "grad_norm": 0.2770395576953888, + "learning_rate": 0.00012376880378552684, + "loss": 0.2042, + "step": 4191 + }, + { + "epoch": 0.8484112527828375, + "grad_norm": 0.3305583894252777, + "learning_rate": 0.00012373790697308524, + "loss": 0.2493, + "step": 4192 + }, + { + "epoch": 0.8486136409633678, + "grad_norm": 0.2884077727794647, + "learning_rate": 0.0001237070077590913, + "loss": 0.2318, + "step": 4193 + }, + { + "epoch": 0.848816029143898, + "grad_norm": 0.2692849934101105, + "learning_rate": 0.00012367610614667104, + "loss": 0.2435, + "step": 4194 + }, + { + "epoch": 0.8490184173244283, + "grad_norm": 0.3168359696865082, + "learning_rate": 0.00012364520213895084, + "loss": 0.2159, + "step": 4195 + }, + { + "epoch": 0.8492208055049585, + "grad_norm": 0.2593216598033905, + "learning_rate": 0.00012361429573905716, + "loss": 0.2372, + "step": 4196 + }, + { + "epoch": 0.8494231936854888, + "grad_norm": 0.33089300990104675, + "learning_rate": 0.00012358338695011683, + "loss": 0.2329, + "step": 4197 + }, + { + "epoch": 0.849625581866019, + "grad_norm": 0.32414549589157104, + "learning_rate": 0.00012355247577525686, + "loss": 0.2377, + "step": 4198 + }, + { + "epoch": 0.8498279700465493, + "grad_norm": 0.2713814973831177, + "learning_rate": 0.0001235215622176045, + "loss": 0.2442, + "step": 4199 + }, + { + "epoch": 0.8500303582270795, + "grad_norm": 0.525406002998352, + "learning_rate": 0.00012349064628028731, + "loss": 0.2433, + "step": 4200 + }, + { + "epoch": 0.8500303582270795, + "eval_loss": 0.26852184534072876, + "eval_runtime": 0.7369, + "eval_samples_per_second": 6.785, + "eval_steps_per_second": 1.357, + "step": 4200 + }, + { + "epoch": 0.8502327464076098, + "grad_norm": 0.33354291319847107, + "learning_rate": 0.000123459727966433, + "loss": 0.2556, + "step": 4201 + }, + { + "epoch": 0.85043513458814, + "grad_norm": 0.30555057525634766, + "learning_rate": 0.00012342880727916962, + "loss": 0.2485, + "step": 4202 + }, + { + "epoch": 0.8506375227686703, + "grad_norm": 0.30072835087776184, + "learning_rate": 0.0001233978842216253, + "loss": 0.2212, + "step": 4203 + }, + { + "epoch": 0.8508399109492005, + "grad_norm": 0.29450398683547974, + "learning_rate": 0.0001233669587969286, + "loss": 0.2406, + "step": 4204 + }, + { + "epoch": 0.8510422991297308, + "grad_norm": 0.25204646587371826, + "learning_rate": 0.00012333603100820817, + "loss": 0.231, + "step": 4205 + }, + { + "epoch": 0.851244687310261, + "grad_norm": 0.3143097758293152, + "learning_rate": 0.000123305100858593, + "loss": 0.2027, + "step": 4206 + }, + { + "epoch": 0.8514470754907913, + "grad_norm": 0.2622186243534088, + "learning_rate": 0.00012327416835121227, + "loss": 0.2354, + "step": 4207 + }, + { + "epoch": 0.8516494636713215, + "grad_norm": 0.32362186908721924, + "learning_rate": 0.00012324323348919538, + "loss": 0.2323, + "step": 4208 + }, + { + "epoch": 0.8518518518518519, + "grad_norm": 0.27008962631225586, + "learning_rate": 0.00012321229627567203, + "loss": 0.2444, + "step": 4209 + }, + { + "epoch": 0.8520542400323821, + "grad_norm": 0.26546764373779297, + "learning_rate": 0.0001231813567137721, + "loss": 0.2562, + "step": 4210 + }, + { + "epoch": 0.8522566282129124, + "grad_norm": 0.3695333003997803, + "learning_rate": 0.00012315041480662572, + "loss": 0.2415, + "step": 4211 + }, + { + "epoch": 0.8524590163934426, + "grad_norm": 0.27647483348846436, + "learning_rate": 0.00012311947055736332, + "loss": 0.2173, + "step": 4212 + }, + { + "epoch": 0.8526614045739729, + "grad_norm": 0.29639866948127747, + "learning_rate": 0.00012308852396911545, + "loss": 0.2011, + "step": 4213 + }, + { + "epoch": 0.8528637927545032, + "grad_norm": 0.24613668024539948, + "learning_rate": 0.00012305757504501297, + "loss": 0.2152, + "step": 4214 + }, + { + "epoch": 0.8530661809350334, + "grad_norm": 0.2693268656730652, + "learning_rate": 0.00012302662378818702, + "loss": 0.2334, + "step": 4215 + }, + { + "epoch": 0.8532685691155637, + "grad_norm": 0.2689792811870575, + "learning_rate": 0.0001229956702017689, + "loss": 0.2235, + "step": 4216 + }, + { + "epoch": 0.8534709572960939, + "grad_norm": 0.28079694509506226, + "learning_rate": 0.00012296471428889017, + "loss": 0.2363, + "step": 4217 + }, + { + "epoch": 0.8536733454766242, + "grad_norm": 0.41205066442489624, + "learning_rate": 0.00012293375605268257, + "loss": 0.2761, + "step": 4218 + }, + { + "epoch": 0.8538757336571544, + "grad_norm": 0.28291985392570496, + "learning_rate": 0.0001229027954962782, + "loss": 0.2173, + "step": 4219 + }, + { + "epoch": 0.8540781218376847, + "grad_norm": 0.2866034209728241, + "learning_rate": 0.0001228718326228093, + "loss": 0.2426, + "step": 4220 + }, + { + "epoch": 0.8542805100182149, + "grad_norm": 0.26522722840309143, + "learning_rate": 0.00012284086743540837, + "loss": 0.2061, + "step": 4221 + }, + { + "epoch": 0.8544828981987452, + "grad_norm": 0.2686139941215515, + "learning_rate": 0.00012280989993720812, + "loss": 0.2458, + "step": 4222 + }, + { + "epoch": 0.8546852863792754, + "grad_norm": 0.3038209080696106, + "learning_rate": 0.00012277893013134153, + "loss": 0.2744, + "step": 4223 + }, + { + "epoch": 0.8548876745598057, + "grad_norm": 0.29698294401168823, + "learning_rate": 0.00012274795802094183, + "loss": 0.2576, + "step": 4224 + }, + { + "epoch": 0.8550900627403359, + "grad_norm": 0.3130031228065491, + "learning_rate": 0.00012271698360914241, + "loss": 0.2659, + "step": 4225 + }, + { + "epoch": 0.8552924509208663, + "grad_norm": 0.29156437516212463, + "learning_rate": 0.00012268600689907696, + "loss": 0.2503, + "step": 4226 + }, + { + "epoch": 0.8554948391013965, + "grad_norm": 0.32451170682907104, + "learning_rate": 0.0001226550278938794, + "loss": 0.2334, + "step": 4227 + }, + { + "epoch": 0.8556972272819268, + "grad_norm": 0.4321478605270386, + "learning_rate": 0.0001226240465966838, + "loss": 0.2359, + "step": 4228 + }, + { + "epoch": 0.855899615462457, + "grad_norm": 0.2836534082889557, + "learning_rate": 0.00012259306301062457, + "loss": 0.2312, + "step": 4229 + }, + { + "epoch": 0.8561020036429873, + "grad_norm": 0.28352999687194824, + "learning_rate": 0.00012256207713883633, + "loss": 0.2497, + "step": 4230 + }, + { + "epoch": 0.8563043918235175, + "grad_norm": 0.2701122760772705, + "learning_rate": 0.0001225310889844538, + "loss": 0.2047, + "step": 4231 + }, + { + "epoch": 0.8565067800040478, + "grad_norm": 0.29781049489974976, + "learning_rate": 0.00012250009855061214, + "loss": 0.2241, + "step": 4232 + }, + { + "epoch": 0.856709168184578, + "grad_norm": 0.2576650083065033, + "learning_rate": 0.00012246910584044656, + "loss": 0.2108, + "step": 4233 + }, + { + "epoch": 0.8569115563651083, + "grad_norm": 0.3117930591106415, + "learning_rate": 0.00012243811085709268, + "loss": 0.2287, + "step": 4234 + }, + { + "epoch": 0.8571139445456385, + "grad_norm": 0.2636104226112366, + "learning_rate": 0.00012240711360368613, + "loss": 0.2458, + "step": 4235 + }, + { + "epoch": 0.8573163327261688, + "grad_norm": 0.2883046865463257, + "learning_rate": 0.00012237611408336298, + "loss": 0.2275, + "step": 4236 + }, + { + "epoch": 0.857518720906699, + "grad_norm": 0.31685107946395874, + "learning_rate": 0.00012234511229925935, + "loss": 0.2821, + "step": 4237 + }, + { + "epoch": 0.8577211090872293, + "grad_norm": 0.2774471342563629, + "learning_rate": 0.00012231410825451177, + "loss": 0.2242, + "step": 4238 + }, + { + "epoch": 0.8579234972677595, + "grad_norm": 0.2992092967033386, + "learning_rate": 0.00012228310195225683, + "loss": 0.2295, + "step": 4239 + }, + { + "epoch": 0.8581258854482898, + "grad_norm": 0.49218326807022095, + "learning_rate": 0.00012225209339563145, + "loss": 0.2647, + "step": 4240 + }, + { + "epoch": 0.85832827362882, + "grad_norm": 0.3133196234703064, + "learning_rate": 0.00012222108258777277, + "loss": 0.2413, + "step": 4241 + }, + { + "epoch": 0.8585306618093503, + "grad_norm": 0.3050863742828369, + "learning_rate": 0.0001221900695318181, + "loss": 0.2365, + "step": 4242 + }, + { + "epoch": 0.8587330499898805, + "grad_norm": 0.2712414860725403, + "learning_rate": 0.00012215905423090503, + "loss": 0.2068, + "step": 4243 + }, + { + "epoch": 0.8589354381704108, + "grad_norm": 0.2704077661037445, + "learning_rate": 0.00012212803668817135, + "loss": 0.1997, + "step": 4244 + }, + { + "epoch": 0.8591378263509412, + "grad_norm": 0.2854161262512207, + "learning_rate": 0.00012209701690675512, + "loss": 0.2451, + "step": 4245 + }, + { + "epoch": 0.8593402145314714, + "grad_norm": 0.2779470384120941, + "learning_rate": 0.00012206599488979458, + "loss": 0.234, + "step": 4246 + }, + { + "epoch": 0.8595426027120017, + "grad_norm": 0.2911601662635803, + "learning_rate": 0.00012203497064042821, + "loss": 0.2534, + "step": 4247 + }, + { + "epoch": 0.8597449908925319, + "grad_norm": 0.24760468304157257, + "learning_rate": 0.00012200394416179473, + "loss": 0.207, + "step": 4248 + }, + { + "epoch": 0.8599473790730622, + "grad_norm": 0.2779715359210968, + "learning_rate": 0.00012197291545703306, + "loss": 0.2406, + "step": 4249 + }, + { + "epoch": 0.8601497672535924, + "grad_norm": 0.2952604591846466, + "learning_rate": 0.00012194188452928237, + "loss": 0.2307, + "step": 4250 + }, + { + "epoch": 0.8601497672535924, + "eval_loss": 0.26880744099617004, + "eval_runtime": 0.7389, + "eval_samples_per_second": 6.767, + "eval_steps_per_second": 1.353, + "step": 4250 + }, + { + "epoch": 0.8603521554341227, + "grad_norm": 0.26754164695739746, + "learning_rate": 0.00012191085138168205, + "loss": 0.2347, + "step": 4251 + }, + { + "epoch": 0.8605545436146529, + "grad_norm": 0.3067428171634674, + "learning_rate": 0.00012187981601737168, + "loss": 0.2407, + "step": 4252 + }, + { + "epoch": 0.8607569317951832, + "grad_norm": 0.26935678720474243, + "learning_rate": 0.00012184877843949109, + "loss": 0.2517, + "step": 4253 + }, + { + "epoch": 0.8609593199757134, + "grad_norm": 0.2682383358478546, + "learning_rate": 0.00012181773865118038, + "loss": 0.2088, + "step": 4254 + }, + { + "epoch": 0.8611617081562437, + "grad_norm": 0.3583400249481201, + "learning_rate": 0.00012178669665557978, + "loss": 0.2717, + "step": 4255 + }, + { + "epoch": 0.8613640963367739, + "grad_norm": 0.26008254289627075, + "learning_rate": 0.00012175565245582983, + "loss": 0.2705, + "step": 4256 + }, + { + "epoch": 0.8615664845173042, + "grad_norm": 0.46403968334198, + "learning_rate": 0.00012172460605507126, + "loss": 0.2724, + "step": 4257 + }, + { + "epoch": 0.8617688726978344, + "grad_norm": 0.2515455484390259, + "learning_rate": 0.00012169355745644498, + "loss": 0.2391, + "step": 4258 + }, + { + "epoch": 0.8619712608783647, + "grad_norm": 0.25544747710227966, + "learning_rate": 0.00012166250666309218, + "loss": 0.2561, + "step": 4259 + }, + { + "epoch": 0.8621736490588949, + "grad_norm": 0.29672637581825256, + "learning_rate": 0.00012163145367815428, + "loss": 0.2362, + "step": 4260 + }, + { + "epoch": 0.8623760372394252, + "grad_norm": 0.29169073700904846, + "learning_rate": 0.00012160039850477286, + "loss": 0.2237, + "step": 4261 + }, + { + "epoch": 0.8625784254199554, + "grad_norm": 0.28334859013557434, + "learning_rate": 0.00012156934114608977, + "loss": 0.2228, + "step": 4262 + }, + { + "epoch": 0.8627808136004858, + "grad_norm": 0.2901778221130371, + "learning_rate": 0.00012153828160524707, + "loss": 0.2407, + "step": 4263 + }, + { + "epoch": 0.862983201781016, + "grad_norm": 0.276862233877182, + "learning_rate": 0.00012150721988538703, + "loss": 0.2362, + "step": 4264 + }, + { + "epoch": 0.8631855899615463, + "grad_norm": 0.30135321617126465, + "learning_rate": 0.00012147615598965216, + "loss": 0.2148, + "step": 4265 + }, + { + "epoch": 0.8633879781420765, + "grad_norm": 0.2975095212459564, + "learning_rate": 0.00012144508992118518, + "loss": 0.2506, + "step": 4266 + }, + { + "epoch": 0.8635903663226068, + "grad_norm": 0.3389938771724701, + "learning_rate": 0.000121414021683129, + "loss": 0.2407, + "step": 4267 + }, + { + "epoch": 0.863792754503137, + "grad_norm": 0.2970937490463257, + "learning_rate": 0.00012138295127862682, + "loss": 0.2218, + "step": 4268 + }, + { + "epoch": 0.8639951426836673, + "grad_norm": 0.315762996673584, + "learning_rate": 0.00012135187871082201, + "loss": 0.2503, + "step": 4269 + }, + { + "epoch": 0.8641975308641975, + "grad_norm": 0.3555409610271454, + "learning_rate": 0.00012132080398285812, + "loss": 0.2838, + "step": 4270 + }, + { + "epoch": 0.8643999190447278, + "grad_norm": 0.29222363233566284, + "learning_rate": 0.00012128972709787903, + "loss": 0.2555, + "step": 4271 + }, + { + "epoch": 0.864602307225258, + "grad_norm": 0.31274712085723877, + "learning_rate": 0.00012125864805902873, + "loss": 0.2368, + "step": 4272 + }, + { + "epoch": 0.8648046954057883, + "grad_norm": 0.412761390209198, + "learning_rate": 0.00012122756686945151, + "loss": 0.2347, + "step": 4273 + }, + { + "epoch": 0.8650070835863185, + "grad_norm": 0.32250848412513733, + "learning_rate": 0.00012119648353229179, + "loss": 0.2867, + "step": 4274 + }, + { + "epoch": 0.8652094717668488, + "grad_norm": 0.30340439081192017, + "learning_rate": 0.00012116539805069426, + "loss": 0.2235, + "step": 4275 + }, + { + "epoch": 0.8654118599473791, + "grad_norm": 0.2612261176109314, + "learning_rate": 0.00012113431042780387, + "loss": 0.2122, + "step": 4276 + }, + { + "epoch": 0.8656142481279093, + "grad_norm": 0.2950068712234497, + "learning_rate": 0.0001211032206667657, + "loss": 0.2534, + "step": 4277 + }, + { + "epoch": 0.8658166363084396, + "grad_norm": 0.2997754216194153, + "learning_rate": 0.00012107212877072509, + "loss": 0.246, + "step": 4278 + }, + { + "epoch": 0.8660190244889698, + "grad_norm": 0.3149646818637848, + "learning_rate": 0.0001210410347428276, + "loss": 0.2493, + "step": 4279 + }, + { + "epoch": 0.8662214126695001, + "grad_norm": 0.33829453587532043, + "learning_rate": 0.000121009938586219, + "loss": 0.2602, + "step": 4280 + }, + { + "epoch": 0.8664238008500303, + "grad_norm": 0.3235696256160736, + "learning_rate": 0.00012097884030404527, + "loss": 0.2269, + "step": 4281 + }, + { + "epoch": 0.8666261890305607, + "grad_norm": 0.23874787986278534, + "learning_rate": 0.00012094773989945261, + "loss": 0.2319, + "step": 4282 + }, + { + "epoch": 0.8668285772110909, + "grad_norm": 0.26383423805236816, + "learning_rate": 0.00012091663737558743, + "loss": 0.2389, + "step": 4283 + }, + { + "epoch": 0.8670309653916212, + "grad_norm": 0.28093284368515015, + "learning_rate": 0.00012088553273559638, + "loss": 0.2476, + "step": 4284 + }, + { + "epoch": 0.8672333535721514, + "grad_norm": 0.31474965810775757, + "learning_rate": 0.00012085442598262624, + "loss": 0.2602, + "step": 4285 + }, + { + "epoch": 0.8674357417526817, + "grad_norm": 0.25037381052970886, + "learning_rate": 0.00012082331711982411, + "loss": 0.2328, + "step": 4286 + }, + { + "epoch": 0.8676381299332119, + "grad_norm": 0.34945225715637207, + "learning_rate": 0.00012079220615033724, + "loss": 0.2495, + "step": 4287 + }, + { + "epoch": 0.8678405181137422, + "grad_norm": 0.48855215311050415, + "learning_rate": 0.00012076109307731314, + "loss": 0.2462, + "step": 4288 + }, + { + "epoch": 0.8680429062942724, + "grad_norm": 0.288464218378067, + "learning_rate": 0.00012072997790389946, + "loss": 0.2455, + "step": 4289 + }, + { + "epoch": 0.8682452944748027, + "grad_norm": 0.3199939429759979, + "learning_rate": 0.00012069886063324414, + "loss": 0.2443, + "step": 4290 + }, + { + "epoch": 0.8684476826553329, + "grad_norm": 0.35276976227760315, + "learning_rate": 0.00012066774126849529, + "loss": 0.229, + "step": 4291 + }, + { + "epoch": 0.8686500708358632, + "grad_norm": 0.22685624659061432, + "learning_rate": 0.00012063661981280125, + "loss": 0.201, + "step": 4292 + }, + { + "epoch": 0.8688524590163934, + "grad_norm": 0.3191373944282532, + "learning_rate": 0.00012060549626931057, + "loss": 0.2442, + "step": 4293 + }, + { + "epoch": 0.8690548471969237, + "grad_norm": 0.3646087944507599, + "learning_rate": 0.00012057437064117198, + "loss": 0.2304, + "step": 4294 + }, + { + "epoch": 0.8692572353774539, + "grad_norm": 0.2860366702079773, + "learning_rate": 0.00012054324293153447, + "loss": 0.2518, + "step": 4295 + }, + { + "epoch": 0.8694596235579842, + "grad_norm": 0.2568869888782501, + "learning_rate": 0.00012051211314354719, + "loss": 0.2683, + "step": 4296 + }, + { + "epoch": 0.8696620117385144, + "grad_norm": 0.3527676463127136, + "learning_rate": 0.00012048098128035951, + "loss": 0.2383, + "step": 4297 + }, + { + "epoch": 0.8698643999190447, + "grad_norm": 0.26397374272346497, + "learning_rate": 0.00012044984734512106, + "loss": 0.2068, + "step": 4298 + }, + { + "epoch": 0.8700667880995749, + "grad_norm": 0.28540942072868347, + "learning_rate": 0.00012041871134098164, + "loss": 0.2392, + "step": 4299 + }, + { + "epoch": 0.8702691762801052, + "grad_norm": 0.30011117458343506, + "learning_rate": 0.00012038757327109125, + "loss": 0.2559, + "step": 4300 + }, + { + "epoch": 0.8702691762801052, + "eval_loss": 0.26672375202178955, + "eval_runtime": 0.7378, + "eval_samples_per_second": 6.777, + "eval_steps_per_second": 1.355, + "step": 4300 + }, + { + "epoch": 0.8704715644606354, + "grad_norm": 0.2464076280593872, + "learning_rate": 0.00012035643313860013, + "loss": 0.2172, + "step": 4301 + }, + { + "epoch": 0.8706739526411658, + "grad_norm": 0.3207111656665802, + "learning_rate": 0.0001203252909466587, + "loss": 0.2681, + "step": 4302 + }, + { + "epoch": 0.870876340821696, + "grad_norm": 0.26082876324653625, + "learning_rate": 0.00012029414669841758, + "loss": 0.246, + "step": 4303 + }, + { + "epoch": 0.8710787290022263, + "grad_norm": 0.29645365476608276, + "learning_rate": 0.00012026300039702766, + "loss": 0.202, + "step": 4304 + }, + { + "epoch": 0.8712811171827566, + "grad_norm": 0.24679231643676758, + "learning_rate": 0.00012023185204563998, + "loss": 0.1947, + "step": 4305 + }, + { + "epoch": 0.8714835053632868, + "grad_norm": 0.3063866198062897, + "learning_rate": 0.00012020070164740582, + "loss": 0.2473, + "step": 4306 + }, + { + "epoch": 0.8716858935438171, + "grad_norm": 0.305171936750412, + "learning_rate": 0.00012016954920547661, + "loss": 0.2506, + "step": 4307 + }, + { + "epoch": 0.8718882817243473, + "grad_norm": 0.28420180082321167, + "learning_rate": 0.00012013839472300406, + "loss": 0.2275, + "step": 4308 + }, + { + "epoch": 0.8720906699048776, + "grad_norm": 0.32442429661750793, + "learning_rate": 0.00012010723820314, + "loss": 0.2362, + "step": 4309 + }, + { + "epoch": 0.8722930580854078, + "grad_norm": 0.3036497235298157, + "learning_rate": 0.0001200760796490366, + "loss": 0.2795, + "step": 4310 + }, + { + "epoch": 0.8724954462659381, + "grad_norm": 0.2749023735523224, + "learning_rate": 0.0001200449190638461, + "loss": 0.2314, + "step": 4311 + }, + { + "epoch": 0.8726978344464683, + "grad_norm": 0.3086685836315155, + "learning_rate": 0.00012001375645072104, + "loss": 0.2589, + "step": 4312 + }, + { + "epoch": 0.8729002226269986, + "grad_norm": 0.27015221118927, + "learning_rate": 0.00011998259181281408, + "loss": 0.2445, + "step": 4313 + }, + { + "epoch": 0.8731026108075288, + "grad_norm": 0.23451970517635345, + "learning_rate": 0.00011995142515327815, + "loss": 0.1892, + "step": 4314 + }, + { + "epoch": 0.8733049989880591, + "grad_norm": 0.3884549140930176, + "learning_rate": 0.00011992025647526639, + "loss": 0.2129, + "step": 4315 + }, + { + "epoch": 0.8735073871685893, + "grad_norm": 0.3305327594280243, + "learning_rate": 0.0001198890857819321, + "loss": 0.2281, + "step": 4316 + }, + { + "epoch": 0.8737097753491196, + "grad_norm": 0.26919251680374146, + "learning_rate": 0.0001198579130764288, + "loss": 0.269, + "step": 4317 + }, + { + "epoch": 0.8739121635296498, + "grad_norm": 0.24107398092746735, + "learning_rate": 0.00011982673836191023, + "loss": 0.2294, + "step": 4318 + }, + { + "epoch": 0.8741145517101802, + "grad_norm": 0.30871227383613586, + "learning_rate": 0.0001197955616415303, + "loss": 0.2511, + "step": 4319 + }, + { + "epoch": 0.8743169398907104, + "grad_norm": 0.2817114293575287, + "learning_rate": 0.00011976438291844316, + "loss": 0.2449, + "step": 4320 + }, + { + "epoch": 0.8745193280712407, + "grad_norm": 0.2848239839076996, + "learning_rate": 0.00011973320219580312, + "loss": 0.2531, + "step": 4321 + }, + { + "epoch": 0.8747217162517709, + "grad_norm": 0.2998313009738922, + "learning_rate": 0.00011970201947676478, + "loss": 0.2148, + "step": 4322 + }, + { + "epoch": 0.8749241044323012, + "grad_norm": 0.24333159625530243, + "learning_rate": 0.00011967083476448282, + "loss": 0.1948, + "step": 4323 + }, + { + "epoch": 0.8751264926128314, + "grad_norm": 0.2738083600997925, + "learning_rate": 0.0001196396480621122, + "loss": 0.2395, + "step": 4324 + }, + { + "epoch": 0.8753288807933617, + "grad_norm": 0.23754236102104187, + "learning_rate": 0.00011960845937280807, + "loss": 0.2232, + "step": 4325 + }, + { + "epoch": 0.8755312689738919, + "grad_norm": 0.2790989279747009, + "learning_rate": 0.00011957726869972577, + "loss": 0.2207, + "step": 4326 + }, + { + "epoch": 0.8757336571544222, + "grad_norm": 0.32336124777793884, + "learning_rate": 0.00011954607604602084, + "loss": 0.2323, + "step": 4327 + }, + { + "epoch": 0.8759360453349524, + "grad_norm": 0.27224284410476685, + "learning_rate": 0.00011951488141484903, + "loss": 0.2312, + "step": 4328 + }, + { + "epoch": 0.8761384335154827, + "grad_norm": 0.2953594923019409, + "learning_rate": 0.00011948368480936631, + "loss": 0.2485, + "step": 4329 + }, + { + "epoch": 0.8763408216960129, + "grad_norm": 0.281449556350708, + "learning_rate": 0.00011945248623272878, + "loss": 0.2772, + "step": 4330 + }, + { + "epoch": 0.8765432098765432, + "grad_norm": 0.2558891475200653, + "learning_rate": 0.0001194212856880928, + "loss": 0.2237, + "step": 4331 + }, + { + "epoch": 0.8767455980570734, + "grad_norm": 0.36965882778167725, + "learning_rate": 0.00011939008317861494, + "loss": 0.2371, + "step": 4332 + }, + { + "epoch": 0.8769479862376037, + "grad_norm": 0.26143166422843933, + "learning_rate": 0.0001193588787074519, + "loss": 0.2301, + "step": 4333 + }, + { + "epoch": 0.8771503744181339, + "grad_norm": 0.31695854663848877, + "learning_rate": 0.00011932767227776065, + "loss": 0.2345, + "step": 4334 + }, + { + "epoch": 0.8773527625986642, + "grad_norm": 0.2816372215747833, + "learning_rate": 0.00011929646389269833, + "loss": 0.2551, + "step": 4335 + }, + { + "epoch": 0.8775551507791945, + "grad_norm": 0.25129345059394836, + "learning_rate": 0.00011926525355542227, + "loss": 0.2437, + "step": 4336 + }, + { + "epoch": 0.8777575389597247, + "grad_norm": 0.23418568074703217, + "learning_rate": 0.00011923404126909, + "loss": 0.2251, + "step": 4337 + }, + { + "epoch": 0.8779599271402551, + "grad_norm": 0.3892250061035156, + "learning_rate": 0.00011920282703685923, + "loss": 0.2311, + "step": 4338 + }, + { + "epoch": 0.8781623153207853, + "grad_norm": 0.3089623749256134, + "learning_rate": 0.00011917161086188793, + "loss": 0.2332, + "step": 4339 + }, + { + "epoch": 0.8783647035013156, + "grad_norm": 0.3821837604045868, + "learning_rate": 0.00011914039274733422, + "loss": 0.2771, + "step": 4340 + }, + { + "epoch": 0.8785670916818458, + "grad_norm": 0.2563057243824005, + "learning_rate": 0.0001191091726963564, + "loss": 0.2575, + "step": 4341 + }, + { + "epoch": 0.8787694798623761, + "grad_norm": 0.2678040862083435, + "learning_rate": 0.00011907795071211298, + "loss": 0.224, + "step": 4342 + }, + { + "epoch": 0.8789718680429063, + "grad_norm": 0.34176933765411377, + "learning_rate": 0.00011904672679776272, + "loss": 0.2652, + "step": 4343 + }, + { + "epoch": 0.8791742562234366, + "grad_norm": 0.304793119430542, + "learning_rate": 0.00011901550095646447, + "loss": 0.2023, + "step": 4344 + }, + { + "epoch": 0.8793766444039668, + "grad_norm": 0.266438364982605, + "learning_rate": 0.0001189842731913774, + "loss": 0.2156, + "step": 4345 + }, + { + "epoch": 0.8795790325844971, + "grad_norm": 0.2922779321670532, + "learning_rate": 0.00011895304350566073, + "loss": 0.2285, + "step": 4346 + }, + { + "epoch": 0.8797814207650273, + "grad_norm": 0.27575254440307617, + "learning_rate": 0.000118921811902474, + "loss": 0.2364, + "step": 4347 + }, + { + "epoch": 0.8799838089455576, + "grad_norm": 0.2569499909877777, + "learning_rate": 0.0001188905783849769, + "loss": 0.2504, + "step": 4348 + }, + { + "epoch": 0.8801861971260878, + "grad_norm": 0.387317419052124, + "learning_rate": 0.00011885934295632928, + "loss": 0.2598, + "step": 4349 + }, + { + "epoch": 0.8803885853066181, + "grad_norm": 0.3084860146045685, + "learning_rate": 0.00011882810561969124, + "loss": 0.2644, + "step": 4350 + }, + { + "epoch": 0.8803885853066181, + "eval_loss": 0.27138298749923706, + "eval_runtime": 0.7407, + "eval_samples_per_second": 6.751, + "eval_steps_per_second": 1.35, + "step": 4350 + }, + { + "epoch": 0.8805909734871483, + "grad_norm": 0.2924489676952362, + "learning_rate": 0.00011879686637822305, + "loss": 0.2464, + "step": 4351 + }, + { + "epoch": 0.8807933616676786, + "grad_norm": 0.3202839195728302, + "learning_rate": 0.00011876562523508512, + "loss": 0.2118, + "step": 4352 + }, + { + "epoch": 0.8809957498482088, + "grad_norm": 0.32835039496421814, + "learning_rate": 0.00011873438219343816, + "loss": 0.2227, + "step": 4353 + }, + { + "epoch": 0.8811981380287391, + "grad_norm": 0.2956985533237457, + "learning_rate": 0.00011870313725644295, + "loss": 0.2616, + "step": 4354 + }, + { + "epoch": 0.8814005262092693, + "grad_norm": 0.2905611991882324, + "learning_rate": 0.00011867189042726059, + "loss": 0.2207, + "step": 4355 + }, + { + "epoch": 0.8816029143897997, + "grad_norm": 0.29978570342063904, + "learning_rate": 0.00011864064170905229, + "loss": 0.2191, + "step": 4356 + }, + { + "epoch": 0.8818053025703299, + "grad_norm": 0.2473001331090927, + "learning_rate": 0.00011860939110497945, + "loss": 0.2369, + "step": 4357 + }, + { + "epoch": 0.8820076907508602, + "grad_norm": 0.3887574076652527, + "learning_rate": 0.00011857813861820366, + "loss": 0.2523, + "step": 4358 + }, + { + "epoch": 0.8822100789313904, + "grad_norm": 0.32068565487861633, + "learning_rate": 0.00011854688425188673, + "loss": 0.2293, + "step": 4359 + }, + { + "epoch": 0.8824124671119207, + "grad_norm": 0.31508004665374756, + "learning_rate": 0.00011851562800919071, + "loss": 0.2661, + "step": 4360 + }, + { + "epoch": 0.8826148552924509, + "grad_norm": 0.2505917549133301, + "learning_rate": 0.0001184843698932777, + "loss": 0.2282, + "step": 4361 + }, + { + "epoch": 0.8828172434729812, + "grad_norm": 0.38696616888046265, + "learning_rate": 0.00011845310990731014, + "loss": 0.2559, + "step": 4362 + }, + { + "epoch": 0.8830196316535114, + "grad_norm": 0.25132423639297485, + "learning_rate": 0.00011842184805445051, + "loss": 0.2044, + "step": 4363 + }, + { + "epoch": 0.8832220198340417, + "grad_norm": 0.3269764482975006, + "learning_rate": 0.00011839058433786158, + "loss": 0.2734, + "step": 4364 + }, + { + "epoch": 0.8834244080145719, + "grad_norm": 0.32102179527282715, + "learning_rate": 0.00011835931876070632, + "loss": 0.2147, + "step": 4365 + }, + { + "epoch": 0.8836267961951022, + "grad_norm": 0.2468416839838028, + "learning_rate": 0.00011832805132614781, + "loss": 0.2289, + "step": 4366 + }, + { + "epoch": 0.8838291843756325, + "grad_norm": 0.24596892297267914, + "learning_rate": 0.00011829678203734937, + "loss": 0.2056, + "step": 4367 + }, + { + "epoch": 0.8840315725561627, + "grad_norm": 0.2805595397949219, + "learning_rate": 0.00011826551089747455, + "loss": 0.2155, + "step": 4368 + }, + { + "epoch": 0.884233960736693, + "grad_norm": 0.345514714717865, + "learning_rate": 0.00011823423790968698, + "loss": 0.2542, + "step": 4369 + }, + { + "epoch": 0.8844363489172232, + "grad_norm": 0.25061899423599243, + "learning_rate": 0.00011820296307715053, + "loss": 0.1959, + "step": 4370 + }, + { + "epoch": 0.8846387370977535, + "grad_norm": 0.30302125215530396, + "learning_rate": 0.0001181716864030293, + "loss": 0.196, + "step": 4371 + }, + { + "epoch": 0.8848411252782837, + "grad_norm": 0.2581700086593628, + "learning_rate": 0.0001181404078904875, + "loss": 0.2058, + "step": 4372 + }, + { + "epoch": 0.885043513458814, + "grad_norm": 0.2704077959060669, + "learning_rate": 0.00011810912754268962, + "loss": 0.2, + "step": 4373 + }, + { + "epoch": 0.8852459016393442, + "grad_norm": 0.22900304198265076, + "learning_rate": 0.00011807784536280018, + "loss": 0.2031, + "step": 4374 + }, + { + "epoch": 0.8854482898198746, + "grad_norm": 0.25832971930503845, + "learning_rate": 0.00011804656135398404, + "loss": 0.2102, + "step": 4375 + }, + { + "epoch": 0.8856506780004048, + "grad_norm": 0.31184902787208557, + "learning_rate": 0.00011801527551940619, + "loss": 0.2341, + "step": 4376 + }, + { + "epoch": 0.8858530661809351, + "grad_norm": 0.30117902159690857, + "learning_rate": 0.00011798398786223179, + "loss": 0.2479, + "step": 4377 + }, + { + "epoch": 0.8860554543614653, + "grad_norm": 0.3672550618648529, + "learning_rate": 0.00011795269838562621, + "loss": 0.2278, + "step": 4378 + }, + { + "epoch": 0.8862578425419956, + "grad_norm": 0.2866049110889435, + "learning_rate": 0.00011792140709275498, + "loss": 0.1992, + "step": 4379 + }, + { + "epoch": 0.8864602307225258, + "grad_norm": 0.31390565633773804, + "learning_rate": 0.00011789011398678385, + "loss": 0.2485, + "step": 4380 + }, + { + "epoch": 0.8866626189030561, + "grad_norm": 0.3039097189903259, + "learning_rate": 0.00011785881907087866, + "loss": 0.2522, + "step": 4381 + }, + { + "epoch": 0.8868650070835863, + "grad_norm": 0.2775220572948456, + "learning_rate": 0.00011782752234820558, + "loss": 0.2482, + "step": 4382 + }, + { + "epoch": 0.8870673952641166, + "grad_norm": 0.2754577100276947, + "learning_rate": 0.00011779622382193083, + "loss": 0.2364, + "step": 4383 + }, + { + "epoch": 0.8872697834446468, + "grad_norm": 0.38729971647262573, + "learning_rate": 0.00011776492349522092, + "loss": 0.2182, + "step": 4384 + }, + { + "epoch": 0.8874721716251771, + "grad_norm": 0.3540734350681305, + "learning_rate": 0.00011773362137124244, + "loss": 0.2203, + "step": 4385 + }, + { + "epoch": 0.8876745598057073, + "grad_norm": 0.2709711492061615, + "learning_rate": 0.00011770231745316222, + "loss": 0.2158, + "step": 4386 + }, + { + "epoch": 0.8878769479862376, + "grad_norm": 0.27588939666748047, + "learning_rate": 0.00011767101174414727, + "loss": 0.1985, + "step": 4387 + }, + { + "epoch": 0.8880793361667678, + "grad_norm": 0.27467525005340576, + "learning_rate": 0.00011763970424736477, + "loss": 0.2165, + "step": 4388 + }, + { + "epoch": 0.8882817243472981, + "grad_norm": 0.32186320424079895, + "learning_rate": 0.00011760839496598208, + "loss": 0.2277, + "step": 4389 + }, + { + "epoch": 0.8884841125278283, + "grad_norm": 0.33760732412338257, + "learning_rate": 0.00011757708390316678, + "loss": 0.2447, + "step": 4390 + }, + { + "epoch": 0.8886865007083586, + "grad_norm": 0.27130693197250366, + "learning_rate": 0.00011754577106208654, + "loss": 0.2472, + "step": 4391 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.2727298438549042, + "learning_rate": 0.00011751445644590928, + "loss": 0.238, + "step": 4392 + }, + { + "epoch": 0.8890912770694192, + "grad_norm": 0.33025866746902466, + "learning_rate": 0.0001174831400578031, + "loss": 0.2187, + "step": 4393 + }, + { + "epoch": 0.8892936652499493, + "grad_norm": 0.2998366057872772, + "learning_rate": 0.00011745182190093626, + "loss": 0.2469, + "step": 4394 + }, + { + "epoch": 0.8894960534304797, + "grad_norm": 0.27042579650878906, + "learning_rate": 0.0001174205019784772, + "loss": 0.209, + "step": 4395 + }, + { + "epoch": 0.88969844161101, + "grad_norm": 0.2757139205932617, + "learning_rate": 0.00011738918029359453, + "loss": 0.2332, + "step": 4396 + }, + { + "epoch": 0.8899008297915402, + "grad_norm": 0.2801726460456848, + "learning_rate": 0.00011735785684945708, + "loss": 0.2433, + "step": 4397 + }, + { + "epoch": 0.8901032179720705, + "grad_norm": 0.2413640022277832, + "learning_rate": 0.00011732653164923381, + "loss": 0.2236, + "step": 4398 + }, + { + "epoch": 0.8903056061526007, + "grad_norm": 0.37460601329803467, + "learning_rate": 0.00011729520469609388, + "loss": 0.2886, + "step": 4399 + }, + { + "epoch": 0.890507994333131, + "grad_norm": 0.2749800384044647, + "learning_rate": 0.00011726387599320658, + "loss": 0.2384, + "step": 4400 + }, + { + "epoch": 0.890507994333131, + "eval_loss": 0.2683194875717163, + "eval_runtime": 0.7387, + "eval_samples_per_second": 6.769, + "eval_steps_per_second": 1.354, + "step": 4400 + }, + { + "epoch": 0.8907103825136612, + "grad_norm": 0.3313538134098053, + "learning_rate": 0.00011723254554374148, + "loss": 0.2359, + "step": 4401 + }, + { + "epoch": 0.8909127706941915, + "grad_norm": 0.2636862099170685, + "learning_rate": 0.00011720121335086824, + "loss": 0.2458, + "step": 4402 + }, + { + "epoch": 0.8911151588747217, + "grad_norm": 0.3257393538951874, + "learning_rate": 0.0001171698794177567, + "loss": 0.2657, + "step": 4403 + }, + { + "epoch": 0.891317547055252, + "grad_norm": 0.2799130976200104, + "learning_rate": 0.00011713854374757696, + "loss": 0.2262, + "step": 4404 + }, + { + "epoch": 0.8915199352357822, + "grad_norm": 0.2589074671268463, + "learning_rate": 0.00011710720634349916, + "loss": 0.2034, + "step": 4405 + }, + { + "epoch": 0.8917223234163125, + "grad_norm": 0.27647581696510315, + "learning_rate": 0.00011707586720869374, + "loss": 0.2543, + "step": 4406 + }, + { + "epoch": 0.8919247115968427, + "grad_norm": 0.3513265550136566, + "learning_rate": 0.00011704452634633129, + "loss": 0.2222, + "step": 4407 + }, + { + "epoch": 0.892127099777373, + "grad_norm": 0.2585983872413635, + "learning_rate": 0.00011701318375958247, + "loss": 0.2068, + "step": 4408 + }, + { + "epoch": 0.8923294879579032, + "grad_norm": 0.3059662878513336, + "learning_rate": 0.00011698183945161824, + "loss": 0.2385, + "step": 4409 + }, + { + "epoch": 0.8925318761384335, + "grad_norm": 0.2765025198459625, + "learning_rate": 0.00011695049342560968, + "loss": 0.2115, + "step": 4410 + }, + { + "epoch": 0.8927342643189637, + "grad_norm": 0.3690018653869629, + "learning_rate": 0.00011691914568472806, + "loss": 0.2162, + "step": 4411 + }, + { + "epoch": 0.892936652499494, + "grad_norm": 0.3051934242248535, + "learning_rate": 0.00011688779623214481, + "loss": 0.2866, + "step": 4412 + }, + { + "epoch": 0.8931390406800243, + "grad_norm": 0.43420571088790894, + "learning_rate": 0.00011685644507103152, + "loss": 0.2625, + "step": 4413 + }, + { + "epoch": 0.8933414288605546, + "grad_norm": 0.25232359766960144, + "learning_rate": 0.00011682509220456002, + "loss": 0.2307, + "step": 4414 + }, + { + "epoch": 0.8935438170410848, + "grad_norm": 0.2460232675075531, + "learning_rate": 0.00011679373763590222, + "loss": 0.172, + "step": 4415 + }, + { + "epoch": 0.8937462052216151, + "grad_norm": 0.3492420017719269, + "learning_rate": 0.00011676238136823025, + "loss": 0.213, + "step": 4416 + }, + { + "epoch": 0.8939485934021453, + "grad_norm": 0.36887046694755554, + "learning_rate": 0.00011673102340471644, + "loss": 0.2217, + "step": 4417 + }, + { + "epoch": 0.8941509815826756, + "grad_norm": 0.2646304666996002, + "learning_rate": 0.00011669966374853323, + "loss": 0.2468, + "step": 4418 + }, + { + "epoch": 0.8943533697632058, + "grad_norm": 0.33006051182746887, + "learning_rate": 0.00011666830240285328, + "loss": 0.2327, + "step": 4419 + }, + { + "epoch": 0.8945557579437361, + "grad_norm": 0.39953455328941345, + "learning_rate": 0.00011663693937084936, + "loss": 0.2549, + "step": 4420 + }, + { + "epoch": 0.8947581461242663, + "grad_norm": 0.3088074326515198, + "learning_rate": 0.0001166055746556945, + "loss": 0.2437, + "step": 4421 + }, + { + "epoch": 0.8949605343047966, + "grad_norm": 0.5596060156822205, + "learning_rate": 0.00011657420826056184, + "loss": 0.2091, + "step": 4422 + }, + { + "epoch": 0.8951629224853268, + "grad_norm": 0.4558367431163788, + "learning_rate": 0.00011654284018862471, + "loss": 0.2282, + "step": 4423 + }, + { + "epoch": 0.8953653106658571, + "grad_norm": 0.3053852617740631, + "learning_rate": 0.00011651147044305656, + "loss": 0.2614, + "step": 4424 + }, + { + "epoch": 0.8955676988463873, + "grad_norm": 0.2652442157268524, + "learning_rate": 0.00011648009902703112, + "loss": 0.2064, + "step": 4425 + }, + { + "epoch": 0.8957700870269176, + "grad_norm": 0.31589120626449585, + "learning_rate": 0.00011644872594372218, + "loss": 0.2637, + "step": 4426 + }, + { + "epoch": 0.8959724752074479, + "grad_norm": 0.25115832686424255, + "learning_rate": 0.00011641735119630372, + "loss": 0.218, + "step": 4427 + }, + { + "epoch": 0.8961748633879781, + "grad_norm": 0.3270891010761261, + "learning_rate": 0.00011638597478794995, + "loss": 0.2325, + "step": 4428 + }, + { + "epoch": 0.8963772515685084, + "grad_norm": 0.23799145221710205, + "learning_rate": 0.0001163545967218352, + "loss": 0.1974, + "step": 4429 + }, + { + "epoch": 0.8965796397490386, + "grad_norm": 0.3497200906276703, + "learning_rate": 0.00011632321700113393, + "loss": 0.2611, + "step": 4430 + }, + { + "epoch": 0.896782027929569, + "grad_norm": 0.24682220816612244, + "learning_rate": 0.00011629183562902087, + "loss": 0.254, + "step": 4431 + }, + { + "epoch": 0.8969844161100992, + "grad_norm": 0.28864797949790955, + "learning_rate": 0.0001162604526086708, + "loss": 0.2637, + "step": 4432 + }, + { + "epoch": 0.8971868042906295, + "grad_norm": 0.39302846789360046, + "learning_rate": 0.00011622906794325877, + "loss": 0.2285, + "step": 4433 + }, + { + "epoch": 0.8973891924711597, + "grad_norm": 0.4144213795661926, + "learning_rate": 0.00011619768163595991, + "loss": 0.2094, + "step": 4434 + }, + { + "epoch": 0.89759158065169, + "grad_norm": 0.3300606906414032, + "learning_rate": 0.00011616629368994962, + "loss": 0.2707, + "step": 4435 + }, + { + "epoch": 0.8977939688322202, + "grad_norm": 0.26967158913612366, + "learning_rate": 0.00011613490410840335, + "loss": 0.2392, + "step": 4436 + }, + { + "epoch": 0.8979963570127505, + "grad_norm": 0.3630208671092987, + "learning_rate": 0.0001161035128944968, + "loss": 0.2342, + "step": 4437 + }, + { + "epoch": 0.8981987451932807, + "grad_norm": 0.29514622688293457, + "learning_rate": 0.00011607212005140576, + "loss": 0.2395, + "step": 4438 + }, + { + "epoch": 0.898401133373811, + "grad_norm": 0.34164735674858093, + "learning_rate": 0.00011604072558230625, + "loss": 0.2391, + "step": 4439 + }, + { + "epoch": 0.8986035215543412, + "grad_norm": 0.2891792058944702, + "learning_rate": 0.00011600932949037449, + "loss": 0.2289, + "step": 4440 + }, + { + "epoch": 0.8988059097348715, + "grad_norm": 0.2580989599227905, + "learning_rate": 0.00011597793177878671, + "loss": 0.2443, + "step": 4441 + }, + { + "epoch": 0.8990082979154017, + "grad_norm": 0.31715089082717896, + "learning_rate": 0.00011594653245071946, + "loss": 0.2453, + "step": 4442 + }, + { + "epoch": 0.899210686095932, + "grad_norm": 0.3037600517272949, + "learning_rate": 0.00011591513150934937, + "loss": 0.2557, + "step": 4443 + }, + { + "epoch": 0.8994130742764622, + "grad_norm": 0.2914448380470276, + "learning_rate": 0.00011588372895785328, + "loss": 0.2609, + "step": 4444 + }, + { + "epoch": 0.8996154624569925, + "grad_norm": 0.3002516031265259, + "learning_rate": 0.00011585232479940815, + "loss": 0.2394, + "step": 4445 + }, + { + "epoch": 0.8998178506375227, + "grad_norm": 0.33220770955085754, + "learning_rate": 0.00011582091903719114, + "loss": 0.2564, + "step": 4446 + }, + { + "epoch": 0.900020238818053, + "grad_norm": 0.26941895484924316, + "learning_rate": 0.00011578951167437957, + "loss": 0.2413, + "step": 4447 + }, + { + "epoch": 0.9002226269985832, + "grad_norm": 0.3035530745983124, + "learning_rate": 0.00011575810271415086, + "loss": 0.2415, + "step": 4448 + }, + { + "epoch": 0.9004250151791136, + "grad_norm": 0.30377620458602905, + "learning_rate": 0.00011572669215968269, + "loss": 0.2435, + "step": 4449 + }, + { + "epoch": 0.9006274033596438, + "grad_norm": 0.36584311723709106, + "learning_rate": 0.0001156952800141528, + "loss": 0.2269, + "step": 4450 + }, + { + "epoch": 0.9006274033596438, + "eval_loss": 0.2701588571071625, + "eval_runtime": 0.7366, + "eval_samples_per_second": 6.788, + "eval_steps_per_second": 1.358, + "step": 4450 + }, + { + "epoch": 0.9008297915401741, + "grad_norm": 0.2716978192329407, + "learning_rate": 0.0001156638662807392, + "loss": 0.2314, + "step": 4451 + }, + { + "epoch": 0.9010321797207043, + "grad_norm": 0.2875329852104187, + "learning_rate": 0.00011563245096261994, + "loss": 0.2171, + "step": 4452 + }, + { + "epoch": 0.9012345679012346, + "grad_norm": 0.2926979959011078, + "learning_rate": 0.00011560103406297331, + "loss": 0.2255, + "step": 4453 + }, + { + "epoch": 0.9014369560817648, + "grad_norm": 0.2483060508966446, + "learning_rate": 0.00011556961558497779, + "loss": 0.2231, + "step": 4454 + }, + { + "epoch": 0.9016393442622951, + "grad_norm": 0.3437354266643524, + "learning_rate": 0.00011553819553181191, + "loss": 0.2311, + "step": 4455 + }, + { + "epoch": 0.9018417324428254, + "grad_norm": 0.28860417008399963, + "learning_rate": 0.00011550677390665445, + "loss": 0.2432, + "step": 4456 + }, + { + "epoch": 0.9020441206233556, + "grad_norm": 0.2693041265010834, + "learning_rate": 0.00011547535071268432, + "loss": 0.2399, + "step": 4457 + }, + { + "epoch": 0.9022465088038859, + "grad_norm": 0.2697674632072449, + "learning_rate": 0.00011544392595308058, + "loss": 0.2263, + "step": 4458 + }, + { + "epoch": 0.9024488969844161, + "grad_norm": 0.31832581758499146, + "learning_rate": 0.00011541249963102245, + "loss": 0.2526, + "step": 4459 + }, + { + "epoch": 0.9026512851649464, + "grad_norm": 0.3540724515914917, + "learning_rate": 0.00011538107174968935, + "loss": 0.2643, + "step": 4460 + }, + { + "epoch": 0.9028536733454766, + "grad_norm": 0.3032924234867096, + "learning_rate": 0.00011534964231226082, + "loss": 0.2556, + "step": 4461 + }, + { + "epoch": 0.9030560615260069, + "grad_norm": 0.30924227833747864, + "learning_rate": 0.00011531821132191653, + "loss": 0.2331, + "step": 4462 + }, + { + "epoch": 0.9032584497065371, + "grad_norm": 0.2958310544490814, + "learning_rate": 0.00011528677878183634, + "loss": 0.2366, + "step": 4463 + }, + { + "epoch": 0.9034608378870674, + "grad_norm": 0.3167951703071594, + "learning_rate": 0.00011525534469520027, + "loss": 0.197, + "step": 4464 + }, + { + "epoch": 0.9036632260675976, + "grad_norm": 0.28465691208839417, + "learning_rate": 0.00011522390906518851, + "loss": 0.2322, + "step": 4465 + }, + { + "epoch": 0.903865614248128, + "grad_norm": 0.3153257966041565, + "learning_rate": 0.00011519247189498137, + "loss": 0.2502, + "step": 4466 + }, + { + "epoch": 0.9040680024286581, + "grad_norm": 0.31020671129226685, + "learning_rate": 0.00011516103318775932, + "loss": 0.269, + "step": 4467 + }, + { + "epoch": 0.9042703906091885, + "grad_norm": 0.29223933815956116, + "learning_rate": 0.00011512959294670305, + "loss": 0.2317, + "step": 4468 + }, + { + "epoch": 0.9044727787897187, + "grad_norm": 0.2880360186100006, + "learning_rate": 0.0001150981511749933, + "loss": 0.223, + "step": 4469 + }, + { + "epoch": 0.904675166970249, + "grad_norm": 0.3483765423297882, + "learning_rate": 0.00011506670787581101, + "loss": 0.218, + "step": 4470 + }, + { + "epoch": 0.9048775551507792, + "grad_norm": 0.30725324153900146, + "learning_rate": 0.00011503526305233734, + "loss": 0.2474, + "step": 4471 + }, + { + "epoch": 0.9050799433313095, + "grad_norm": 0.3243599236011505, + "learning_rate": 0.0001150038167077535, + "loss": 0.2513, + "step": 4472 + }, + { + "epoch": 0.9052823315118397, + "grad_norm": 0.3006613254547119, + "learning_rate": 0.00011497236884524094, + "loss": 0.2502, + "step": 4473 + }, + { + "epoch": 0.90548471969237, + "grad_norm": 0.26012781262397766, + "learning_rate": 0.0001149409194679812, + "loss": 0.2439, + "step": 4474 + }, + { + "epoch": 0.9056871078729002, + "grad_norm": 0.36949893832206726, + "learning_rate": 0.000114909468579156, + "loss": 0.2073, + "step": 4475 + }, + { + "epoch": 0.9058894960534305, + "grad_norm": 0.25878971815109253, + "learning_rate": 0.0001148780161819472, + "loss": 0.1932, + "step": 4476 + }, + { + "epoch": 0.9060918842339607, + "grad_norm": 0.2846607267856598, + "learning_rate": 0.00011484656227953685, + "loss": 0.2044, + "step": 4477 + }, + { + "epoch": 0.906294272414491, + "grad_norm": 0.3530745506286621, + "learning_rate": 0.0001148151068751071, + "loss": 0.2143, + "step": 4478 + }, + { + "epoch": 0.9064966605950212, + "grad_norm": 0.2996197044849396, + "learning_rate": 0.0001147836499718403, + "loss": 0.242, + "step": 4479 + }, + { + "epoch": 0.9066990487755515, + "grad_norm": 0.341861367225647, + "learning_rate": 0.00011475219157291892, + "loss": 0.2284, + "step": 4480 + }, + { + "epoch": 0.9069014369560817, + "grad_norm": 0.29489073157310486, + "learning_rate": 0.00011472073168152557, + "loss": 0.2442, + "step": 4481 + }, + { + "epoch": 0.907103825136612, + "grad_norm": 0.24275663495063782, + "learning_rate": 0.00011468927030084307, + "loss": 0.2096, + "step": 4482 + }, + { + "epoch": 0.9073062133171422, + "grad_norm": 0.3213668763637543, + "learning_rate": 0.00011465780743405432, + "loss": 0.2298, + "step": 4483 + }, + { + "epoch": 0.9075086014976725, + "grad_norm": 0.2886224687099457, + "learning_rate": 0.00011462634308434245, + "loss": 0.2327, + "step": 4484 + }, + { + "epoch": 0.9077109896782027, + "grad_norm": 0.2937127649784088, + "learning_rate": 0.00011459487725489065, + "loss": 0.2579, + "step": 4485 + }, + { + "epoch": 0.907913377858733, + "grad_norm": 0.2543278932571411, + "learning_rate": 0.00011456340994888229, + "loss": 0.2356, + "step": 4486 + }, + { + "epoch": 0.9081157660392634, + "grad_norm": 0.25066015124320984, + "learning_rate": 0.00011453194116950093, + "loss": 0.2489, + "step": 4487 + }, + { + "epoch": 0.9083181542197936, + "grad_norm": 0.4157361388206482, + "learning_rate": 0.00011450047091993024, + "loss": 0.1968, + "step": 4488 + }, + { + "epoch": 0.9085205424003239, + "grad_norm": 0.27913933992385864, + "learning_rate": 0.00011446899920335405, + "loss": 0.258, + "step": 4489 + }, + { + "epoch": 0.9087229305808541, + "grad_norm": 0.33072513341903687, + "learning_rate": 0.00011443752602295634, + "loss": 0.2147, + "step": 4490 + }, + { + "epoch": 0.9089253187613844, + "grad_norm": 0.2571675777435303, + "learning_rate": 0.00011440605138192126, + "loss": 0.2203, + "step": 4491 + }, + { + "epoch": 0.9091277069419146, + "grad_norm": 0.22976812720298767, + "learning_rate": 0.00011437457528343305, + "loss": 0.2028, + "step": 4492 + }, + { + "epoch": 0.9093300951224449, + "grad_norm": 0.3034539520740509, + "learning_rate": 0.00011434309773067616, + "loss": 0.2498, + "step": 4493 + }, + { + "epoch": 0.9095324833029751, + "grad_norm": 0.32470160722732544, + "learning_rate": 0.00011431161872683512, + "loss": 0.2731, + "step": 4494 + }, + { + "epoch": 0.9097348714835054, + "grad_norm": 0.2735867500305176, + "learning_rate": 0.00011428013827509467, + "loss": 0.2614, + "step": 4495 + }, + { + "epoch": 0.9099372596640356, + "grad_norm": 0.37068819999694824, + "learning_rate": 0.00011424865637863967, + "loss": 0.2552, + "step": 4496 + }, + { + "epoch": 0.9101396478445659, + "grad_norm": 0.3341186046600342, + "learning_rate": 0.00011421717304065514, + "loss": 0.2616, + "step": 4497 + }, + { + "epoch": 0.9103420360250961, + "grad_norm": 0.253572553396225, + "learning_rate": 0.0001141856882643262, + "loss": 0.2391, + "step": 4498 + }, + { + "epoch": 0.9105444242056264, + "grad_norm": 0.2911362946033478, + "learning_rate": 0.00011415420205283818, + "loss": 0.2249, + "step": 4499 + }, + { + "epoch": 0.9107468123861566, + "grad_norm": 0.30564719438552856, + "learning_rate": 0.00011412271440937652, + "loss": 0.2529, + "step": 4500 + }, + { + "epoch": 0.9107468123861566, + "eval_loss": 0.27335047721862793, + "eval_runtime": 0.7406, + "eval_samples_per_second": 6.752, + "eval_steps_per_second": 1.35, + "step": 4500 + }, + { + "epoch": 0.9109492005666869, + "grad_norm": 0.3103221356868744, + "learning_rate": 0.0001140912253371268, + "loss": 0.2481, + "step": 4501 + }, + { + "epoch": 0.9111515887472171, + "grad_norm": 0.340580016374588, + "learning_rate": 0.00011405973483927474, + "loss": 0.2776, + "step": 4502 + }, + { + "epoch": 0.9113539769277474, + "grad_norm": 0.274093896150589, + "learning_rate": 0.00011402824291900627, + "loss": 0.2332, + "step": 4503 + }, + { + "epoch": 0.9115563651082776, + "grad_norm": 0.3204316794872284, + "learning_rate": 0.00011399674957950735, + "loss": 0.2167, + "step": 4504 + }, + { + "epoch": 0.911758753288808, + "grad_norm": 0.2747783660888672, + "learning_rate": 0.00011396525482396419, + "loss": 0.251, + "step": 4505 + }, + { + "epoch": 0.9119611414693382, + "grad_norm": 0.29710814356803894, + "learning_rate": 0.00011393375865556309, + "loss": 0.2347, + "step": 4506 + }, + { + "epoch": 0.9121635296498685, + "grad_norm": 0.3104104697704315, + "learning_rate": 0.00011390226107749049, + "loss": 0.2371, + "step": 4507 + }, + { + "epoch": 0.9123659178303987, + "grad_norm": 0.308168888092041, + "learning_rate": 0.000113870762092933, + "loss": 0.2713, + "step": 4508 + }, + { + "epoch": 0.912568306010929, + "grad_norm": 0.3603154718875885, + "learning_rate": 0.0001138392617050773, + "loss": 0.2434, + "step": 4509 + }, + { + "epoch": 0.9127706941914592, + "grad_norm": 0.31686174869537354, + "learning_rate": 0.00011380775991711035, + "loss": 0.2446, + "step": 4510 + }, + { + "epoch": 0.9129730823719895, + "grad_norm": 0.2970934510231018, + "learning_rate": 0.00011377625673221912, + "loss": 0.2634, + "step": 4511 + }, + { + "epoch": 0.9131754705525197, + "grad_norm": 0.36884772777557373, + "learning_rate": 0.0001137447521535908, + "loss": 0.2426, + "step": 4512 + }, + { + "epoch": 0.91337785873305, + "grad_norm": 0.2537902891635895, + "learning_rate": 0.00011371324618441269, + "loss": 0.2337, + "step": 4513 + }, + { + "epoch": 0.9135802469135802, + "grad_norm": 0.3862202763557434, + "learning_rate": 0.00011368173882787218, + "loss": 0.2466, + "step": 4514 + }, + { + "epoch": 0.9137826350941105, + "grad_norm": 0.2834358215332031, + "learning_rate": 0.00011365023008715691, + "loss": 0.2304, + "step": 4515 + }, + { + "epoch": 0.9139850232746407, + "grad_norm": 0.31917238235473633, + "learning_rate": 0.00011361871996545461, + "loss": 0.2698, + "step": 4516 + }, + { + "epoch": 0.914187411455171, + "grad_norm": 0.2584877610206604, + "learning_rate": 0.00011358720846595313, + "loss": 0.2378, + "step": 4517 + }, + { + "epoch": 0.9143897996357013, + "grad_norm": 0.29169657826423645, + "learning_rate": 0.00011355569559184047, + "loss": 0.2413, + "step": 4518 + }, + { + "epoch": 0.9145921878162315, + "grad_norm": 0.23529942333698273, + "learning_rate": 0.00011352418134630473, + "loss": 0.243, + "step": 4519 + }, + { + "epoch": 0.9147945759967618, + "grad_norm": 0.2689376175403595, + "learning_rate": 0.00011349266573253423, + "loss": 0.2198, + "step": 4520 + }, + { + "epoch": 0.914996964177292, + "grad_norm": 0.27592605352401733, + "learning_rate": 0.00011346114875371741, + "loss": 0.2286, + "step": 4521 + }, + { + "epoch": 0.9151993523578223, + "grad_norm": 0.3357422947883606, + "learning_rate": 0.0001134296304130428, + "loss": 0.2589, + "step": 4522 + }, + { + "epoch": 0.9154017405383525, + "grad_norm": 0.44812870025634766, + "learning_rate": 0.0001133981107136991, + "loss": 0.2527, + "step": 4523 + }, + { + "epoch": 0.9156041287188829, + "grad_norm": 0.2871791422367096, + "learning_rate": 0.00011336658965887514, + "loss": 0.2148, + "step": 4524 + }, + { + "epoch": 0.9158065168994131, + "grad_norm": 0.3729591369628906, + "learning_rate": 0.0001133350672517599, + "loss": 0.1677, + "step": 4525 + }, + { + "epoch": 0.9160089050799434, + "grad_norm": 0.41365382075309753, + "learning_rate": 0.00011330354349554249, + "loss": 0.2459, + "step": 4526 + }, + { + "epoch": 0.9162112932604736, + "grad_norm": 0.31599223613739014, + "learning_rate": 0.00011327201839341213, + "loss": 0.2652, + "step": 4527 + }, + { + "epoch": 0.9164136814410039, + "grad_norm": 0.2886614501476288, + "learning_rate": 0.00011324049194855819, + "loss": 0.225, + "step": 4528 + }, + { + "epoch": 0.9166160696215341, + "grad_norm": 0.28534746170043945, + "learning_rate": 0.00011320896416417026, + "loss": 0.2253, + "step": 4529 + }, + { + "epoch": 0.9168184578020644, + "grad_norm": 0.30376002192497253, + "learning_rate": 0.0001131774350434379, + "loss": 0.2323, + "step": 4530 + }, + { + "epoch": 0.9170208459825946, + "grad_norm": 0.42857232689857483, + "learning_rate": 0.00011314590458955092, + "loss": 0.238, + "step": 4531 + }, + { + "epoch": 0.9172232341631249, + "grad_norm": 0.2847810983657837, + "learning_rate": 0.00011311437280569925, + "loss": 0.2275, + "step": 4532 + }, + { + "epoch": 0.9174256223436551, + "grad_norm": 0.25568756461143494, + "learning_rate": 0.00011308283969507297, + "loss": 0.204, + "step": 4533 + }, + { + "epoch": 0.9176280105241854, + "grad_norm": 0.3276273012161255, + "learning_rate": 0.00011305130526086223, + "loss": 0.2499, + "step": 4534 + }, + { + "epoch": 0.9178303987047156, + "grad_norm": 0.3064262270927429, + "learning_rate": 0.00011301976950625739, + "loss": 0.2641, + "step": 4535 + }, + { + "epoch": 0.9180327868852459, + "grad_norm": 0.29613596200942993, + "learning_rate": 0.00011298823243444887, + "loss": 0.2236, + "step": 4536 + }, + { + "epoch": 0.9182351750657761, + "grad_norm": 0.30147165060043335, + "learning_rate": 0.00011295669404862728, + "loss": 0.2362, + "step": 4537 + }, + { + "epoch": 0.9184375632463064, + "grad_norm": 0.31944146752357483, + "learning_rate": 0.00011292515435198332, + "loss": 0.2462, + "step": 4538 + }, + { + "epoch": 0.9186399514268366, + "grad_norm": 0.5171250104904175, + "learning_rate": 0.0001128936133477079, + "loss": 0.1999, + "step": 4539 + }, + { + "epoch": 0.9188423396073669, + "grad_norm": 0.34059274196624756, + "learning_rate": 0.00011286207103899195, + "loss": 0.2169, + "step": 4540 + }, + { + "epoch": 0.9190447277878971, + "grad_norm": 0.2513200640678406, + "learning_rate": 0.00011283052742902664, + "loss": 0.2063, + "step": 4541 + }, + { + "epoch": 0.9192471159684275, + "grad_norm": 0.26981818675994873, + "learning_rate": 0.00011279898252100316, + "loss": 0.1965, + "step": 4542 + }, + { + "epoch": 0.9194495041489577, + "grad_norm": 0.3300010561943054, + "learning_rate": 0.00011276743631811295, + "loss": 0.2539, + "step": 4543 + }, + { + "epoch": 0.919651892329488, + "grad_norm": 0.30883342027664185, + "learning_rate": 0.00011273588882354749, + "loss": 0.1949, + "step": 4544 + }, + { + "epoch": 0.9198542805100182, + "grad_norm": 0.2752622067928314, + "learning_rate": 0.00011270434004049844, + "loss": 0.2371, + "step": 4545 + }, + { + "epoch": 0.9200566686905485, + "grad_norm": 0.24499693512916565, + "learning_rate": 0.00011267278997215756, + "loss": 0.21, + "step": 4546 + }, + { + "epoch": 0.9202590568710788, + "grad_norm": 0.2723928987979889, + "learning_rate": 0.00011264123862171675, + "loss": 0.2453, + "step": 4547 + }, + { + "epoch": 0.920461445051609, + "grad_norm": 0.276915043592453, + "learning_rate": 0.00011260968599236807, + "loss": 0.2224, + "step": 4548 + }, + { + "epoch": 0.9206638332321393, + "grad_norm": 0.2836087644100189, + "learning_rate": 0.00011257813208730368, + "loss": 0.2228, + "step": 4549 + }, + { + "epoch": 0.9208662214126695, + "grad_norm": 0.2985590100288391, + "learning_rate": 0.00011254657690971586, + "loss": 0.2548, + "step": 4550 + }, + { + "epoch": 0.9208662214126695, + "eval_loss": 0.2719789445400238, + "eval_runtime": 0.737, + "eval_samples_per_second": 6.784, + "eval_steps_per_second": 1.357, + "step": 4550 + }, + { + "epoch": 0.9210686095931998, + "grad_norm": 0.24112625420093536, + "learning_rate": 0.00011251502046279707, + "loss": 0.2018, + "step": 4551 + }, + { + "epoch": 0.92127099777373, + "grad_norm": 0.33927157521247864, + "learning_rate": 0.0001124834627497398, + "loss": 0.2299, + "step": 4552 + }, + { + "epoch": 0.9214733859542603, + "grad_norm": 0.252463161945343, + "learning_rate": 0.00011245190377373676, + "loss": 0.2099, + "step": 4553 + }, + { + "epoch": 0.9216757741347905, + "grad_norm": 0.3403078019618988, + "learning_rate": 0.00011242034353798075, + "loss": 0.2567, + "step": 4554 + }, + { + "epoch": 0.9218781623153208, + "grad_norm": 0.2527310848236084, + "learning_rate": 0.0001123887820456647, + "loss": 0.2071, + "step": 4555 + }, + { + "epoch": 0.922080550495851, + "grad_norm": 0.27652308344841003, + "learning_rate": 0.00011235721929998169, + "loss": 0.2249, + "step": 4556 + }, + { + "epoch": 0.9222829386763813, + "grad_norm": 0.29642534255981445, + "learning_rate": 0.0001123256553041249, + "loss": 0.2553, + "step": 4557 + }, + { + "epoch": 0.9224853268569115, + "grad_norm": 0.2769574820995331, + "learning_rate": 0.00011229409006128762, + "loss": 0.2369, + "step": 4558 + }, + { + "epoch": 0.9226877150374418, + "grad_norm": 0.27466651797294617, + "learning_rate": 0.00011226252357466331, + "loss": 0.2182, + "step": 4559 + }, + { + "epoch": 0.922890103217972, + "grad_norm": 0.27808988094329834, + "learning_rate": 0.00011223095584744553, + "loss": 0.2398, + "step": 4560 + }, + { + "epoch": 0.9230924913985024, + "grad_norm": 0.2918395400047302, + "learning_rate": 0.00011219938688282798, + "loss": 0.232, + "step": 4561 + }, + { + "epoch": 0.9232948795790326, + "grad_norm": 0.3108648657798767, + "learning_rate": 0.0001121678166840045, + "loss": 0.2455, + "step": 4562 + }, + { + "epoch": 0.9234972677595629, + "grad_norm": 0.27647170424461365, + "learning_rate": 0.000112136245254169, + "loss": 0.2309, + "step": 4563 + }, + { + "epoch": 0.9236996559400931, + "grad_norm": 0.25153157114982605, + "learning_rate": 0.00011210467259651552, + "loss": 0.2444, + "step": 4564 + }, + { + "epoch": 0.9239020441206234, + "grad_norm": 0.27187663316726685, + "learning_rate": 0.00011207309871423828, + "loss": 0.2296, + "step": 4565 + }, + { + "epoch": 0.9241044323011536, + "grad_norm": 0.3199034035205841, + "learning_rate": 0.0001120415236105316, + "loss": 0.2469, + "step": 4566 + }, + { + "epoch": 0.9243068204816839, + "grad_norm": 0.36292028427124023, + "learning_rate": 0.00011200994728858991, + "loss": 0.2094, + "step": 4567 + }, + { + "epoch": 0.9245092086622141, + "grad_norm": 0.30743375420570374, + "learning_rate": 0.00011197836975160778, + "loss": 0.2465, + "step": 4568 + }, + { + "epoch": 0.9247115968427444, + "grad_norm": 0.28878483176231384, + "learning_rate": 0.00011194679100277987, + "loss": 0.2228, + "step": 4569 + }, + { + "epoch": 0.9249139850232746, + "grad_norm": 0.25312918424606323, + "learning_rate": 0.00011191521104530103, + "loss": 0.2491, + "step": 4570 + }, + { + "epoch": 0.9251163732038049, + "grad_norm": 0.25213220715522766, + "learning_rate": 0.00011188362988236614, + "loss": 0.2279, + "step": 4571 + }, + { + "epoch": 0.9253187613843351, + "grad_norm": 0.26037511229515076, + "learning_rate": 0.00011185204751717029, + "loss": 0.2412, + "step": 4572 + }, + { + "epoch": 0.9255211495648654, + "grad_norm": 0.2789521813392639, + "learning_rate": 0.00011182046395290861, + "loss": 0.2533, + "step": 4573 + }, + { + "epoch": 0.9257235377453956, + "grad_norm": 0.3250572085380554, + "learning_rate": 0.00011178887919277642, + "loss": 0.2609, + "step": 4574 + }, + { + "epoch": 0.9259259259259259, + "grad_norm": 0.2678629755973816, + "learning_rate": 0.00011175729323996915, + "loss": 0.1953, + "step": 4575 + }, + { + "epoch": 0.9261283141064561, + "grad_norm": 0.3143162429332733, + "learning_rate": 0.00011172570609768231, + "loss": 0.2449, + "step": 4576 + }, + { + "epoch": 0.9263307022869864, + "grad_norm": 0.2648458778858185, + "learning_rate": 0.00011169411776911157, + "loss": 0.2152, + "step": 4577 + }, + { + "epoch": 0.9265330904675168, + "grad_norm": 0.32966744899749756, + "learning_rate": 0.00011166252825745269, + "loss": 0.2684, + "step": 4578 + }, + { + "epoch": 0.926735478648047, + "grad_norm": 0.26951298117637634, + "learning_rate": 0.00011163093756590157, + "loss": 0.2348, + "step": 4579 + }, + { + "epoch": 0.9269378668285773, + "grad_norm": 0.3233502507209778, + "learning_rate": 0.00011159934569765425, + "loss": 0.2827, + "step": 4580 + }, + { + "epoch": 0.9271402550091075, + "grad_norm": 0.35136401653289795, + "learning_rate": 0.00011156775265590682, + "loss": 0.2508, + "step": 4581 + }, + { + "epoch": 0.9273426431896378, + "grad_norm": 0.298715740442276, + "learning_rate": 0.00011153615844385557, + "loss": 0.2525, + "step": 4582 + }, + { + "epoch": 0.927545031370168, + "grad_norm": 0.314382404088974, + "learning_rate": 0.00011150456306469686, + "loss": 0.218, + "step": 4583 + }, + { + "epoch": 0.9277474195506983, + "grad_norm": 0.29797565937042236, + "learning_rate": 0.00011147296652162716, + "loss": 0.2336, + "step": 4584 + }, + { + "epoch": 0.9279498077312285, + "grad_norm": 0.26795753836631775, + "learning_rate": 0.00011144136881784311, + "loss": 0.2377, + "step": 4585 + }, + { + "epoch": 0.9281521959117588, + "grad_norm": 0.29854822158813477, + "learning_rate": 0.0001114097699565414, + "loss": 0.2337, + "step": 4586 + }, + { + "epoch": 0.928354584092289, + "grad_norm": 0.301384836435318, + "learning_rate": 0.0001113781699409189, + "loss": 0.2445, + "step": 4587 + }, + { + "epoch": 0.9285569722728193, + "grad_norm": 0.2761942446231842, + "learning_rate": 0.00011134656877417254, + "loss": 0.2555, + "step": 4588 + }, + { + "epoch": 0.9287593604533495, + "grad_norm": 0.283226877450943, + "learning_rate": 0.00011131496645949941, + "loss": 0.2316, + "step": 4589 + }, + { + "epoch": 0.9289617486338798, + "grad_norm": 0.3124370574951172, + "learning_rate": 0.00011128336300009672, + "loss": 0.2384, + "step": 4590 + }, + { + "epoch": 0.92916413681441, + "grad_norm": 0.29625093936920166, + "learning_rate": 0.00011125175839916173, + "loss": 0.227, + "step": 4591 + }, + { + "epoch": 0.9293665249949403, + "grad_norm": 0.28010037541389465, + "learning_rate": 0.0001112201526598919, + "loss": 0.2786, + "step": 4592 + }, + { + "epoch": 0.9295689131754705, + "grad_norm": 0.24734483659267426, + "learning_rate": 0.00011118854578548477, + "loss": 0.187, + "step": 4593 + }, + { + "epoch": 0.9297713013560008, + "grad_norm": 0.2376982867717743, + "learning_rate": 0.00011115693777913796, + "loss": 0.1929, + "step": 4594 + }, + { + "epoch": 0.929973689536531, + "grad_norm": 0.2752760648727417, + "learning_rate": 0.00011112532864404925, + "loss": 0.2548, + "step": 4595 + }, + { + "epoch": 0.9301760777170613, + "grad_norm": 0.2565372586250305, + "learning_rate": 0.00011109371838341653, + "loss": 0.246, + "step": 4596 + }, + { + "epoch": 0.9303784658975915, + "grad_norm": 0.2597411870956421, + "learning_rate": 0.0001110621070004378, + "loss": 0.2333, + "step": 4597 + }, + { + "epoch": 0.9305808540781219, + "grad_norm": 0.25775232911109924, + "learning_rate": 0.00011103049449831113, + "loss": 0.2496, + "step": 4598 + }, + { + "epoch": 0.930783242258652, + "grad_norm": 0.2996119558811188, + "learning_rate": 0.0001109988808802348, + "loss": 0.2434, + "step": 4599 + }, + { + "epoch": 0.9309856304391824, + "grad_norm": 0.3157181739807129, + "learning_rate": 0.00011096726614940709, + "loss": 0.2835, + "step": 4600 + }, + { + "epoch": 0.9309856304391824, + "eval_loss": 0.266468346118927, + "eval_runtime": 0.7401, + "eval_samples_per_second": 6.755, + "eval_steps_per_second": 1.351, + "step": 4600 + }, + { + "epoch": 0.9311880186197126, + "grad_norm": 0.2493111789226532, + "learning_rate": 0.00011093565030902648, + "loss": 0.1943, + "step": 4601 + }, + { + "epoch": 0.9313904068002429, + "grad_norm": 0.22633974254131317, + "learning_rate": 0.00011090403336229152, + "loss": 0.2091, + "step": 4602 + }, + { + "epoch": 0.9315927949807731, + "grad_norm": 0.27421262860298157, + "learning_rate": 0.00011087241531240086, + "loss": 0.1961, + "step": 4603 + }, + { + "epoch": 0.9317951831613034, + "grad_norm": 0.33170488476753235, + "learning_rate": 0.00011084079616255334, + "loss": 0.2481, + "step": 4604 + }, + { + "epoch": 0.9319975713418336, + "grad_norm": 0.29805314540863037, + "learning_rate": 0.0001108091759159478, + "loss": 0.2388, + "step": 4605 + }, + { + "epoch": 0.9321999595223639, + "grad_norm": 0.3398219645023346, + "learning_rate": 0.00011077755457578325, + "loss": 0.2388, + "step": 4606 + }, + { + "epoch": 0.9324023477028941, + "grad_norm": 0.27871426939964294, + "learning_rate": 0.00011074593214525883, + "loss": 0.2726, + "step": 4607 + }, + { + "epoch": 0.9326047358834244, + "grad_norm": 0.3019596040248871, + "learning_rate": 0.00011071430862757374, + "loss": 0.2641, + "step": 4608 + }, + { + "epoch": 0.9328071240639547, + "grad_norm": 0.2957676649093628, + "learning_rate": 0.00011068268402592733, + "loss": 0.2861, + "step": 4609 + }, + { + "epoch": 0.9330095122444849, + "grad_norm": 0.2980649471282959, + "learning_rate": 0.00011065105834351903, + "loss": 0.237, + "step": 4610 + }, + { + "epoch": 0.9332119004250152, + "grad_norm": 0.25749891996383667, + "learning_rate": 0.00011061943158354842, + "loss": 0.2209, + "step": 4611 + }, + { + "epoch": 0.9334142886055454, + "grad_norm": 0.2515775263309479, + "learning_rate": 0.00011058780374921516, + "loss": 0.2195, + "step": 4612 + }, + { + "epoch": 0.9336166767860757, + "grad_norm": 0.3047962486743927, + "learning_rate": 0.00011055617484371899, + "loss": 0.2506, + "step": 4613 + }, + { + "epoch": 0.9338190649666059, + "grad_norm": 0.30205193161964417, + "learning_rate": 0.00011052454487025983, + "loss": 0.1969, + "step": 4614 + }, + { + "epoch": 0.9340214531471362, + "grad_norm": 0.3510016202926636, + "learning_rate": 0.00011049291383203764, + "loss": 0.2523, + "step": 4615 + }, + { + "epoch": 0.9342238413276664, + "grad_norm": 0.29446735978126526, + "learning_rate": 0.00011046128173225254, + "loss": 0.2316, + "step": 4616 + }, + { + "epoch": 0.9344262295081968, + "grad_norm": 0.2585512101650238, + "learning_rate": 0.00011042964857410471, + "loss": 0.2221, + "step": 4617 + }, + { + "epoch": 0.934628617688727, + "grad_norm": 0.24132204055786133, + "learning_rate": 0.0001103980143607945, + "loss": 0.1958, + "step": 4618 + }, + { + "epoch": 0.9348310058692573, + "grad_norm": 0.3184613585472107, + "learning_rate": 0.0001103663790955223, + "loss": 0.2287, + "step": 4619 + }, + { + "epoch": 0.9350333940497875, + "grad_norm": 0.2446582168340683, + "learning_rate": 0.00011033474278148864, + "loss": 0.243, + "step": 4620 + }, + { + "epoch": 0.9352357822303178, + "grad_norm": 0.22792799770832062, + "learning_rate": 0.0001103031054218941, + "loss": 0.1865, + "step": 4621 + }, + { + "epoch": 0.935438170410848, + "grad_norm": 0.26008936762809753, + "learning_rate": 0.00011027146701993951, + "loss": 0.2276, + "step": 4622 + }, + { + "epoch": 0.9356405585913783, + "grad_norm": 0.2646524906158447, + "learning_rate": 0.00011023982757882564, + "loss": 0.2437, + "step": 4623 + }, + { + "epoch": 0.9358429467719085, + "grad_norm": 0.23059087991714478, + "learning_rate": 0.00011020818710175347, + "loss": 0.1967, + "step": 4624 + }, + { + "epoch": 0.9360453349524388, + "grad_norm": 0.2654322683811188, + "learning_rate": 0.00011017654559192403, + "loss": 0.2043, + "step": 4625 + }, + { + "epoch": 0.936247723132969, + "grad_norm": 0.24038441479206085, + "learning_rate": 0.00011014490305253849, + "loss": 0.2069, + "step": 4626 + }, + { + "epoch": 0.9364501113134993, + "grad_norm": 0.3121231198310852, + "learning_rate": 0.00011011325948679812, + "loss": 0.2627, + "step": 4627 + }, + { + "epoch": 0.9366524994940295, + "grad_norm": 0.2508528232574463, + "learning_rate": 0.00011008161489790425, + "loss": 0.194, + "step": 4628 + }, + { + "epoch": 0.9368548876745598, + "grad_norm": 0.3439335823059082, + "learning_rate": 0.00011004996928905842, + "loss": 0.2551, + "step": 4629 + }, + { + "epoch": 0.93705727585509, + "grad_norm": 0.2643868327140808, + "learning_rate": 0.00011001832266346213, + "loss": 0.2419, + "step": 4630 + }, + { + "epoch": 0.9372596640356203, + "grad_norm": 0.2533891201019287, + "learning_rate": 0.00010998667502431706, + "loss": 0.207, + "step": 4631 + }, + { + "epoch": 0.9374620522161505, + "grad_norm": 0.27981844544410706, + "learning_rate": 0.000109955026374825, + "loss": 0.2601, + "step": 4632 + }, + { + "epoch": 0.9376644403966808, + "grad_norm": 0.28298893570899963, + "learning_rate": 0.00010992337671818782, + "loss": 0.2526, + "step": 4633 + }, + { + "epoch": 0.937866828577211, + "grad_norm": 0.288486510515213, + "learning_rate": 0.00010989172605760752, + "loss": 0.26, + "step": 4634 + }, + { + "epoch": 0.9380692167577414, + "grad_norm": 0.3344356119632721, + "learning_rate": 0.00010986007439628616, + "loss": 0.2326, + "step": 4635 + }, + { + "epoch": 0.9382716049382716, + "grad_norm": 0.2752384841442108, + "learning_rate": 0.00010982842173742595, + "loss": 0.2194, + "step": 4636 + }, + { + "epoch": 0.9384739931188019, + "grad_norm": 0.3313218355178833, + "learning_rate": 0.00010979676808422916, + "loss": 0.2558, + "step": 4637 + }, + { + "epoch": 0.9386763812993322, + "grad_norm": 0.2754729688167572, + "learning_rate": 0.00010976511343989814, + "loss": 0.2682, + "step": 4638 + }, + { + "epoch": 0.9388787694798624, + "grad_norm": 0.2828863263130188, + "learning_rate": 0.00010973345780763544, + "loss": 0.2409, + "step": 4639 + }, + { + "epoch": 0.9390811576603927, + "grad_norm": 0.2455834299325943, + "learning_rate": 0.00010970180119064361, + "loss": 0.208, + "step": 4640 + }, + { + "epoch": 0.9392835458409229, + "grad_norm": 0.42734503746032715, + "learning_rate": 0.00010967014359212533, + "loss": 0.271, + "step": 4641 + }, + { + "epoch": 0.9394859340214532, + "grad_norm": 0.2849595844745636, + "learning_rate": 0.0001096384850152834, + "loss": 0.2408, + "step": 4642 + }, + { + "epoch": 0.9396883222019834, + "grad_norm": 0.37806615233421326, + "learning_rate": 0.00010960682546332066, + "loss": 0.2634, + "step": 4643 + }, + { + "epoch": 0.9398907103825137, + "grad_norm": 0.253713995218277, + "learning_rate": 0.00010957516493944014, + "loss": 0.19, + "step": 4644 + }, + { + "epoch": 0.9400930985630439, + "grad_norm": 0.27861085534095764, + "learning_rate": 0.0001095435034468449, + "loss": 0.204, + "step": 4645 + }, + { + "epoch": 0.9402954867435742, + "grad_norm": 0.26259860396385193, + "learning_rate": 0.00010951184098873812, + "loss": 0.2234, + "step": 4646 + }, + { + "epoch": 0.9404978749241044, + "grad_norm": 0.3342563509941101, + "learning_rate": 0.00010948017756832307, + "loss": 0.238, + "step": 4647 + }, + { + "epoch": 0.9407002631046347, + "grad_norm": 0.30692070722579956, + "learning_rate": 0.00010944851318880314, + "loss": 0.2612, + "step": 4648 + }, + { + "epoch": 0.9409026512851649, + "grad_norm": 0.2919248044490814, + "learning_rate": 0.00010941684785338178, + "loss": 0.237, + "step": 4649 + }, + { + "epoch": 0.9411050394656952, + "grad_norm": 0.26972222328186035, + "learning_rate": 0.00010938518156526256, + "loss": 0.1923, + "step": 4650 + }, + { + "epoch": 0.9411050394656952, + "eval_loss": 0.26375117897987366, + "eval_runtime": 0.737, + "eval_samples_per_second": 6.784, + "eval_steps_per_second": 1.357, + "step": 4650 + }, + { + "epoch": 0.9413074276462254, + "grad_norm": 0.2946036159992218, + "learning_rate": 0.00010935351432764915, + "loss": 0.2034, + "step": 4651 + }, + { + "epoch": 0.9415098158267557, + "grad_norm": 0.25022605061531067, + "learning_rate": 0.00010932184614374533, + "loss": 0.2109, + "step": 4652 + }, + { + "epoch": 0.941712204007286, + "grad_norm": 0.2673112750053406, + "learning_rate": 0.0001092901770167549, + "loss": 0.2268, + "step": 4653 + }, + { + "epoch": 0.9419145921878163, + "grad_norm": 0.3491578698158264, + "learning_rate": 0.00010925850694988184, + "loss": 0.2692, + "step": 4654 + }, + { + "epoch": 0.9421169803683465, + "grad_norm": 0.30046436190605164, + "learning_rate": 0.00010922683594633021, + "loss": 0.2325, + "step": 4655 + }, + { + "epoch": 0.9423193685488768, + "grad_norm": 0.2795022130012512, + "learning_rate": 0.00010919516400930412, + "loss": 0.2397, + "step": 4656 + }, + { + "epoch": 0.942521756729407, + "grad_norm": 0.2704300880432129, + "learning_rate": 0.00010916349114200784, + "loss": 0.2095, + "step": 4657 + }, + { + "epoch": 0.9427241449099373, + "grad_norm": 0.33030474185943604, + "learning_rate": 0.00010913181734764566, + "loss": 0.2485, + "step": 4658 + }, + { + "epoch": 0.9429265330904675, + "grad_norm": 0.3205012381076813, + "learning_rate": 0.00010910014262942204, + "loss": 0.2446, + "step": 4659 + }, + { + "epoch": 0.9431289212709978, + "grad_norm": 0.2922936677932739, + "learning_rate": 0.00010906846699054144, + "loss": 0.208, + "step": 4660 + }, + { + "epoch": 0.943331309451528, + "grad_norm": 0.2377457320690155, + "learning_rate": 0.00010903679043420854, + "loss": 0.2016, + "step": 4661 + }, + { + "epoch": 0.9435336976320583, + "grad_norm": 0.2981378436088562, + "learning_rate": 0.00010900511296362801, + "loss": 0.2158, + "step": 4662 + }, + { + "epoch": 0.9437360858125885, + "grad_norm": 0.34946900606155396, + "learning_rate": 0.00010897343458200462, + "loss": 0.2385, + "step": 4663 + }, + { + "epoch": 0.9439384739931188, + "grad_norm": 0.3810003101825714, + "learning_rate": 0.00010894175529254327, + "loss": 0.2299, + "step": 4664 + }, + { + "epoch": 0.944140862173649, + "grad_norm": 0.2824034094810486, + "learning_rate": 0.00010891007509844894, + "loss": 0.2584, + "step": 4665 + }, + { + "epoch": 0.9443432503541793, + "grad_norm": 0.3092564642429352, + "learning_rate": 0.00010887839400292672, + "loss": 0.2426, + "step": 4666 + }, + { + "epoch": 0.9445456385347095, + "grad_norm": 0.25669777393341064, + "learning_rate": 0.00010884671200918175, + "loss": 0.2222, + "step": 4667 + }, + { + "epoch": 0.9447480267152398, + "grad_norm": 0.3152638077735901, + "learning_rate": 0.0001088150291204193, + "loss": 0.2484, + "step": 4668 + }, + { + "epoch": 0.9449504148957701, + "grad_norm": 0.4432920217514038, + "learning_rate": 0.00010878334533984467, + "loss": 0.1989, + "step": 4669 + }, + { + "epoch": 0.9451528030763003, + "grad_norm": 0.301040917634964, + "learning_rate": 0.00010875166067066334, + "loss": 0.2654, + "step": 4670 + }, + { + "epoch": 0.9453551912568307, + "grad_norm": 0.28827717900276184, + "learning_rate": 0.0001087199751160808, + "loss": 0.2589, + "step": 4671 + }, + { + "epoch": 0.9455575794373609, + "grad_norm": 0.2627612352371216, + "learning_rate": 0.0001086882886793027, + "loss": 0.2177, + "step": 4672 + }, + { + "epoch": 0.9457599676178912, + "grad_norm": 0.2797980308532715, + "learning_rate": 0.0001086566013635347, + "loss": 0.224, + "step": 4673 + }, + { + "epoch": 0.9459623557984214, + "grad_norm": 0.2875703275203705, + "learning_rate": 0.0001086249131719826, + "loss": 0.2278, + "step": 4674 + }, + { + "epoch": 0.9461647439789517, + "grad_norm": 0.2887953221797943, + "learning_rate": 0.0001085932241078523, + "loss": 0.2355, + "step": 4675 + }, + { + "epoch": 0.9463671321594819, + "grad_norm": 0.33487871289253235, + "learning_rate": 0.00010856153417434975, + "loss": 0.2275, + "step": 4676 + }, + { + "epoch": 0.9465695203400122, + "grad_norm": 0.308789998292923, + "learning_rate": 0.000108529843374681, + "loss": 0.2606, + "step": 4677 + }, + { + "epoch": 0.9467719085205424, + "grad_norm": 0.2678741216659546, + "learning_rate": 0.0001084981517120522, + "loss": 0.2206, + "step": 4678 + }, + { + "epoch": 0.9469742967010727, + "grad_norm": 0.31572091579437256, + "learning_rate": 0.00010846645918966958, + "loss": 0.2686, + "step": 4679 + }, + { + "epoch": 0.9471766848816029, + "grad_norm": 0.29302868247032166, + "learning_rate": 0.00010843476581073946, + "loss": 0.2397, + "step": 4680 + }, + { + "epoch": 0.9473790730621332, + "grad_norm": 0.4068066477775574, + "learning_rate": 0.00010840307157846825, + "loss": 0.2529, + "step": 4681 + }, + { + "epoch": 0.9475814612426634, + "grad_norm": 0.3153510093688965, + "learning_rate": 0.00010837137649606242, + "loss": 0.232, + "step": 4682 + }, + { + "epoch": 0.9477838494231937, + "grad_norm": 0.4613901376724243, + "learning_rate": 0.00010833968056672854, + "loss": 0.2561, + "step": 4683 + }, + { + "epoch": 0.9479862376037239, + "grad_norm": 0.27325960993766785, + "learning_rate": 0.00010830798379367331, + "loss": 0.2204, + "step": 4684 + }, + { + "epoch": 0.9481886257842542, + "grad_norm": 0.2553834319114685, + "learning_rate": 0.00010827628618010348, + "loss": 0.2109, + "step": 4685 + }, + { + "epoch": 0.9483910139647844, + "grad_norm": 0.24329812824726105, + "learning_rate": 0.0001082445877292258, + "loss": 0.2188, + "step": 4686 + }, + { + "epoch": 0.9485934021453147, + "grad_norm": 0.24125301837921143, + "learning_rate": 0.00010821288844424729, + "loss": 0.2392, + "step": 4687 + }, + { + "epoch": 0.9487957903258449, + "grad_norm": 0.29916635155677795, + "learning_rate": 0.00010818118832837487, + "loss": 0.2382, + "step": 4688 + }, + { + "epoch": 0.9489981785063752, + "grad_norm": 0.277773916721344, + "learning_rate": 0.00010814948738481568, + "loss": 0.2504, + "step": 4689 + }, + { + "epoch": 0.9492005666869054, + "grad_norm": 0.2472153902053833, + "learning_rate": 0.00010811778561677686, + "loss": 0.2376, + "step": 4690 + }, + { + "epoch": 0.9494029548674358, + "grad_norm": 0.3063032627105713, + "learning_rate": 0.00010808608302746568, + "loss": 0.2506, + "step": 4691 + }, + { + "epoch": 0.949605343047966, + "grad_norm": 0.3086594045162201, + "learning_rate": 0.00010805437962008944, + "loss": 0.2558, + "step": 4692 + }, + { + "epoch": 0.9498077312284963, + "grad_norm": 0.2349616289138794, + "learning_rate": 0.0001080226753978556, + "loss": 0.2006, + "step": 4693 + }, + { + "epoch": 0.9500101194090265, + "grad_norm": 0.3396564722061157, + "learning_rate": 0.00010799097036397166, + "loss": 0.231, + "step": 4694 + }, + { + "epoch": 0.9502125075895568, + "grad_norm": 0.2693391740322113, + "learning_rate": 0.00010795926452164515, + "loss": 0.219, + "step": 4695 + }, + { + "epoch": 0.950414895770087, + "grad_norm": 0.39639660716056824, + "learning_rate": 0.00010792755787408381, + "loss": 0.2265, + "step": 4696 + }, + { + "epoch": 0.9506172839506173, + "grad_norm": 0.2864764630794525, + "learning_rate": 0.00010789585042449532, + "loss": 0.2152, + "step": 4697 + }, + { + "epoch": 0.9508196721311475, + "grad_norm": 0.26854822039604187, + "learning_rate": 0.00010786414217608756, + "loss": 0.2353, + "step": 4698 + }, + { + "epoch": 0.9510220603116778, + "grad_norm": 0.2807072401046753, + "learning_rate": 0.00010783243313206839, + "loss": 0.2415, + "step": 4699 + }, + { + "epoch": 0.9512244484922081, + "grad_norm": 0.44308915734291077, + "learning_rate": 0.00010780072329564583, + "loss": 0.3097, + "step": 4700 + }, + { + "epoch": 0.9512244484922081, + "eval_loss": 0.2648102939128876, + "eval_runtime": 0.7419, + "eval_samples_per_second": 6.74, + "eval_steps_per_second": 1.348, + "step": 4700 + }, + { + "epoch": 0.9514268366727383, + "grad_norm": 0.290475994348526, + "learning_rate": 0.00010776901267002793, + "loss": 0.2212, + "step": 4701 + }, + { + "epoch": 0.9516292248532686, + "grad_norm": 0.2510857582092285, + "learning_rate": 0.00010773730125842283, + "loss": 0.212, + "step": 4702 + }, + { + "epoch": 0.9518316130337988, + "grad_norm": 0.26925188302993774, + "learning_rate": 0.0001077055890640388, + "loss": 0.2465, + "step": 4703 + }, + { + "epoch": 0.9520340012143291, + "grad_norm": 0.30950313806533813, + "learning_rate": 0.00010767387609008413, + "loss": 0.2292, + "step": 4704 + }, + { + "epoch": 0.9522363893948593, + "grad_norm": 0.35539141297340393, + "learning_rate": 0.00010764216233976718, + "loss": 0.2516, + "step": 4705 + }, + { + "epoch": 0.9524387775753896, + "grad_norm": 0.27495214343070984, + "learning_rate": 0.00010761044781629644, + "loss": 0.2038, + "step": 4706 + }, + { + "epoch": 0.9526411657559198, + "grad_norm": 0.3153650462627411, + "learning_rate": 0.00010757873252288047, + "loss": 0.2235, + "step": 4707 + }, + { + "epoch": 0.9528435539364501, + "grad_norm": 0.26855596899986267, + "learning_rate": 0.00010754701646272782, + "loss": 0.2216, + "step": 4708 + }, + { + "epoch": 0.9530459421169803, + "grad_norm": 0.24935470521450043, + "learning_rate": 0.00010751529963904727, + "loss": 0.2277, + "step": 4709 + }, + { + "epoch": 0.9532483302975107, + "grad_norm": 0.2733069062232971, + "learning_rate": 0.00010748358205504754, + "loss": 0.2274, + "step": 4710 + }, + { + "epoch": 0.9534507184780409, + "grad_norm": 0.3918766677379608, + "learning_rate": 0.00010745186371393751, + "loss": 0.2387, + "step": 4711 + }, + { + "epoch": 0.9536531066585712, + "grad_norm": 0.29222676157951355, + "learning_rate": 0.00010742014461892611, + "loss": 0.2373, + "step": 4712 + }, + { + "epoch": 0.9538554948391014, + "grad_norm": 0.2915707230567932, + "learning_rate": 0.0001073884247732223, + "loss": 0.2251, + "step": 4713 + }, + { + "epoch": 0.9540578830196317, + "grad_norm": 0.2980474829673767, + "learning_rate": 0.00010735670418003523, + "loss": 0.2419, + "step": 4714 + }, + { + "epoch": 0.9542602712001619, + "grad_norm": 0.2788250148296356, + "learning_rate": 0.00010732498284257401, + "loss": 0.2131, + "step": 4715 + }, + { + "epoch": 0.9544626593806922, + "grad_norm": 0.25359243154525757, + "learning_rate": 0.0001072932607640479, + "loss": 0.2342, + "step": 4716 + }, + { + "epoch": 0.9546650475612224, + "grad_norm": 0.41266191005706787, + "learning_rate": 0.00010726153794766618, + "loss": 0.2305, + "step": 4717 + }, + { + "epoch": 0.9548674357417527, + "grad_norm": 0.29730692505836487, + "learning_rate": 0.00010722981439663829, + "loss": 0.216, + "step": 4718 + }, + { + "epoch": 0.9550698239222829, + "grad_norm": 0.29037201404571533, + "learning_rate": 0.00010719809011417358, + "loss": 0.2067, + "step": 4719 + }, + { + "epoch": 0.9552722121028132, + "grad_norm": 0.2740785777568817, + "learning_rate": 0.00010716636510348168, + "loss": 0.2179, + "step": 4720 + }, + { + "epoch": 0.9554746002833434, + "grad_norm": 0.30382829904556274, + "learning_rate": 0.00010713463936777213, + "loss": 0.2624, + "step": 4721 + }, + { + "epoch": 0.9556769884638737, + "grad_norm": 0.2392117828130722, + "learning_rate": 0.00010710291291025465, + "loss": 0.202, + "step": 4722 + }, + { + "epoch": 0.9558793766444039, + "grad_norm": 0.4092789590358734, + "learning_rate": 0.00010707118573413894, + "loss": 0.223, + "step": 4723 + }, + { + "epoch": 0.9560817648249342, + "grad_norm": 0.3075420558452606, + "learning_rate": 0.00010703945784263489, + "loss": 0.2257, + "step": 4724 + }, + { + "epoch": 0.9562841530054644, + "grad_norm": 0.25516483187675476, + "learning_rate": 0.00010700772923895235, + "loss": 0.2206, + "step": 4725 + }, + { + "epoch": 0.9564865411859947, + "grad_norm": 0.30991610884666443, + "learning_rate": 0.00010697599992630128, + "loss": 0.2224, + "step": 4726 + }, + { + "epoch": 0.9566889293665249, + "grad_norm": 0.3171761631965637, + "learning_rate": 0.00010694426990789174, + "loss": 0.2478, + "step": 4727 + }, + { + "epoch": 0.9568913175470553, + "grad_norm": 0.25520816445350647, + "learning_rate": 0.00010691253918693385, + "loss": 0.2256, + "step": 4728 + }, + { + "epoch": 0.9570937057275856, + "grad_norm": 0.2540293335914612, + "learning_rate": 0.00010688080776663778, + "loss": 0.2074, + "step": 4729 + }, + { + "epoch": 0.9572960939081158, + "grad_norm": 0.27108830213546753, + "learning_rate": 0.00010684907565021376, + "loss": 0.2256, + "step": 4730 + }, + { + "epoch": 0.9574984820886461, + "grad_norm": 0.24713459610939026, + "learning_rate": 0.00010681734284087215, + "loss": 0.1832, + "step": 4731 + }, + { + "epoch": 0.9577008702691763, + "grad_norm": 0.2772236466407776, + "learning_rate": 0.00010678560934182331, + "loss": 0.2324, + "step": 4732 + }, + { + "epoch": 0.9579032584497066, + "grad_norm": 0.261482834815979, + "learning_rate": 0.00010675387515627773, + "loss": 0.2349, + "step": 4733 + }, + { + "epoch": 0.9581056466302368, + "grad_norm": 0.2800223231315613, + "learning_rate": 0.00010672214028744591, + "loss": 0.2354, + "step": 4734 + }, + { + "epoch": 0.9583080348107671, + "grad_norm": 0.2289680540561676, + "learning_rate": 0.00010669040473853848, + "loss": 0.1974, + "step": 4735 + }, + { + "epoch": 0.9585104229912973, + "grad_norm": 0.2535719871520996, + "learning_rate": 0.00010665866851276611, + "loss": 0.2096, + "step": 4736 + }, + { + "epoch": 0.9587128111718276, + "grad_norm": 0.25151848793029785, + "learning_rate": 0.00010662693161333954, + "loss": 0.2093, + "step": 4737 + }, + { + "epoch": 0.9589151993523578, + "grad_norm": 0.3600228726863861, + "learning_rate": 0.00010659519404346954, + "loss": 0.2494, + "step": 4738 + }, + { + "epoch": 0.9591175875328881, + "grad_norm": 0.26792627573013306, + "learning_rate": 0.00010656345580636702, + "loss": 0.2552, + "step": 4739 + }, + { + "epoch": 0.9593199757134183, + "grad_norm": 0.29325777292251587, + "learning_rate": 0.00010653171690524293, + "loss": 0.2888, + "step": 4740 + }, + { + "epoch": 0.9595223638939486, + "grad_norm": 0.2313537895679474, + "learning_rate": 0.00010649997734330824, + "loss": 0.2209, + "step": 4741 + }, + { + "epoch": 0.9597247520744788, + "grad_norm": 0.28957894444465637, + "learning_rate": 0.00010646823712377405, + "loss": 0.2527, + "step": 4742 + }, + { + "epoch": 0.9599271402550091, + "grad_norm": 0.242612823843956, + "learning_rate": 0.00010643649624985148, + "loss": 0.2044, + "step": 4743 + }, + { + "epoch": 0.9601295284355393, + "grad_norm": 0.27495434880256653, + "learning_rate": 0.00010640475472475178, + "loss": 0.2192, + "step": 4744 + }, + { + "epoch": 0.9603319166160696, + "grad_norm": 0.22725163400173187, + "learning_rate": 0.00010637301255168619, + "loss": 0.1944, + "step": 4745 + }, + { + "epoch": 0.9605343047965998, + "grad_norm": 0.28637436032295227, + "learning_rate": 0.00010634126973386607, + "loss": 0.2644, + "step": 4746 + }, + { + "epoch": 0.9607366929771302, + "grad_norm": 0.2781379818916321, + "learning_rate": 0.00010630952627450279, + "loss": 0.2483, + "step": 4747 + }, + { + "epoch": 0.9609390811576604, + "grad_norm": 0.34088853001594543, + "learning_rate": 0.00010627778217680786, + "loss": 0.2224, + "step": 4748 + }, + { + "epoch": 0.9611414693381907, + "grad_norm": 0.3108992576599121, + "learning_rate": 0.00010624603744399282, + "loss": 0.216, + "step": 4749 + }, + { + "epoch": 0.9613438575187209, + "grad_norm": 0.30344733595848083, + "learning_rate": 0.00010621429207926923, + "loss": 0.2161, + "step": 4750 + }, + { + "epoch": 0.9613438575187209, + "eval_loss": 0.27021822333335876, + "eval_runtime": 0.7381, + "eval_samples_per_second": 6.774, + "eval_steps_per_second": 1.355, + "step": 4750 + }, + { + "epoch": 0.9615462456992512, + "grad_norm": 0.30304837226867676, + "learning_rate": 0.00010618254608584879, + "loss": 0.2572, + "step": 4751 + }, + { + "epoch": 0.9617486338797814, + "grad_norm": 0.2323007434606552, + "learning_rate": 0.0001061507994669432, + "loss": 0.2226, + "step": 4752 + }, + { + "epoch": 0.9619510220603117, + "grad_norm": 0.24936886131763458, + "learning_rate": 0.00010611905222576426, + "loss": 0.1989, + "step": 4753 + }, + { + "epoch": 0.9621534102408419, + "grad_norm": 0.2739543318748474, + "learning_rate": 0.00010608730436552381, + "loss": 0.2544, + "step": 4754 + }, + { + "epoch": 0.9623557984213722, + "grad_norm": 0.2710932791233063, + "learning_rate": 0.00010605555588943378, + "loss": 0.2378, + "step": 4755 + }, + { + "epoch": 0.9625581866019024, + "grad_norm": 0.284679114818573, + "learning_rate": 0.00010602380680070616, + "loss": 0.2244, + "step": 4756 + }, + { + "epoch": 0.9627605747824327, + "grad_norm": 0.33875322341918945, + "learning_rate": 0.00010599205710255298, + "loss": 0.2296, + "step": 4757 + }, + { + "epoch": 0.9629629629629629, + "grad_norm": 0.3009338974952698, + "learning_rate": 0.00010596030679818631, + "loss": 0.2089, + "step": 4758 + }, + { + "epoch": 0.9631653511434932, + "grad_norm": 0.30808284878730774, + "learning_rate": 0.00010592855589081838, + "loss": 0.2563, + "step": 4759 + }, + { + "epoch": 0.9633677393240235, + "grad_norm": 0.22831407189369202, + "learning_rate": 0.00010589680438366134, + "loss": 0.1959, + "step": 4760 + }, + { + "epoch": 0.9635701275045537, + "grad_norm": 0.23412199318408966, + "learning_rate": 0.00010586505227992752, + "loss": 0.2208, + "step": 4761 + }, + { + "epoch": 0.963772515685084, + "grad_norm": 0.35633623600006104, + "learning_rate": 0.00010583329958282926, + "loss": 0.2848, + "step": 4762 + }, + { + "epoch": 0.9639749038656142, + "grad_norm": 0.2718070447444916, + "learning_rate": 0.00010580154629557895, + "loss": 0.2401, + "step": 4763 + }, + { + "epoch": 0.9641772920461446, + "grad_norm": 0.29783880710601807, + "learning_rate": 0.00010576979242138904, + "loss": 0.2431, + "step": 4764 + }, + { + "epoch": 0.9643796802266748, + "grad_norm": 0.24454525113105774, + "learning_rate": 0.0001057380379634721, + "loss": 0.1961, + "step": 4765 + }, + { + "epoch": 0.9645820684072051, + "grad_norm": 0.287728875875473, + "learning_rate": 0.00010570628292504068, + "loss": 0.2301, + "step": 4766 + }, + { + "epoch": 0.9647844565877353, + "grad_norm": 0.28134074807167053, + "learning_rate": 0.00010567452730930743, + "loss": 0.2446, + "step": 4767 + }, + { + "epoch": 0.9649868447682656, + "grad_norm": 0.3506639897823334, + "learning_rate": 0.00010564277111948501, + "loss": 0.2137, + "step": 4768 + }, + { + "epoch": 0.9651892329487958, + "grad_norm": 0.364789217710495, + "learning_rate": 0.00010561101435878627, + "loss": 0.2426, + "step": 4769 + }, + { + "epoch": 0.9653916211293261, + "grad_norm": 0.3204108476638794, + "learning_rate": 0.00010557925703042395, + "loss": 0.2397, + "step": 4770 + }, + { + "epoch": 0.9655940093098563, + "grad_norm": 0.25606632232666016, + "learning_rate": 0.00010554749913761095, + "loss": 0.1992, + "step": 4771 + }, + { + "epoch": 0.9657963974903866, + "grad_norm": 0.28512629866600037, + "learning_rate": 0.0001055157406835602, + "loss": 0.2361, + "step": 4772 + }, + { + "epoch": 0.9659987856709168, + "grad_norm": 0.2572082281112671, + "learning_rate": 0.00010548398167148468, + "loss": 0.2004, + "step": 4773 + }, + { + "epoch": 0.9662011738514471, + "grad_norm": 0.2586156129837036, + "learning_rate": 0.00010545222210459744, + "loss": 0.2051, + "step": 4774 + }, + { + "epoch": 0.9664035620319773, + "grad_norm": 0.2859581410884857, + "learning_rate": 0.0001054204619861116, + "loss": 0.2254, + "step": 4775 + }, + { + "epoch": 0.9666059502125076, + "grad_norm": 0.23509328067302704, + "learning_rate": 0.00010538870131924026, + "loss": 0.2183, + "step": 4776 + }, + { + "epoch": 0.9668083383930378, + "grad_norm": 0.27621352672576904, + "learning_rate": 0.00010535694010719665, + "loss": 0.2309, + "step": 4777 + }, + { + "epoch": 0.9670107265735681, + "grad_norm": 0.2531593143939972, + "learning_rate": 0.00010532517835319407, + "loss": 0.1967, + "step": 4778 + }, + { + "epoch": 0.9672131147540983, + "grad_norm": 0.33856093883514404, + "learning_rate": 0.0001052934160604458, + "loss": 0.2425, + "step": 4779 + }, + { + "epoch": 0.9674155029346286, + "grad_norm": 0.27292317152023315, + "learning_rate": 0.00010526165323216525, + "loss": 0.2316, + "step": 4780 + }, + { + "epoch": 0.9676178911151588, + "grad_norm": 0.23614521324634552, + "learning_rate": 0.00010522988987156586, + "loss": 0.1961, + "step": 4781 + }, + { + "epoch": 0.9678202792956891, + "grad_norm": 0.2543821930885315, + "learning_rate": 0.00010519812598186107, + "loss": 0.215, + "step": 4782 + }, + { + "epoch": 0.9680226674762193, + "grad_norm": 0.3500335216522217, + "learning_rate": 0.00010516636156626445, + "loss": 0.2445, + "step": 4783 + }, + { + "epoch": 0.9682250556567497, + "grad_norm": 0.2474358230829239, + "learning_rate": 0.00010513459662798954, + "loss": 0.2219, + "step": 4784 + }, + { + "epoch": 0.9684274438372799, + "grad_norm": 0.26944953203201294, + "learning_rate": 0.00010510283117025008, + "loss": 0.2411, + "step": 4785 + }, + { + "epoch": 0.9686298320178102, + "grad_norm": 0.37569373846054077, + "learning_rate": 0.00010507106519625967, + "loss": 0.2564, + "step": 4786 + }, + { + "epoch": 0.9688322201983404, + "grad_norm": 0.2393825501203537, + "learning_rate": 0.00010503929870923208, + "loss": 0.2335, + "step": 4787 + }, + { + "epoch": 0.9690346083788707, + "grad_norm": 0.23142491281032562, + "learning_rate": 0.00010500753171238116, + "loss": 0.2066, + "step": 4788 + }, + { + "epoch": 0.9692369965594009, + "grad_norm": 0.29840072989463806, + "learning_rate": 0.0001049757642089207, + "loss": 0.2315, + "step": 4789 + }, + { + "epoch": 0.9694393847399312, + "grad_norm": 0.27222388982772827, + "learning_rate": 0.00010494399620206464, + "loss": 0.2222, + "step": 4790 + }, + { + "epoch": 0.9696417729204615, + "grad_norm": 0.24776479601860046, + "learning_rate": 0.00010491222769502688, + "loss": 0.2159, + "step": 4791 + }, + { + "epoch": 0.9698441611009917, + "grad_norm": 0.362596720457077, + "learning_rate": 0.0001048804586910215, + "loss": 0.2469, + "step": 4792 + }, + { + "epoch": 0.970046549281522, + "grad_norm": 0.2624041736125946, + "learning_rate": 0.00010484868919326251, + "loss": 0.2227, + "step": 4793 + }, + { + "epoch": 0.9702489374620522, + "grad_norm": 0.3750174343585968, + "learning_rate": 0.00010481691920496404, + "loss": 0.2398, + "step": 4794 + }, + { + "epoch": 0.9704513256425825, + "grad_norm": 0.2520590126514435, + "learning_rate": 0.00010478514872934023, + "loss": 0.2154, + "step": 4795 + }, + { + "epoch": 0.9706537138231127, + "grad_norm": 0.21335478127002716, + "learning_rate": 0.00010475337776960528, + "loss": 0.2068, + "step": 4796 + }, + { + "epoch": 0.970856102003643, + "grad_norm": 0.3522154986858368, + "learning_rate": 0.00010472160632897343, + "loss": 0.2391, + "step": 4797 + }, + { + "epoch": 0.9710584901841732, + "grad_norm": 0.4050356149673462, + "learning_rate": 0.000104689834410659, + "loss": 0.2603, + "step": 4798 + }, + { + "epoch": 0.9712608783647035, + "grad_norm": 0.2647132873535156, + "learning_rate": 0.00010465806201787634, + "loss": 0.2358, + "step": 4799 + }, + { + "epoch": 0.9714632665452337, + "grad_norm": 0.6313262581825256, + "learning_rate": 0.00010462628915383983, + "loss": 0.2435, + "step": 4800 + }, + { + "epoch": 0.9714632665452337, + "eval_loss": 0.2704029083251953, + "eval_runtime": 0.7409, + "eval_samples_per_second": 6.749, + "eval_steps_per_second": 1.35, + "step": 4800 + }, + { + "epoch": 0.971665654725764, + "grad_norm": 0.3022230863571167, + "learning_rate": 0.00010459451582176392, + "loss": 0.2649, + "step": 4801 + }, + { + "epoch": 0.9718680429062942, + "grad_norm": 0.2528679370880127, + "learning_rate": 0.00010456274202486314, + "loss": 0.2116, + "step": 4802 + }, + { + "epoch": 0.9720704310868246, + "grad_norm": 0.6243422627449036, + "learning_rate": 0.00010453096776635196, + "loss": 0.2509, + "step": 4803 + }, + { + "epoch": 0.9722728192673548, + "grad_norm": 0.32224878668785095, + "learning_rate": 0.00010449919304944502, + "loss": 0.2208, + "step": 4804 + }, + { + "epoch": 0.9724752074478851, + "grad_norm": 0.3156091272830963, + "learning_rate": 0.00010446741787735695, + "loss": 0.2714, + "step": 4805 + }, + { + "epoch": 0.9726775956284153, + "grad_norm": 0.37287795543670654, + "learning_rate": 0.0001044356422533024, + "loss": 0.2404, + "step": 4806 + }, + { + "epoch": 0.9728799838089456, + "grad_norm": 0.26560112833976746, + "learning_rate": 0.00010440386618049611, + "loss": 0.2373, + "step": 4807 + }, + { + "epoch": 0.9730823719894758, + "grad_norm": 0.332040399312973, + "learning_rate": 0.00010437208966215286, + "loss": 0.2552, + "step": 4808 + }, + { + "epoch": 0.9732847601700061, + "grad_norm": 0.326384574174881, + "learning_rate": 0.00010434031270148743, + "loss": 0.2473, + "step": 4809 + }, + { + "epoch": 0.9734871483505363, + "grad_norm": 0.26876839995384216, + "learning_rate": 0.00010430853530171472, + "loss": 0.2343, + "step": 4810 + }, + { + "epoch": 0.9736895365310666, + "grad_norm": 0.3743020296096802, + "learning_rate": 0.00010427675746604962, + "loss": 0.2167, + "step": 4811 + }, + { + "epoch": 0.9738919247115968, + "grad_norm": 0.30613642930984497, + "learning_rate": 0.00010424497919770708, + "loss": 0.1993, + "step": 4812 + }, + { + "epoch": 0.9740943128921271, + "grad_norm": 0.2499884068965912, + "learning_rate": 0.00010421320049990207, + "loss": 0.2167, + "step": 4813 + }, + { + "epoch": 0.9742967010726573, + "grad_norm": 0.2823628783226013, + "learning_rate": 0.00010418142137584966, + "loss": 0.2131, + "step": 4814 + }, + { + "epoch": 0.9744990892531876, + "grad_norm": 0.27555301785469055, + "learning_rate": 0.0001041496418287649, + "loss": 0.2106, + "step": 4815 + }, + { + "epoch": 0.9747014774337178, + "grad_norm": 0.29797542095184326, + "learning_rate": 0.00010411786186186292, + "loss": 0.2279, + "step": 4816 + }, + { + "epoch": 0.9749038656142481, + "grad_norm": 0.24461086094379425, + "learning_rate": 0.00010408608147835888, + "loss": 0.2245, + "step": 4817 + }, + { + "epoch": 0.9751062537947783, + "grad_norm": 0.2839052677154541, + "learning_rate": 0.00010405430068146802, + "loss": 0.2658, + "step": 4818 + }, + { + "epoch": 0.9753086419753086, + "grad_norm": 0.2975747585296631, + "learning_rate": 0.00010402251947440554, + "loss": 0.2203, + "step": 4819 + }, + { + "epoch": 0.975511030155839, + "grad_norm": 0.4260922372341156, + "learning_rate": 0.00010399073786038673, + "loss": 0.2286, + "step": 4820 + }, + { + "epoch": 0.9757134183363692, + "grad_norm": 0.4903205633163452, + "learning_rate": 0.00010395895584262696, + "loss": 0.2026, + "step": 4821 + }, + { + "epoch": 0.9759158065168995, + "grad_norm": 0.2526836395263672, + "learning_rate": 0.00010392717342434157, + "loss": 0.2056, + "step": 4822 + }, + { + "epoch": 0.9761181946974297, + "grad_norm": 0.2706577777862549, + "learning_rate": 0.00010389539060874598, + "loss": 0.2054, + "step": 4823 + }, + { + "epoch": 0.97632058287796, + "grad_norm": 0.37094250321388245, + "learning_rate": 0.00010386360739905564, + "loss": 0.2334, + "step": 4824 + }, + { + "epoch": 0.9765229710584902, + "grad_norm": 0.4301499128341675, + "learning_rate": 0.00010383182379848607, + "loss": 0.2602, + "step": 4825 + }, + { + "epoch": 0.9767253592390205, + "grad_norm": 0.34220004081726074, + "learning_rate": 0.00010380003981025273, + "loss": 0.2503, + "step": 4826 + }, + { + "epoch": 0.9769277474195507, + "grad_norm": 0.2496861219406128, + "learning_rate": 0.00010376825543757127, + "loss": 0.1857, + "step": 4827 + }, + { + "epoch": 0.977130135600081, + "grad_norm": 0.31023451685905457, + "learning_rate": 0.00010373647068365724, + "loss": 0.2146, + "step": 4828 + }, + { + "epoch": 0.9773325237806112, + "grad_norm": 0.27009859681129456, + "learning_rate": 0.00010370468555172632, + "loss": 0.2307, + "step": 4829 + }, + { + "epoch": 0.9775349119611415, + "grad_norm": 0.2698228061199188, + "learning_rate": 0.00010367290004499419, + "loss": 0.2293, + "step": 4830 + }, + { + "epoch": 0.9777373001416717, + "grad_norm": 0.29861339926719666, + "learning_rate": 0.00010364111416667659, + "loss": 0.2261, + "step": 4831 + }, + { + "epoch": 0.977939688322202, + "grad_norm": 0.28584665060043335, + "learning_rate": 0.00010360932791998925, + "loss": 0.2301, + "step": 4832 + }, + { + "epoch": 0.9781420765027322, + "grad_norm": 0.303874135017395, + "learning_rate": 0.00010357754130814798, + "loss": 0.2446, + "step": 4833 + }, + { + "epoch": 0.9783444646832625, + "grad_norm": 0.31630566716194153, + "learning_rate": 0.00010354575433436862, + "loss": 0.2516, + "step": 4834 + }, + { + "epoch": 0.9785468528637927, + "grad_norm": 0.29177409410476685, + "learning_rate": 0.00010351396700186705, + "loss": 0.2596, + "step": 4835 + }, + { + "epoch": 0.978749241044323, + "grad_norm": 0.2889035940170288, + "learning_rate": 0.00010348217931385915, + "loss": 0.2329, + "step": 4836 + }, + { + "epoch": 0.9789516292248532, + "grad_norm": 0.3667570948600769, + "learning_rate": 0.00010345039127356091, + "loss": 0.2476, + "step": 4837 + }, + { + "epoch": 0.9791540174053835, + "grad_norm": 0.280477911233902, + "learning_rate": 0.00010341860288418827, + "loss": 0.2146, + "step": 4838 + }, + { + "epoch": 0.9793564055859137, + "grad_norm": 0.27934882044792175, + "learning_rate": 0.00010338681414895725, + "loss": 0.2283, + "step": 4839 + }, + { + "epoch": 0.9795587937664441, + "grad_norm": 0.3026507496833801, + "learning_rate": 0.00010335502507108396, + "loss": 0.2529, + "step": 4840 + }, + { + "epoch": 0.9797611819469743, + "grad_norm": 0.24515746533870697, + "learning_rate": 0.00010332323565378441, + "loss": 0.2388, + "step": 4841 + }, + { + "epoch": 0.9799635701275046, + "grad_norm": 0.2657843232154846, + "learning_rate": 0.00010329144590027474, + "loss": 0.2515, + "step": 4842 + }, + { + "epoch": 0.9801659583080348, + "grad_norm": 0.3656042516231537, + "learning_rate": 0.00010325965581377111, + "loss": 0.2654, + "step": 4843 + }, + { + "epoch": 0.9803683464885651, + "grad_norm": 0.3170849680900574, + "learning_rate": 0.00010322786539748972, + "loss": 0.2549, + "step": 4844 + }, + { + "epoch": 0.9805707346690953, + "grad_norm": 0.30992385745048523, + "learning_rate": 0.00010319607465464676, + "loss": 0.246, + "step": 4845 + }, + { + "epoch": 0.9807731228496256, + "grad_norm": 0.26733100414276123, + "learning_rate": 0.0001031642835884585, + "loss": 0.2385, + "step": 4846 + }, + { + "epoch": 0.9809755110301558, + "grad_norm": 0.34407472610473633, + "learning_rate": 0.00010313249220214126, + "loss": 0.2372, + "step": 4847 + }, + { + "epoch": 0.9811778992106861, + "grad_norm": 0.35374516248703003, + "learning_rate": 0.00010310070049891129, + "loss": 0.2311, + "step": 4848 + }, + { + "epoch": 0.9813802873912163, + "grad_norm": 0.2595251500606537, + "learning_rate": 0.000103068908481985, + "loss": 0.1729, + "step": 4849 + }, + { + "epoch": 0.9815826755717466, + "grad_norm": 0.2830277383327484, + "learning_rate": 0.00010303711615457876, + "loss": 0.2336, + "step": 4850 + }, + { + "epoch": 0.9815826755717466, + "eval_loss": 0.26844725012779236, + "eval_runtime": 0.7388, + "eval_samples_per_second": 6.768, + "eval_steps_per_second": 1.354, + "step": 4850 + }, + { + "epoch": 0.9817850637522769, + "grad_norm": 0.2586294114589691, + "learning_rate": 0.00010300532351990899, + "loss": 0.2333, + "step": 4851 + }, + { + "epoch": 0.9819874519328071, + "grad_norm": 0.3211621344089508, + "learning_rate": 0.00010297353058119208, + "loss": 0.2247, + "step": 4852 + }, + { + "epoch": 0.9821898401133374, + "grad_norm": 0.2804816961288452, + "learning_rate": 0.00010294173734164456, + "loss": 0.2364, + "step": 4853 + }, + { + "epoch": 0.9823922282938676, + "grad_norm": 0.516215980052948, + "learning_rate": 0.00010290994380448293, + "loss": 0.217, + "step": 4854 + }, + { + "epoch": 0.9825946164743979, + "grad_norm": 0.43761691451072693, + "learning_rate": 0.00010287814997292369, + "loss": 0.2468, + "step": 4855 + }, + { + "epoch": 0.9827970046549281, + "grad_norm": 0.2564006745815277, + "learning_rate": 0.00010284635585018348, + "loss": 0.2195, + "step": 4856 + }, + { + "epoch": 0.9829993928354585, + "grad_norm": 0.23958547413349152, + "learning_rate": 0.0001028145614394788, + "loss": 0.1811, + "step": 4857 + }, + { + "epoch": 0.9832017810159887, + "grad_norm": 0.28001734614372253, + "learning_rate": 0.00010278276674402638, + "loss": 0.2325, + "step": 4858 + }, + { + "epoch": 0.983404169196519, + "grad_norm": 0.30932843685150146, + "learning_rate": 0.00010275097176704277, + "loss": 0.2612, + "step": 4859 + }, + { + "epoch": 0.9836065573770492, + "grad_norm": 0.26031824946403503, + "learning_rate": 0.00010271917651174475, + "loss": 0.2253, + "step": 4860 + }, + { + "epoch": 0.9838089455575795, + "grad_norm": 0.35603246092796326, + "learning_rate": 0.00010268738098134895, + "loss": 0.2432, + "step": 4861 + }, + { + "epoch": 0.9840113337381097, + "grad_norm": 0.3286120891571045, + "learning_rate": 0.00010265558517907216, + "loss": 0.2778, + "step": 4862 + }, + { + "epoch": 0.98421372191864, + "grad_norm": 0.38171565532684326, + "learning_rate": 0.00010262378910813116, + "loss": 0.2569, + "step": 4863 + }, + { + "epoch": 0.9844161100991702, + "grad_norm": 0.27509549260139465, + "learning_rate": 0.00010259199277174266, + "loss": 0.2217, + "step": 4864 + }, + { + "epoch": 0.9846184982797005, + "grad_norm": 0.32143381237983704, + "learning_rate": 0.00010256019617312353, + "loss": 0.2705, + "step": 4865 + }, + { + "epoch": 0.9848208864602307, + "grad_norm": 0.28621792793273926, + "learning_rate": 0.00010252839931549063, + "loss": 0.2186, + "step": 4866 + }, + { + "epoch": 0.985023274640761, + "grad_norm": 0.26840049028396606, + "learning_rate": 0.0001024966022020608, + "loss": 0.2089, + "step": 4867 + }, + { + "epoch": 0.9852256628212912, + "grad_norm": 0.3102000057697296, + "learning_rate": 0.00010246480483605097, + "loss": 0.2216, + "step": 4868 + }, + { + "epoch": 0.9854280510018215, + "grad_norm": 0.28217270970344543, + "learning_rate": 0.00010243300722067806, + "loss": 0.2367, + "step": 4869 + }, + { + "epoch": 0.9856304391823517, + "grad_norm": 0.2826128602027893, + "learning_rate": 0.00010240120935915898, + "loss": 0.2139, + "step": 4870 + }, + { + "epoch": 0.985832827362882, + "grad_norm": 0.3096916675567627, + "learning_rate": 0.00010236941125471076, + "loss": 0.2775, + "step": 4871 + }, + { + "epoch": 0.9860352155434122, + "grad_norm": 0.27654576301574707, + "learning_rate": 0.00010233761291055035, + "loss": 0.2395, + "step": 4872 + }, + { + "epoch": 0.9862376037239425, + "grad_norm": 0.2547740340232849, + "learning_rate": 0.0001023058143298948, + "loss": 0.22, + "step": 4873 + }, + { + "epoch": 0.9864399919044727, + "grad_norm": 0.4360198974609375, + "learning_rate": 0.00010227401551596116, + "loss": 0.2752, + "step": 4874 + }, + { + "epoch": 0.986642380085003, + "grad_norm": 0.30171704292297363, + "learning_rate": 0.0001022422164719665, + "loss": 0.2812, + "step": 4875 + }, + { + "epoch": 0.9868447682655332, + "grad_norm": 0.3533839285373688, + "learning_rate": 0.00010221041720112789, + "loss": 0.2314, + "step": 4876 + }, + { + "epoch": 0.9870471564460636, + "grad_norm": 0.24221685528755188, + "learning_rate": 0.00010217861770666246, + "loss": 0.2042, + "step": 4877 + }, + { + "epoch": 0.9872495446265938, + "grad_norm": 0.35867074131965637, + "learning_rate": 0.00010214681799178736, + "loss": 0.2592, + "step": 4878 + }, + { + "epoch": 0.9874519328071241, + "grad_norm": 0.24381330609321594, + "learning_rate": 0.00010211501805971973, + "loss": 0.2275, + "step": 4879 + }, + { + "epoch": 0.9876543209876543, + "grad_norm": 0.2809392809867859, + "learning_rate": 0.00010208321791367676, + "loss": 0.2372, + "step": 4880 + }, + { + "epoch": 0.9878567091681846, + "grad_norm": 0.3226775825023651, + "learning_rate": 0.00010205141755687566, + "loss": 0.2624, + "step": 4881 + }, + { + "epoch": 0.9880590973487149, + "grad_norm": 0.27383941411972046, + "learning_rate": 0.00010201961699253366, + "loss": 0.237, + "step": 4882 + }, + { + "epoch": 0.9882614855292451, + "grad_norm": 0.35411977767944336, + "learning_rate": 0.00010198781622386802, + "loss": 0.2432, + "step": 4883 + }, + { + "epoch": 0.9884638737097754, + "grad_norm": 0.27126365900039673, + "learning_rate": 0.00010195601525409594, + "loss": 0.219, + "step": 4884 + }, + { + "epoch": 0.9886662618903056, + "grad_norm": 0.24518761038780212, + "learning_rate": 0.00010192421408643484, + "loss": 0.2377, + "step": 4885 + }, + { + "epoch": 0.9888686500708359, + "grad_norm": 0.2346014827489853, + "learning_rate": 0.0001018924127241019, + "loss": 0.1761, + "step": 4886 + }, + { + "epoch": 0.9890710382513661, + "grad_norm": 0.29043862223625183, + "learning_rate": 0.00010186061117031452, + "loss": 0.2313, + "step": 4887 + }, + { + "epoch": 0.9892734264318964, + "grad_norm": 0.2963557839393616, + "learning_rate": 0.00010182880942829001, + "loss": 0.2298, + "step": 4888 + }, + { + "epoch": 0.9894758146124266, + "grad_norm": 0.261079341173172, + "learning_rate": 0.00010179700750124576, + "loss": 0.233, + "step": 4889 + }, + { + "epoch": 0.9896782027929569, + "grad_norm": 0.30412137508392334, + "learning_rate": 0.00010176520539239913, + "loss": 0.2331, + "step": 4890 + }, + { + "epoch": 0.9898805909734871, + "grad_norm": 0.24819315969944, + "learning_rate": 0.00010173340310496757, + "loss": 0.2242, + "step": 4891 + }, + { + "epoch": 0.9900829791540174, + "grad_norm": 0.2694309651851654, + "learning_rate": 0.00010170160064216844, + "loss": 0.2094, + "step": 4892 + }, + { + "epoch": 0.9902853673345476, + "grad_norm": 0.3118360936641693, + "learning_rate": 0.00010166979800721923, + "loss": 0.2264, + "step": 4893 + }, + { + "epoch": 0.990487755515078, + "grad_norm": 0.3211439251899719, + "learning_rate": 0.00010163799520333739, + "loss": 0.2413, + "step": 4894 + }, + { + "epoch": 0.9906901436956081, + "grad_norm": 0.3218715786933899, + "learning_rate": 0.00010160619223374035, + "loss": 0.2514, + "step": 4895 + }, + { + "epoch": 0.9908925318761385, + "grad_norm": 0.25901928544044495, + "learning_rate": 0.00010157438910164568, + "loss": 0.195, + "step": 4896 + }, + { + "epoch": 0.9910949200566687, + "grad_norm": 0.2728163003921509, + "learning_rate": 0.0001015425858102708, + "loss": 0.2343, + "step": 4897 + }, + { + "epoch": 0.991297308237199, + "grad_norm": 0.2993209660053253, + "learning_rate": 0.00010151078236283331, + "loss": 0.2104, + "step": 4898 + }, + { + "epoch": 0.9914996964177292, + "grad_norm": 0.2572130560874939, + "learning_rate": 0.00010147897876255068, + "loss": 0.2327, + "step": 4899 + }, + { + "epoch": 0.9917020845982595, + "grad_norm": 0.2839822471141815, + "learning_rate": 0.00010144717501264052, + "loss": 0.2537, + "step": 4900 + }, + { + "epoch": 0.9917020845982595, + "eval_loss": 0.2662275433540344, + "eval_runtime": 0.7372, + "eval_samples_per_second": 6.782, + "eval_steps_per_second": 1.356, + "step": 4900 + }, + { + "epoch": 0.9919044727787897, + "grad_norm": 0.3022128939628601, + "learning_rate": 0.00010141537111632036, + "loss": 0.2642, + "step": 4901 + }, + { + "epoch": 0.99210686095932, + "grad_norm": 0.34551921486854553, + "learning_rate": 0.00010138356707680778, + "loss": 0.2149, + "step": 4902 + }, + { + "epoch": 0.9923092491398502, + "grad_norm": 0.27679723501205444, + "learning_rate": 0.00010135176289732044, + "loss": 0.216, + "step": 4903 + }, + { + "epoch": 0.9925116373203805, + "grad_norm": 0.2436773031949997, + "learning_rate": 0.00010131995858107591, + "loss": 0.2195, + "step": 4904 + }, + { + "epoch": 0.9927140255009107, + "grad_norm": 0.2481018751859665, + "learning_rate": 0.0001012881541312918, + "loss": 0.2017, + "step": 4905 + }, + { + "epoch": 0.992916413681441, + "grad_norm": 0.26413610577583313, + "learning_rate": 0.00010125634955118579, + "loss": 0.207, + "step": 4906 + }, + { + "epoch": 0.9931188018619712, + "grad_norm": 0.2637857496738434, + "learning_rate": 0.0001012245448439755, + "loss": 0.2297, + "step": 4907 + }, + { + "epoch": 0.9933211900425015, + "grad_norm": 0.264993816614151, + "learning_rate": 0.00010119274001287861, + "loss": 0.2246, + "step": 4908 + }, + { + "epoch": 0.9935235782230317, + "grad_norm": 0.3293372690677643, + "learning_rate": 0.00010116093506111282, + "loss": 0.2485, + "step": 4909 + }, + { + "epoch": 0.993725966403562, + "grad_norm": 0.6976776719093323, + "learning_rate": 0.00010112912999189579, + "loss": 0.2681, + "step": 4910 + }, + { + "epoch": 0.9939283545840923, + "grad_norm": 0.2869125008583069, + "learning_rate": 0.00010109732480844525, + "loss": 0.1983, + "step": 4911 + }, + { + "epoch": 0.9941307427646225, + "grad_norm": 0.2749207019805908, + "learning_rate": 0.00010106551951397887, + "loss": 0.2203, + "step": 4912 + }, + { + "epoch": 0.9943331309451529, + "grad_norm": 0.3956700265407562, + "learning_rate": 0.00010103371411171443, + "loss": 0.2762, + "step": 4913 + }, + { + "epoch": 0.994535519125683, + "grad_norm": 0.29219114780426025, + "learning_rate": 0.00010100190860486964, + "loss": 0.2497, + "step": 4914 + }, + { + "epoch": 0.9947379073062134, + "grad_norm": 0.3515431582927704, + "learning_rate": 0.00010097010299666226, + "loss": 0.2287, + "step": 4915 + }, + { + "epoch": 0.9949402954867436, + "grad_norm": 0.3369796872138977, + "learning_rate": 0.00010093829729031002, + "loss": 0.2184, + "step": 4916 + }, + { + "epoch": 0.9951426836672739, + "grad_norm": 0.2912400960922241, + "learning_rate": 0.00010090649148903071, + "loss": 0.1893, + "step": 4917 + }, + { + "epoch": 0.9953450718478041, + "grad_norm": 0.24918058514595032, + "learning_rate": 0.00010087468559604212, + "loss": 0.2059, + "step": 4918 + }, + { + "epoch": 0.9955474600283344, + "grad_norm": 0.35527414083480835, + "learning_rate": 0.000100842879614562, + "loss": 0.2088, + "step": 4919 + }, + { + "epoch": 0.9957498482088646, + "grad_norm": 0.2517155408859253, + "learning_rate": 0.00010081107354780816, + "loss": 0.2061, + "step": 4920 + }, + { + "epoch": 0.9959522363893949, + "grad_norm": 0.3073742985725403, + "learning_rate": 0.00010077926739899842, + "loss": 0.2299, + "step": 4921 + }, + { + "epoch": 0.9961546245699251, + "grad_norm": 0.2773192226886749, + "learning_rate": 0.00010074746117135057, + "loss": 0.2428, + "step": 4922 + }, + { + "epoch": 0.9963570127504554, + "grad_norm": 0.33836686611175537, + "learning_rate": 0.00010071565486808245, + "loss": 0.2654, + "step": 4923 + }, + { + "epoch": 0.9965594009309856, + "grad_norm": 0.2811589539051056, + "learning_rate": 0.00010068384849241188, + "loss": 0.2649, + "step": 4924 + }, + { + "epoch": 0.9967617891115159, + "grad_norm": 0.2710583508014679, + "learning_rate": 0.00010065204204755669, + "loss": 0.2459, + "step": 4925 + }, + { + "epoch": 0.9969641772920461, + "grad_norm": 0.3035294711589813, + "learning_rate": 0.00010062023553673474, + "loss": 0.2342, + "step": 4926 + }, + { + "epoch": 0.9971665654725764, + "grad_norm": 0.28561583161354065, + "learning_rate": 0.00010058842896316385, + "loss": 0.2556, + "step": 4927 + }, + { + "epoch": 0.9973689536531066, + "grad_norm": 0.26396119594573975, + "learning_rate": 0.00010055662233006192, + "loss": 0.2304, + "step": 4928 + }, + { + "epoch": 0.9975713418336369, + "grad_norm": 0.2924419343471527, + "learning_rate": 0.00010052481564064678, + "loss": 0.2298, + "step": 4929 + }, + { + "epoch": 0.9977737300141671, + "grad_norm": 0.27983298897743225, + "learning_rate": 0.00010049300889813627, + "loss": 0.2354, + "step": 4930 + }, + { + "epoch": 0.9979761181946974, + "grad_norm": 0.23018218576908112, + "learning_rate": 0.00010046120210574827, + "loss": 0.202, + "step": 4931 + }, + { + "epoch": 0.9981785063752276, + "grad_norm": 0.22712524235248566, + "learning_rate": 0.00010042939526670071, + "loss": 0.2301, + "step": 4932 + }, + { + "epoch": 0.998380894555758, + "grad_norm": 0.26420795917510986, + "learning_rate": 0.00010039758838421148, + "loss": 0.2161, + "step": 4933 + }, + { + "epoch": 0.9985832827362882, + "grad_norm": 0.31919482350349426, + "learning_rate": 0.00010036578146149838, + "loss": 0.2308, + "step": 4934 + }, + { + "epoch": 0.9987856709168185, + "grad_norm": 0.28096866607666016, + "learning_rate": 0.00010033397450177936, + "loss": 0.226, + "step": 4935 + }, + { + "epoch": 0.9989880590973487, + "grad_norm": 0.26480206847190857, + "learning_rate": 0.00010030216750827232, + "loss": 0.2491, + "step": 4936 + }, + { + "epoch": 0.999190447277879, + "grad_norm": 0.42638319730758667, + "learning_rate": 0.00010027036048419513, + "loss": 0.2322, + "step": 4937 + }, + { + "epoch": 0.9993928354584092, + "grad_norm": 0.3904888927936554, + "learning_rate": 0.00010023855343276572, + "loss": 0.2139, + "step": 4938 + }, + { + "epoch": 0.9995952236389395, + "grad_norm": 0.27836623787879944, + "learning_rate": 0.00010020674635720195, + "loss": 0.222, + "step": 4939 + }, + { + "epoch": 0.9997976118194697, + "grad_norm": 0.2650602459907532, + "learning_rate": 0.00010017493926072179, + "loss": 0.1917, + "step": 4940 + }, + { + "epoch": 1.0, + "grad_norm": 0.33457136154174805, + "learning_rate": 0.00010014313214654309, + "loss": 0.2885, + "step": 4941 + }, + { + "epoch": 1.0002023881805302, + "grad_norm": 0.47695091366767883, + "learning_rate": 0.00010011132501788379, + "loss": 0.1844, + "step": 4942 + }, + { + "epoch": 1.0004047763610606, + "grad_norm": 0.28054702281951904, + "learning_rate": 0.00010007951787796178, + "loss": 0.2, + "step": 4943 + }, + { + "epoch": 1.0006071645415908, + "grad_norm": 0.215849831700325, + "learning_rate": 0.000100047710729995, + "loss": 0.1708, + "step": 4944 + }, + { + "epoch": 1.000809552722121, + "grad_norm": 0.48324742913246155, + "learning_rate": 0.00010001590357720133, + "loss": 0.1951, + "step": 4945 + }, + { + "epoch": 1.0010119409026512, + "grad_norm": 0.24173052608966827, + "learning_rate": 9.99840964227987e-05, + "loss": 0.2146, + "step": 4946 + }, + { + "epoch": 1.0012143290831816, + "grad_norm": 0.2449575513601303, + "learning_rate": 9.995228927000504e-05, + "loss": 0.1778, + "step": 4947 + }, + { + "epoch": 1.0014167172637118, + "grad_norm": 0.3132277727127075, + "learning_rate": 9.992048212203823e-05, + "loss": 0.207, + "step": 4948 + }, + { + "epoch": 1.001619105444242, + "grad_norm": 0.5420759916305542, + "learning_rate": 9.988867498211624e-05, + "loss": 0.2027, + "step": 4949 + }, + { + "epoch": 1.0018214936247722, + "grad_norm": 0.22170519828796387, + "learning_rate": 9.985686785345693e-05, + "loss": 0.18, + "step": 4950 + }, + { + "epoch": 1.0018214936247722, + "eval_loss": 0.26427409052848816, + "eval_runtime": 0.7388, + "eval_samples_per_second": 6.768, + "eval_steps_per_second": 1.354, + "step": 4950 + }, + { + "epoch": 1.0020238818053027, + "grad_norm": 0.277170866727829, + "learning_rate": 9.982506073927822e-05, + "loss": 0.1889, + "step": 4951 + }, + { + "epoch": 1.0022262699858329, + "grad_norm": 0.24262042343616486, + "learning_rate": 9.979325364279803e-05, + "loss": 0.1763, + "step": 4952 + }, + { + "epoch": 1.002428658166363, + "grad_norm": 0.2713542580604553, + "learning_rate": 9.976144656723429e-05, + "loss": 0.2073, + "step": 4953 + }, + { + "epoch": 1.0026310463468933, + "grad_norm": 0.27338194847106934, + "learning_rate": 9.972963951580486e-05, + "loss": 0.1863, + "step": 4954 + }, + { + "epoch": 1.0028334345274237, + "grad_norm": 0.29679569602012634, + "learning_rate": 9.969783249172767e-05, + "loss": 0.2305, + "step": 4955 + }, + { + "epoch": 1.0030358227079539, + "grad_norm": 0.28192949295043945, + "learning_rate": 9.966602549822063e-05, + "loss": 0.1952, + "step": 4956 + }, + { + "epoch": 1.003238210888484, + "grad_norm": 0.2806640863418579, + "learning_rate": 9.963421853850163e-05, + "loss": 0.2047, + "step": 4957 + }, + { + "epoch": 1.0034405990690143, + "grad_norm": 0.32658886909484863, + "learning_rate": 9.960241161578855e-05, + "loss": 0.2308, + "step": 4958 + }, + { + "epoch": 1.0036429872495447, + "grad_norm": 0.4036096930503845, + "learning_rate": 9.95706047332993e-05, + "loss": 0.2399, + "step": 4959 + }, + { + "epoch": 1.003845375430075, + "grad_norm": 0.2438739687204361, + "learning_rate": 9.953879789425174e-05, + "loss": 0.1893, + "step": 4960 + }, + { + "epoch": 1.004047763610605, + "grad_norm": 0.39129194617271423, + "learning_rate": 9.950699110186378e-05, + "loss": 0.2041, + "step": 4961 + }, + { + "epoch": 1.0042501517911353, + "grad_norm": 0.25112760066986084, + "learning_rate": 9.947518435935328e-05, + "loss": 0.182, + "step": 4962 + }, + { + "epoch": 1.0044525399716657, + "grad_norm": 0.4767323434352875, + "learning_rate": 9.944337766993812e-05, + "loss": 0.2276, + "step": 4963 + }, + { + "epoch": 1.004654928152196, + "grad_norm": 0.28244414925575256, + "learning_rate": 9.941157103683617e-05, + "loss": 0.195, + "step": 4964 + }, + { + "epoch": 1.0048573163327261, + "grad_norm": 0.3557490408420563, + "learning_rate": 9.937976446326529e-05, + "loss": 0.2082, + "step": 4965 + }, + { + "epoch": 1.0050597045132563, + "grad_norm": 0.3751201927661896, + "learning_rate": 9.934795795244333e-05, + "loss": 0.2105, + "step": 4966 + }, + { + "epoch": 1.0052620926937867, + "grad_norm": 0.3247845470905304, + "learning_rate": 9.931615150758814e-05, + "loss": 0.1828, + "step": 4967 + }, + { + "epoch": 1.005464480874317, + "grad_norm": 0.3247244358062744, + "learning_rate": 9.928434513191757e-05, + "loss": 0.1867, + "step": 4968 + }, + { + "epoch": 1.0056668690548471, + "grad_norm": 0.2909950911998749, + "learning_rate": 9.925253882864944e-05, + "loss": 0.2101, + "step": 4969 + }, + { + "epoch": 1.0058692572353773, + "grad_norm": 0.29122984409332275, + "learning_rate": 9.922073260100161e-05, + "loss": 0.1965, + "step": 4970 + }, + { + "epoch": 1.0060716454159078, + "grad_norm": 0.36983293294906616, + "learning_rate": 9.918892645219187e-05, + "loss": 0.2161, + "step": 4971 + }, + { + "epoch": 1.006274033596438, + "grad_norm": 0.2427636981010437, + "learning_rate": 9.915712038543803e-05, + "loss": 0.1583, + "step": 4972 + }, + { + "epoch": 1.0064764217769682, + "grad_norm": 0.30179157853126526, + "learning_rate": 9.912531440395792e-05, + "loss": 0.1955, + "step": 4973 + }, + { + "epoch": 1.0066788099574986, + "grad_norm": 0.25663846731185913, + "learning_rate": 9.90935085109693e-05, + "loss": 0.2038, + "step": 4974 + }, + { + "epoch": 1.0068811981380288, + "grad_norm": 0.2936505079269409, + "learning_rate": 9.906170270968999e-05, + "loss": 0.1888, + "step": 4975 + }, + { + "epoch": 1.007083586318559, + "grad_norm": 0.24913620948791504, + "learning_rate": 9.902989700333775e-05, + "loss": 0.1602, + "step": 4976 + }, + { + "epoch": 1.0072859744990892, + "grad_norm": 0.25665563344955444, + "learning_rate": 9.899809139513037e-05, + "loss": 0.189, + "step": 4977 + }, + { + "epoch": 1.0074883626796196, + "grad_norm": 0.2616618573665619, + "learning_rate": 9.896628588828557e-05, + "loss": 0.1727, + "step": 4978 + }, + { + "epoch": 1.0076907508601498, + "grad_norm": 0.35549241304397583, + "learning_rate": 9.893448048602114e-05, + "loss": 0.2156, + "step": 4979 + }, + { + "epoch": 1.00789313904068, + "grad_norm": 0.30542701482772827, + "learning_rate": 9.890267519155479e-05, + "loss": 0.1956, + "step": 4980 + }, + { + "epoch": 1.0080955272212102, + "grad_norm": 0.7851333618164062, + "learning_rate": 9.887087000810424e-05, + "loss": 0.2317, + "step": 4981 + }, + { + "epoch": 1.0082979154017406, + "grad_norm": 0.2573649287223816, + "learning_rate": 9.88390649388872e-05, + "loss": 0.1782, + "step": 4982 + }, + { + "epoch": 1.0085003035822708, + "grad_norm": 0.2208670824766159, + "learning_rate": 9.880725998712141e-05, + "loss": 0.1836, + "step": 4983 + }, + { + "epoch": 1.008702691762801, + "grad_norm": 0.29855644702911377, + "learning_rate": 9.877545515602453e-05, + "loss": 0.2128, + "step": 4984 + }, + { + "epoch": 1.0089050799433312, + "grad_norm": 0.24554845690727234, + "learning_rate": 9.874365044881424e-05, + "loss": 0.2037, + "step": 4985 + }, + { + "epoch": 1.0091074681238617, + "grad_norm": 0.2785384953022003, + "learning_rate": 9.871184586870822e-05, + "loss": 0.2101, + "step": 4986 + }, + { + "epoch": 1.0093098563043919, + "grad_norm": 0.45173802971839905, + "learning_rate": 9.868004141892411e-05, + "loss": 0.1869, + "step": 4987 + }, + { + "epoch": 1.009512244484922, + "grad_norm": 0.2669700086116791, + "learning_rate": 9.864823710267958e-05, + "loss": 0.2041, + "step": 4988 + }, + { + "epoch": 1.0097146326654522, + "grad_norm": 0.23057915270328522, + "learning_rate": 9.861643292319223e-05, + "loss": 0.1516, + "step": 4989 + }, + { + "epoch": 1.0099170208459827, + "grad_norm": 0.67122882604599, + "learning_rate": 9.858462888367967e-05, + "loss": 0.1867, + "step": 4990 + }, + { + "epoch": 1.0101194090265129, + "grad_norm": 0.4083704948425293, + "learning_rate": 9.855282498735952e-05, + "loss": 0.194, + "step": 4991 + }, + { + "epoch": 1.010321797207043, + "grad_norm": 0.26877662539482117, + "learning_rate": 9.852102123744934e-05, + "loss": 0.2156, + "step": 4992 + }, + { + "epoch": 1.0105241853875733, + "grad_norm": 0.27300938963890076, + "learning_rate": 9.848921763716672e-05, + "loss": 0.2129, + "step": 4993 + }, + { + "epoch": 1.0107265735681037, + "grad_norm": 0.30437150597572327, + "learning_rate": 9.845741418972921e-05, + "loss": 0.2152, + "step": 4994 + }, + { + "epoch": 1.010928961748634, + "grad_norm": 0.3953990340232849, + "learning_rate": 9.842561089835433e-05, + "loss": 0.2392, + "step": 4995 + }, + { + "epoch": 1.011131349929164, + "grad_norm": 0.2798847556114197, + "learning_rate": 9.839380776625963e-05, + "loss": 0.1938, + "step": 4996 + }, + { + "epoch": 1.0113337381096943, + "grad_norm": 0.3126393258571625, + "learning_rate": 9.836200479666262e-05, + "loss": 0.2221, + "step": 4997 + }, + { + "epoch": 1.0115361262902247, + "grad_norm": 0.2901754081249237, + "learning_rate": 9.833020199278075e-05, + "loss": 0.2089, + "step": 4998 + }, + { + "epoch": 1.011738514470755, + "grad_norm": 0.6395556330680847, + "learning_rate": 9.829839935783155e-05, + "loss": 0.213, + "step": 4999 + }, + { + "epoch": 1.0119409026512851, + "grad_norm": 0.3289041519165039, + "learning_rate": 9.826659689503244e-05, + "loss": 0.1602, + "step": 5000 + }, + { + "epoch": 1.0119409026512851, + "eval_loss": 0.2699425518512726, + "eval_runtime": 0.7372, + "eval_samples_per_second": 6.783, + "eval_steps_per_second": 1.357, + "step": 5000 + }, + { + "epoch": 1.0121432908318153, + "grad_norm": 0.2821512222290039, + "learning_rate": 9.823479460760085e-05, + "loss": 0.2042, + "step": 5001 + }, + { + "epoch": 1.0123456790123457, + "grad_norm": 0.3883262276649475, + "learning_rate": 9.820299249875429e-05, + "loss": 0.194, + "step": 5002 + }, + { + "epoch": 1.012548067192876, + "grad_norm": 0.30730289220809937, + "learning_rate": 9.817119057171003e-05, + "loss": 0.1453, + "step": 5003 + }, + { + "epoch": 1.0127504553734061, + "grad_norm": 0.26515355706214905, + "learning_rate": 9.813938882968552e-05, + "loss": 0.1912, + "step": 5004 + }, + { + "epoch": 1.0129528435539366, + "grad_norm": 0.2594118118286133, + "learning_rate": 9.810758727589813e-05, + "loss": 0.1755, + "step": 5005 + }, + { + "epoch": 1.0131552317344668, + "grad_norm": 0.27748867869377136, + "learning_rate": 9.80757859135652e-05, + "loss": 0.2065, + "step": 5006 + }, + { + "epoch": 1.013357619914997, + "grad_norm": 0.29483193159103394, + "learning_rate": 9.804398474590407e-05, + "loss": 0.2022, + "step": 5007 + }, + { + "epoch": 1.0135600080955272, + "grad_norm": 0.2991485297679901, + "learning_rate": 9.8012183776132e-05, + "loss": 0.2201, + "step": 5008 + }, + { + "epoch": 1.0137623962760576, + "grad_norm": 0.3728349506855011, + "learning_rate": 9.798038300746635e-05, + "loss": 0.1912, + "step": 5009 + }, + { + "epoch": 1.0139647844565878, + "grad_norm": 0.32748380303382874, + "learning_rate": 9.794858244312436e-05, + "loss": 0.1714, + "step": 5010 + }, + { + "epoch": 1.014167172637118, + "grad_norm": 0.2615242898464203, + "learning_rate": 9.791678208632326e-05, + "loss": 0.199, + "step": 5011 + }, + { + "epoch": 1.0143695608176482, + "grad_norm": 0.28860002756118774, + "learning_rate": 9.788498194028031e-05, + "loss": 0.1711, + "step": 5012 + }, + { + "epoch": 1.0145719489981786, + "grad_norm": 0.2724457383155823, + "learning_rate": 9.785318200821267e-05, + "loss": 0.1964, + "step": 5013 + }, + { + "epoch": 1.0147743371787088, + "grad_norm": 0.28530627489089966, + "learning_rate": 9.782138229333755e-05, + "loss": 0.1776, + "step": 5014 + }, + { + "epoch": 1.014976725359239, + "grad_norm": 0.2859584093093872, + "learning_rate": 9.778958279887213e-05, + "loss": 0.1822, + "step": 5015 + }, + { + "epoch": 1.0151791135397692, + "grad_norm": 0.33056363463401794, + "learning_rate": 9.775778352803352e-05, + "loss": 0.1731, + "step": 5016 + }, + { + "epoch": 1.0153815017202996, + "grad_norm": 0.3300340175628662, + "learning_rate": 9.772598448403885e-05, + "loss": 0.1751, + "step": 5017 + }, + { + "epoch": 1.0155838899008298, + "grad_norm": 0.3166051506996155, + "learning_rate": 9.76941856701052e-05, + "loss": 0.2012, + "step": 5018 + }, + { + "epoch": 1.01578627808136, + "grad_norm": 0.26906871795654297, + "learning_rate": 9.766238708944965e-05, + "loss": 0.1994, + "step": 5019 + }, + { + "epoch": 1.0159886662618902, + "grad_norm": 0.274949848651886, + "learning_rate": 9.763058874528925e-05, + "loss": 0.212, + "step": 5020 + }, + { + "epoch": 1.0161910544424206, + "grad_norm": 0.31172770261764526, + "learning_rate": 9.759879064084102e-05, + "loss": 0.2387, + "step": 5021 + }, + { + "epoch": 1.0163934426229508, + "grad_norm": 0.3426840603351593, + "learning_rate": 9.756699277932195e-05, + "loss": 0.2097, + "step": 5022 + }, + { + "epoch": 1.016595830803481, + "grad_norm": 0.48346537351608276, + "learning_rate": 9.753519516394903e-05, + "loss": 0.1998, + "step": 5023 + }, + { + "epoch": 1.0167982189840112, + "grad_norm": 0.3097973167896271, + "learning_rate": 9.750339779793923e-05, + "loss": 0.2075, + "step": 5024 + }, + { + "epoch": 1.0170006071645417, + "grad_norm": 0.2547002136707306, + "learning_rate": 9.74716006845094e-05, + "loss": 0.187, + "step": 5025 + }, + { + "epoch": 1.0172029953450719, + "grad_norm": 0.26773279905319214, + "learning_rate": 9.743980382687651e-05, + "loss": 0.1875, + "step": 5026 + }, + { + "epoch": 1.017405383525602, + "grad_norm": 0.25491225719451904, + "learning_rate": 9.74080072282574e-05, + "loss": 0.1675, + "step": 5027 + }, + { + "epoch": 1.0176077717061323, + "grad_norm": 0.35699644684791565, + "learning_rate": 9.73762108918689e-05, + "loss": 0.1824, + "step": 5028 + }, + { + "epoch": 1.0178101598866627, + "grad_norm": 0.29440072178840637, + "learning_rate": 9.734441482092786e-05, + "loss": 0.1621, + "step": 5029 + }, + { + "epoch": 1.0180125480671929, + "grad_norm": 0.3017171323299408, + "learning_rate": 9.731261901865107e-05, + "loss": 0.1913, + "step": 5030 + }, + { + "epoch": 1.018214936247723, + "grad_norm": 0.3333732783794403, + "learning_rate": 9.728082348825526e-05, + "loss": 0.2166, + "step": 5031 + }, + { + "epoch": 1.0184173244282535, + "grad_norm": 0.3623688519001007, + "learning_rate": 9.724902823295724e-05, + "loss": 0.2236, + "step": 5032 + }, + { + "epoch": 1.0186197126087837, + "grad_norm": 0.2425580769777298, + "learning_rate": 9.721723325597365e-05, + "loss": 0.191, + "step": 5033 + }, + { + "epoch": 1.018822100789314, + "grad_norm": 0.23994003236293793, + "learning_rate": 9.71854385605212e-05, + "loss": 0.217, + "step": 5034 + }, + { + "epoch": 1.019024488969844, + "grad_norm": 0.26265624165534973, + "learning_rate": 9.715364414981656e-05, + "loss": 0.2137, + "step": 5035 + }, + { + "epoch": 1.0192268771503745, + "grad_norm": 0.26903507113456726, + "learning_rate": 9.712185002707634e-05, + "loss": 0.1842, + "step": 5036 + }, + { + "epoch": 1.0194292653309047, + "grad_norm": 0.30898502469062805, + "learning_rate": 9.709005619551709e-05, + "loss": 0.1837, + "step": 5037 + }, + { + "epoch": 1.019631653511435, + "grad_norm": 0.3141460418701172, + "learning_rate": 9.705826265835547e-05, + "loss": 0.1913, + "step": 5038 + }, + { + "epoch": 1.0198340416919651, + "grad_norm": 0.29945555329322815, + "learning_rate": 9.702646941880794e-05, + "loss": 0.2066, + "step": 5039 + }, + { + "epoch": 1.0200364298724955, + "grad_norm": 0.2980521023273468, + "learning_rate": 9.699467648009105e-05, + "loss": 0.1909, + "step": 5040 + }, + { + "epoch": 1.0202388180530257, + "grad_norm": 0.31508907675743103, + "learning_rate": 9.696288384542125e-05, + "loss": 0.2089, + "step": 5041 + }, + { + "epoch": 1.020441206233556, + "grad_norm": 0.29253244400024414, + "learning_rate": 9.693109151801499e-05, + "loss": 0.1968, + "step": 5042 + }, + { + "epoch": 1.0206435944140861, + "grad_norm": 0.2746085226535797, + "learning_rate": 9.68992995010887e-05, + "loss": 0.1936, + "step": 5043 + }, + { + "epoch": 1.0208459825946166, + "grad_norm": 0.3697469234466553, + "learning_rate": 9.686750779785875e-05, + "loss": 0.193, + "step": 5044 + }, + { + "epoch": 1.0210483707751468, + "grad_norm": 0.3167676031589508, + "learning_rate": 9.683571641154149e-05, + "loss": 0.1672, + "step": 5045 + }, + { + "epoch": 1.021250758955677, + "grad_norm": 0.3159421384334564, + "learning_rate": 9.680392534535328e-05, + "loss": 0.2296, + "step": 5046 + }, + { + "epoch": 1.0214531471362072, + "grad_norm": 0.2636842429637909, + "learning_rate": 9.677213460251033e-05, + "loss": 0.1887, + "step": 5047 + }, + { + "epoch": 1.0216555353167376, + "grad_norm": 0.2759999930858612, + "learning_rate": 9.674034418622894e-05, + "loss": 0.1876, + "step": 5048 + }, + { + "epoch": 1.0218579234972678, + "grad_norm": 0.3974986970424652, + "learning_rate": 9.67085540997253e-05, + "loss": 0.2078, + "step": 5049 + }, + { + "epoch": 1.022060311677798, + "grad_norm": 0.336736798286438, + "learning_rate": 9.667676434621564e-05, + "loss": 0.1857, + "step": 5050 + }, + { + "epoch": 1.022060311677798, + "eval_loss": 0.27779608964920044, + "eval_runtime": 0.7405, + "eval_samples_per_second": 6.753, + "eval_steps_per_second": 1.351, + "step": 5050 + }, + { + "epoch": 1.0222626998583282, + "grad_norm": 0.27973130345344543, + "learning_rate": 9.664497492891607e-05, + "loss": 0.2119, + "step": 5051 + }, + { + "epoch": 1.0224650880388586, + "grad_norm": 0.3350655734539032, + "learning_rate": 9.661318585104276e-05, + "loss": 0.1885, + "step": 5052 + }, + { + "epoch": 1.0226674762193888, + "grad_norm": 0.22865338623523712, + "learning_rate": 9.658139711581175e-05, + "loss": 0.1665, + "step": 5053 + }, + { + "epoch": 1.022869864399919, + "grad_norm": 0.3354731500148773, + "learning_rate": 9.654960872643913e-05, + "loss": 0.2064, + "step": 5054 + }, + { + "epoch": 1.0230722525804492, + "grad_norm": 0.24740594625473022, + "learning_rate": 9.651782068614087e-05, + "loss": 0.1743, + "step": 5055 + }, + { + "epoch": 1.0232746407609796, + "grad_norm": 0.2912404537200928, + "learning_rate": 9.648603299813298e-05, + "loss": 0.1991, + "step": 5056 + }, + { + "epoch": 1.0234770289415098, + "grad_norm": 0.3296900689601898, + "learning_rate": 9.64542456656314e-05, + "loss": 0.2185, + "step": 5057 + }, + { + "epoch": 1.02367941712204, + "grad_norm": 0.2634105682373047, + "learning_rate": 9.642245869185204e-05, + "loss": 0.1859, + "step": 5058 + }, + { + "epoch": 1.0238818053025702, + "grad_norm": 0.311507910490036, + "learning_rate": 9.639067208001077e-05, + "loss": 0.2318, + "step": 5059 + }, + { + "epoch": 1.0240841934831006, + "grad_norm": 0.3055361211299896, + "learning_rate": 9.635888583332344e-05, + "loss": 0.2004, + "step": 5060 + }, + { + "epoch": 1.0242865816636308, + "grad_norm": 0.276246577501297, + "learning_rate": 9.632709995500583e-05, + "loss": 0.2175, + "step": 5061 + }, + { + "epoch": 1.024488969844161, + "grad_norm": 0.2705673277378082, + "learning_rate": 9.629531444827369e-05, + "loss": 0.1903, + "step": 5062 + }, + { + "epoch": 1.0246913580246915, + "grad_norm": 0.322270005941391, + "learning_rate": 9.626352931634279e-05, + "loss": 0.2274, + "step": 5063 + }, + { + "epoch": 1.0248937462052217, + "grad_norm": 0.2642096281051636, + "learning_rate": 9.623174456242875e-05, + "loss": 0.1844, + "step": 5064 + }, + { + "epoch": 1.0250961343857519, + "grad_norm": 0.285813570022583, + "learning_rate": 9.619996018974728e-05, + "loss": 0.1688, + "step": 5065 + }, + { + "epoch": 1.025298522566282, + "grad_norm": 0.2848520874977112, + "learning_rate": 9.616817620151394e-05, + "loss": 0.2199, + "step": 5066 + }, + { + "epoch": 1.0255009107468125, + "grad_norm": 0.3013867139816284, + "learning_rate": 9.613639260094436e-05, + "loss": 0.197, + "step": 5067 + }, + { + "epoch": 1.0257032989273427, + "grad_norm": 0.30691197514533997, + "learning_rate": 9.610460939125407e-05, + "loss": 0.1734, + "step": 5068 + }, + { + "epoch": 1.0259056871078729, + "grad_norm": 0.26753026247024536, + "learning_rate": 9.607282657565848e-05, + "loss": 0.1984, + "step": 5069 + }, + { + "epoch": 1.026108075288403, + "grad_norm": 0.300430029630661, + "learning_rate": 9.604104415737308e-05, + "loss": 0.2048, + "step": 5070 + }, + { + "epoch": 1.0263104634689335, + "grad_norm": 0.3072032034397125, + "learning_rate": 9.60092621396133e-05, + "loss": 0.2279, + "step": 5071 + }, + { + "epoch": 1.0265128516494637, + "grad_norm": 0.2822709083557129, + "learning_rate": 9.597748052559451e-05, + "loss": 0.2083, + "step": 5072 + }, + { + "epoch": 1.026715239829994, + "grad_norm": 0.29260656237602234, + "learning_rate": 9.594569931853203e-05, + "loss": 0.2037, + "step": 5073 + }, + { + "epoch": 1.026917628010524, + "grad_norm": 0.291266530752182, + "learning_rate": 9.591391852164114e-05, + "loss": 0.1882, + "step": 5074 + }, + { + "epoch": 1.0271200161910545, + "grad_norm": 0.3024618327617645, + "learning_rate": 9.58821381381371e-05, + "loss": 0.2165, + "step": 5075 + }, + { + "epoch": 1.0273224043715847, + "grad_norm": 0.2564701437950134, + "learning_rate": 9.585035817123513e-05, + "loss": 0.1867, + "step": 5076 + }, + { + "epoch": 1.027524792552115, + "grad_norm": 0.28993088006973267, + "learning_rate": 9.581857862415037e-05, + "loss": 0.22, + "step": 5077 + }, + { + "epoch": 1.0277271807326451, + "grad_norm": 0.386116087436676, + "learning_rate": 9.578679950009794e-05, + "loss": 0.2035, + "step": 5078 + }, + { + "epoch": 1.0279295689131756, + "grad_norm": 1.0133898258209229, + "learning_rate": 9.575502080229295e-05, + "loss": 0.2139, + "step": 5079 + }, + { + "epoch": 1.0281319570937058, + "grad_norm": 0.2750903069972992, + "learning_rate": 9.57232425339504e-05, + "loss": 0.1953, + "step": 5080 + }, + { + "epoch": 1.028334345274236, + "grad_norm": 0.3563201129436493, + "learning_rate": 9.56914646982853e-05, + "loss": 0.1939, + "step": 5081 + }, + { + "epoch": 1.0285367334547661, + "grad_norm": 0.2231059968471527, + "learning_rate": 9.565968729851258e-05, + "loss": 0.1811, + "step": 5082 + }, + { + "epoch": 1.0287391216352966, + "grad_norm": 0.2834724187850952, + "learning_rate": 9.562791033784718e-05, + "loss": 0.2136, + "step": 5083 + }, + { + "epoch": 1.0289415098158268, + "grad_norm": 0.3485587239265442, + "learning_rate": 9.559613381950391e-05, + "loss": 0.2326, + "step": 5084 + }, + { + "epoch": 1.029143897996357, + "grad_norm": 0.31048068404197693, + "learning_rate": 9.556435774669763e-05, + "loss": 0.2059, + "step": 5085 + }, + { + "epoch": 1.0293462861768872, + "grad_norm": 0.24974344670772552, + "learning_rate": 9.553258212264308e-05, + "loss": 0.1914, + "step": 5086 + }, + { + "epoch": 1.0295486743574176, + "grad_norm": 0.2568418085575104, + "learning_rate": 9.5500806950555e-05, + "loss": 0.205, + "step": 5087 + }, + { + "epoch": 1.0297510625379478, + "grad_norm": 0.36947062611579895, + "learning_rate": 9.546903223364806e-05, + "loss": 0.2097, + "step": 5088 + }, + { + "epoch": 1.029953450718478, + "grad_norm": 0.3109140694141388, + "learning_rate": 9.54372579751369e-05, + "loss": 0.1712, + "step": 5089 + }, + { + "epoch": 1.0301558388990082, + "grad_norm": 0.2524207532405853, + "learning_rate": 9.540548417823609e-05, + "loss": 0.2061, + "step": 5090 + }, + { + "epoch": 1.0303582270795386, + "grad_norm": 0.6381499171257019, + "learning_rate": 9.537371084616021e-05, + "loss": 0.1702, + "step": 5091 + }, + { + "epoch": 1.0305606152600688, + "grad_norm": 0.29690369963645935, + "learning_rate": 9.53419379821237e-05, + "loss": 0.1703, + "step": 5092 + }, + { + "epoch": 1.030763003440599, + "grad_norm": 0.312467485666275, + "learning_rate": 9.531016558934103e-05, + "loss": 0.2054, + "step": 5093 + }, + { + "epoch": 1.0309653916211294, + "grad_norm": 0.30818724632263184, + "learning_rate": 9.527839367102661e-05, + "loss": 0.1933, + "step": 5094 + }, + { + "epoch": 1.0311677798016596, + "grad_norm": 0.2714020013809204, + "learning_rate": 9.524662223039476e-05, + "loss": 0.1909, + "step": 5095 + }, + { + "epoch": 1.0313701679821898, + "grad_norm": 0.2716444730758667, + "learning_rate": 9.52148512706598e-05, + "loss": 0.1573, + "step": 5096 + }, + { + "epoch": 1.03157255616272, + "grad_norm": 0.25801146030426025, + "learning_rate": 9.518308079503599e-05, + "loss": 0.1843, + "step": 5097 + }, + { + "epoch": 1.0317749443432505, + "grad_norm": 0.3397352993488312, + "learning_rate": 9.515131080673751e-05, + "loss": 0.2089, + "step": 5098 + }, + { + "epoch": 1.0319773325237807, + "grad_norm": 0.31652987003326416, + "learning_rate": 9.511954130897851e-05, + "loss": 0.1899, + "step": 5099 + }, + { + "epoch": 1.0321797207043109, + "grad_norm": 0.7190924286842346, + "learning_rate": 9.508777230497313e-05, + "loss": 0.2048, + "step": 5100 + }, + { + "epoch": 1.0321797207043109, + "eval_loss": 0.2794642746448517, + "eval_runtime": 0.7373, + "eval_samples_per_second": 6.781, + "eval_steps_per_second": 1.356, + "step": 5100 + }, + { + "epoch": 1.032382108884841, + "grad_norm": 0.3107394874095917, + "learning_rate": 9.50560037979354e-05, + "loss": 0.2168, + "step": 5101 + }, + { + "epoch": 1.0325844970653715, + "grad_norm": 0.28215718269348145, + "learning_rate": 9.502423579107933e-05, + "loss": 0.19, + "step": 5102 + }, + { + "epoch": 1.0327868852459017, + "grad_norm": 0.2528868317604065, + "learning_rate": 9.499246828761887e-05, + "loss": 0.1703, + "step": 5103 + }, + { + "epoch": 1.0329892734264319, + "grad_norm": 0.3218041956424713, + "learning_rate": 9.496070129076793e-05, + "loss": 0.2025, + "step": 5104 + }, + { + "epoch": 1.033191661606962, + "grad_norm": 0.3038148880004883, + "learning_rate": 9.492893480374035e-05, + "loss": 0.1984, + "step": 5105 + }, + { + "epoch": 1.0333940497874925, + "grad_norm": 0.9580700993537903, + "learning_rate": 9.489716882974994e-05, + "loss": 0.1923, + "step": 5106 + }, + { + "epoch": 1.0335964379680227, + "grad_norm": 0.4019380807876587, + "learning_rate": 9.486540337201046e-05, + "loss": 0.1793, + "step": 5107 + }, + { + "epoch": 1.033798826148553, + "grad_norm": 0.3231680989265442, + "learning_rate": 9.483363843373556e-05, + "loss": 0.2016, + "step": 5108 + }, + { + "epoch": 1.034001214329083, + "grad_norm": 0.30073419213294983, + "learning_rate": 9.480187401813893e-05, + "loss": 0.173, + "step": 5109 + }, + { + "epoch": 1.0342036025096135, + "grad_norm": 0.31227320432662964, + "learning_rate": 9.477011012843414e-05, + "loss": 0.1934, + "step": 5110 + }, + { + "epoch": 1.0344059906901437, + "grad_norm": 0.3224979043006897, + "learning_rate": 9.473834676783473e-05, + "loss": 0.1954, + "step": 5111 + }, + { + "epoch": 1.034608378870674, + "grad_norm": 0.2588309347629547, + "learning_rate": 9.470658393955419e-05, + "loss": 0.201, + "step": 5112 + }, + { + "epoch": 1.0348107670512041, + "grad_norm": 0.44500279426574707, + "learning_rate": 9.467482164680597e-05, + "loss": 0.2162, + "step": 5113 + }, + { + "epoch": 1.0350131552317345, + "grad_norm": 0.2538471817970276, + "learning_rate": 9.464305989280337e-05, + "loss": 0.1749, + "step": 5114 + }, + { + "epoch": 1.0352155434122647, + "grad_norm": 0.41678211092948914, + "learning_rate": 9.461129868075979e-05, + "loss": 0.1989, + "step": 5115 + }, + { + "epoch": 1.035417931592795, + "grad_norm": 0.3123410940170288, + "learning_rate": 9.457953801388846e-05, + "loss": 0.2083, + "step": 5116 + }, + { + "epoch": 1.0356203197733251, + "grad_norm": 0.3037175238132477, + "learning_rate": 9.45477778954026e-05, + "loss": 0.2191, + "step": 5117 + }, + { + "epoch": 1.0358227079538556, + "grad_norm": 0.2959064245223999, + "learning_rate": 9.451601832851534e-05, + "loss": 0.2312, + "step": 5118 + }, + { + "epoch": 1.0360250961343858, + "grad_norm": 0.292764276266098, + "learning_rate": 9.448425931643982e-05, + "loss": 0.2208, + "step": 5119 + }, + { + "epoch": 1.036227484314916, + "grad_norm": 0.3166069984436035, + "learning_rate": 9.445250086238908e-05, + "loss": 0.1858, + "step": 5120 + }, + { + "epoch": 1.0364298724954462, + "grad_norm": 0.2685317099094391, + "learning_rate": 9.442074296957607e-05, + "loss": 0.2162, + "step": 5121 + }, + { + "epoch": 1.0366322606759766, + "grad_norm": 0.30619198083877563, + "learning_rate": 9.438898564121375e-05, + "loss": 0.1973, + "step": 5122 + }, + { + "epoch": 1.0368346488565068, + "grad_norm": 0.33436110615730286, + "learning_rate": 9.4357228880515e-05, + "loss": 0.2431, + "step": 5123 + }, + { + "epoch": 1.037037037037037, + "grad_norm": 0.31361180543899536, + "learning_rate": 9.432547269069261e-05, + "loss": 0.2134, + "step": 5124 + }, + { + "epoch": 1.0372394252175674, + "grad_norm": 0.3122353255748749, + "learning_rate": 9.429371707495935e-05, + "loss": 0.1999, + "step": 5125 + }, + { + "epoch": 1.0374418133980976, + "grad_norm": 0.2919948697090149, + "learning_rate": 9.426196203652793e-05, + "loss": 0.194, + "step": 5126 + }, + { + "epoch": 1.0376442015786278, + "grad_norm": 0.33420529961586, + "learning_rate": 9.423020757861097e-05, + "loss": 0.2183, + "step": 5127 + }, + { + "epoch": 1.037846589759158, + "grad_norm": 0.2902718186378479, + "learning_rate": 9.419845370442107e-05, + "loss": 0.2048, + "step": 5128 + }, + { + "epoch": 1.0380489779396884, + "grad_norm": 0.3235926032066345, + "learning_rate": 9.416670041717076e-05, + "loss": 0.1843, + "step": 5129 + }, + { + "epoch": 1.0382513661202186, + "grad_norm": 0.2709651589393616, + "learning_rate": 9.413494772007248e-05, + "loss": 0.1803, + "step": 5130 + }, + { + "epoch": 1.0384537543007488, + "grad_norm": 0.290262371301651, + "learning_rate": 9.410319561633866e-05, + "loss": 0.1983, + "step": 5131 + }, + { + "epoch": 1.038656142481279, + "grad_norm": 0.2669159173965454, + "learning_rate": 9.407144410918163e-05, + "loss": 0.2022, + "step": 5132 + }, + { + "epoch": 1.0388585306618094, + "grad_norm": 0.3024842143058777, + "learning_rate": 9.403969320181367e-05, + "loss": 0.2085, + "step": 5133 + }, + { + "epoch": 1.0390609188423396, + "grad_norm": 0.30416321754455566, + "learning_rate": 9.400794289744702e-05, + "loss": 0.2032, + "step": 5134 + }, + { + "epoch": 1.0392633070228698, + "grad_norm": 0.270829439163208, + "learning_rate": 9.397619319929385e-05, + "loss": 0.2162, + "step": 5135 + }, + { + "epoch": 1.0394656952034, + "grad_norm": 0.27344194054603577, + "learning_rate": 9.394444411056623e-05, + "loss": 0.176, + "step": 5136 + }, + { + "epoch": 1.0396680833839305, + "grad_norm": 0.33522239327430725, + "learning_rate": 9.391269563447622e-05, + "loss": 0.2199, + "step": 5137 + }, + { + "epoch": 1.0398704715644607, + "grad_norm": 0.2570640742778778, + "learning_rate": 9.388094777423578e-05, + "loss": 0.2067, + "step": 5138 + }, + { + "epoch": 1.0400728597449909, + "grad_norm": 0.26825082302093506, + "learning_rate": 9.384920053305682e-05, + "loss": 0.2067, + "step": 5139 + }, + { + "epoch": 1.040275247925521, + "grad_norm": 0.2695053815841675, + "learning_rate": 9.381745391415125e-05, + "loss": 0.2057, + "step": 5140 + }, + { + "epoch": 1.0404776361060515, + "grad_norm": 0.35252243280410767, + "learning_rate": 9.37857079207308e-05, + "loss": 0.1945, + "step": 5141 + }, + { + "epoch": 1.0406800242865817, + "grad_norm": 0.3360033333301544, + "learning_rate": 9.37539625560072e-05, + "loss": 0.2287, + "step": 5142 + }, + { + "epoch": 1.0408824124671119, + "grad_norm": 0.34644338488578796, + "learning_rate": 9.372221782319215e-05, + "loss": 0.1944, + "step": 5143 + }, + { + "epoch": 1.041084800647642, + "grad_norm": 0.2628285884857178, + "learning_rate": 9.369047372549723e-05, + "loss": 0.1871, + "step": 5144 + }, + { + "epoch": 1.0412871888281725, + "grad_norm": 0.26992303133010864, + "learning_rate": 9.365873026613397e-05, + "loss": 0.1871, + "step": 5145 + }, + { + "epoch": 1.0414895770087027, + "grad_norm": 0.2952551245689392, + "learning_rate": 9.362698744831385e-05, + "loss": 0.2167, + "step": 5146 + }, + { + "epoch": 1.041691965189233, + "grad_norm": 0.23648761212825775, + "learning_rate": 9.359524527524825e-05, + "loss": 0.1759, + "step": 5147 + }, + { + "epoch": 1.041894353369763, + "grad_norm": 0.2490091472864151, + "learning_rate": 9.356350375014854e-05, + "loss": 0.1999, + "step": 5148 + }, + { + "epoch": 1.0420967415502935, + "grad_norm": 0.25964388251304626, + "learning_rate": 9.353176287622599e-05, + "loss": 0.1793, + "step": 5149 + }, + { + "epoch": 1.0422991297308237, + "grad_norm": 0.28901317715644836, + "learning_rate": 9.350002265669179e-05, + "loss": 0.204, + "step": 5150 + }, + { + "epoch": 1.0422991297308237, + "eval_loss": 0.2759149372577667, + "eval_runtime": 0.7382, + "eval_samples_per_second": 6.774, + "eval_steps_per_second": 1.355, + "step": 5150 + }, + { + "epoch": 1.042501517911354, + "grad_norm": 0.27219051122665405, + "learning_rate": 9.346828309475709e-05, + "loss": 0.1853, + "step": 5151 + }, + { + "epoch": 1.0427039060918841, + "grad_norm": 0.31162458658218384, + "learning_rate": 9.343654419363298e-05, + "loss": 0.2339, + "step": 5152 + }, + { + "epoch": 1.0429062942724145, + "grad_norm": 0.27195677161216736, + "learning_rate": 9.340480595653047e-05, + "loss": 0.2015, + "step": 5153 + }, + { + "epoch": 1.0431086824529447, + "grad_norm": 0.26143282651901245, + "learning_rate": 9.337306838666047e-05, + "loss": 0.1737, + "step": 5154 + }, + { + "epoch": 1.043311070633475, + "grad_norm": 0.36458563804626465, + "learning_rate": 9.334133148723387e-05, + "loss": 0.2331, + "step": 5155 + }, + { + "epoch": 1.0435134588140054, + "grad_norm": 0.30646729469299316, + "learning_rate": 9.33095952614615e-05, + "loss": 0.2053, + "step": 5156 + }, + { + "epoch": 1.0437158469945356, + "grad_norm": 0.3230549991130829, + "learning_rate": 9.327785971255413e-05, + "loss": 0.2252, + "step": 5157 + }, + { + "epoch": 1.0439182351750658, + "grad_norm": 0.24788984656333923, + "learning_rate": 9.324612484372231e-05, + "loss": 0.1804, + "step": 5158 + }, + { + "epoch": 1.044120623355596, + "grad_norm": 0.2826426327228546, + "learning_rate": 9.321439065817673e-05, + "loss": 0.1773, + "step": 5159 + }, + { + "epoch": 1.0443230115361264, + "grad_norm": 0.2371816188097, + "learning_rate": 9.318265715912791e-05, + "loss": 0.22, + "step": 5160 + }, + { + "epoch": 1.0445253997166566, + "grad_norm": 0.3433065116405487, + "learning_rate": 9.315092434978626e-05, + "loss": 0.1837, + "step": 5161 + }, + { + "epoch": 1.0447277878971868, + "grad_norm": 0.29012125730514526, + "learning_rate": 9.311919223336225e-05, + "loss": 0.2242, + "step": 5162 + }, + { + "epoch": 1.044930176077717, + "grad_norm": 0.406108021736145, + "learning_rate": 9.308746081306617e-05, + "loss": 0.2046, + "step": 5163 + }, + { + "epoch": 1.0451325642582474, + "grad_norm": 0.3668637275695801, + "learning_rate": 9.305573009210827e-05, + "loss": 0.1964, + "step": 5164 + }, + { + "epoch": 1.0453349524387776, + "grad_norm": 0.3359313905239105, + "learning_rate": 9.302400007369873e-05, + "loss": 0.2094, + "step": 5165 + }, + { + "epoch": 1.0455373406193078, + "grad_norm": 0.4924279451370239, + "learning_rate": 9.299227076104769e-05, + "loss": 0.2138, + "step": 5166 + }, + { + "epoch": 1.045739728799838, + "grad_norm": 0.40203621983528137, + "learning_rate": 9.296054215736514e-05, + "loss": 0.1813, + "step": 5167 + }, + { + "epoch": 1.0459421169803684, + "grad_norm": 0.3456156551837921, + "learning_rate": 9.292881426586108e-05, + "loss": 0.2265, + "step": 5168 + }, + { + "epoch": 1.0461445051608986, + "grad_norm": 0.26270124316215515, + "learning_rate": 9.289708708974538e-05, + "loss": 0.186, + "step": 5169 + }, + { + "epoch": 1.0463468933414288, + "grad_norm": 0.28756922483444214, + "learning_rate": 9.28653606322279e-05, + "loss": 0.205, + "step": 5170 + }, + { + "epoch": 1.046549281521959, + "grad_norm": 0.3029641807079315, + "learning_rate": 9.283363489651834e-05, + "loss": 0.1955, + "step": 5171 + }, + { + "epoch": 1.0467516697024895, + "grad_norm": 0.3574798107147217, + "learning_rate": 9.280190988582643e-05, + "loss": 0.2166, + "step": 5172 + }, + { + "epoch": 1.0469540578830197, + "grad_norm": 0.3494110107421875, + "learning_rate": 9.277018560336174e-05, + "loss": 0.1967, + "step": 5173 + }, + { + "epoch": 1.0471564460635499, + "grad_norm": 0.29691949486732483, + "learning_rate": 9.27384620523338e-05, + "loss": 0.2357, + "step": 5174 + }, + { + "epoch": 1.04735883424408, + "grad_norm": 0.19874915480613708, + "learning_rate": 9.27067392359521e-05, + "loss": 0.1364, + "step": 5175 + }, + { + "epoch": 1.0475612224246105, + "grad_norm": 0.30270639061927795, + "learning_rate": 9.267501715742598e-05, + "loss": 0.1933, + "step": 5176 + }, + { + "epoch": 1.0477636106051407, + "grad_norm": 0.2565094828605652, + "learning_rate": 9.264329581996476e-05, + "loss": 0.1948, + "step": 5177 + }, + { + "epoch": 1.0479659987856709, + "grad_norm": 0.321664035320282, + "learning_rate": 9.261157522677768e-05, + "loss": 0.1872, + "step": 5178 + }, + { + "epoch": 1.048168386966201, + "grad_norm": 0.3833097517490387, + "learning_rate": 9.25798553810739e-05, + "loss": 0.1946, + "step": 5179 + }, + { + "epoch": 1.0483707751467315, + "grad_norm": 0.31259262561798096, + "learning_rate": 9.254813628606254e-05, + "loss": 0.2, + "step": 5180 + }, + { + "epoch": 1.0485731633272617, + "grad_norm": 0.31232592463493347, + "learning_rate": 9.251641794495251e-05, + "loss": 0.2189, + "step": 5181 + }, + { + "epoch": 1.048775551507792, + "grad_norm": 0.26915574073791504, + "learning_rate": 9.248470036095278e-05, + "loss": 0.1807, + "step": 5182 + }, + { + "epoch": 1.048977939688322, + "grad_norm": 0.2748313546180725, + "learning_rate": 9.24529835372722e-05, + "loss": 0.1838, + "step": 5183 + }, + { + "epoch": 1.0491803278688525, + "grad_norm": 0.28373873233795166, + "learning_rate": 9.242126747711958e-05, + "loss": 0.2122, + "step": 5184 + }, + { + "epoch": 1.0493827160493827, + "grad_norm": 0.2703567445278168, + "learning_rate": 9.238955218370359e-05, + "loss": 0.1792, + "step": 5185 + }, + { + "epoch": 1.049585104229913, + "grad_norm": 0.30673250555992126, + "learning_rate": 9.235783766023285e-05, + "loss": 0.2202, + "step": 5186 + }, + { + "epoch": 1.0497874924104433, + "grad_norm": 0.27138593792915344, + "learning_rate": 9.232612390991591e-05, + "loss": 0.2394, + "step": 5187 + }, + { + "epoch": 1.0499898805909735, + "grad_norm": 0.3554273247718811, + "learning_rate": 9.229441093596122e-05, + "loss": 0.2146, + "step": 5188 + }, + { + "epoch": 1.0501922687715037, + "grad_norm": 0.2618618309497833, + "learning_rate": 9.226269874157719e-05, + "loss": 0.1822, + "step": 5189 + }, + { + "epoch": 1.050394656952034, + "grad_norm": 0.27396267652511597, + "learning_rate": 9.223098732997208e-05, + "loss": 0.1736, + "step": 5190 + }, + { + "epoch": 1.0505970451325644, + "grad_norm": 0.23274348676204681, + "learning_rate": 9.21992767043542e-05, + "loss": 0.1799, + "step": 5191 + }, + { + "epoch": 1.0507994333130946, + "grad_norm": 0.26986899971961975, + "learning_rate": 9.216756686793164e-05, + "loss": 0.1664, + "step": 5192 + }, + { + "epoch": 1.0510018214936248, + "grad_norm": 0.2661428451538086, + "learning_rate": 9.213585782391246e-05, + "loss": 0.206, + "step": 5193 + }, + { + "epoch": 1.051204209674155, + "grad_norm": 0.25959786772727966, + "learning_rate": 9.210414957550469e-05, + "loss": 0.2015, + "step": 5194 + }, + { + "epoch": 1.0514065978546854, + "grad_norm": 0.250900536775589, + "learning_rate": 9.207244212591621e-05, + "loss": 0.1823, + "step": 5195 + }, + { + "epoch": 1.0516089860352156, + "grad_norm": 0.2858547270298004, + "learning_rate": 9.204073547835485e-05, + "loss": 0.196, + "step": 5196 + }, + { + "epoch": 1.0518113742157458, + "grad_norm": 0.28630852699279785, + "learning_rate": 9.200902963602835e-05, + "loss": 0.2221, + "step": 5197 + }, + { + "epoch": 1.052013762396276, + "grad_norm": 0.3132813274860382, + "learning_rate": 9.19773246021444e-05, + "loss": 0.1934, + "step": 5198 + }, + { + "epoch": 1.0522161505768064, + "grad_norm": 0.32697784900665283, + "learning_rate": 9.194562037991056e-05, + "loss": 0.2026, + "step": 5199 + }, + { + "epoch": 1.0524185387573366, + "grad_norm": 0.30263441801071167, + "learning_rate": 9.191391697253433e-05, + "loss": 0.2074, + "step": 5200 + }, + { + "epoch": 1.0524185387573366, + "eval_loss": 0.2719886004924774, + "eval_runtime": 0.7381, + "eval_samples_per_second": 6.774, + "eval_steps_per_second": 1.355, + "step": 5200 + }, + { + "epoch": 1.0526209269378668, + "grad_norm": 0.2719072103500366, + "learning_rate": 9.188221438322314e-05, + "loss": 0.1895, + "step": 5201 + }, + { + "epoch": 1.052823315118397, + "grad_norm": 0.2846873700618744, + "learning_rate": 9.185051261518436e-05, + "loss": 0.2252, + "step": 5202 + }, + { + "epoch": 1.0530257032989274, + "grad_norm": 0.267671674489975, + "learning_rate": 9.181881167162516e-05, + "loss": 0.1791, + "step": 5203 + }, + { + "epoch": 1.0532280914794576, + "grad_norm": 0.33255940675735474, + "learning_rate": 9.178711155575276e-05, + "loss": 0.2234, + "step": 5204 + }, + { + "epoch": 1.0534304796599878, + "grad_norm": 0.28519174456596375, + "learning_rate": 9.175541227077422e-05, + "loss": 0.1832, + "step": 5205 + }, + { + "epoch": 1.053632867840518, + "grad_norm": 0.26935869455337524, + "learning_rate": 9.172371381989657e-05, + "loss": 0.1925, + "step": 5206 + }, + { + "epoch": 1.0538352560210484, + "grad_norm": 0.28792282938957214, + "learning_rate": 9.16920162063267e-05, + "loss": 0.2105, + "step": 5207 + }, + { + "epoch": 1.0540376442015786, + "grad_norm": 0.291089802980423, + "learning_rate": 9.166031943327147e-05, + "loss": 0.2215, + "step": 5208 + }, + { + "epoch": 1.0542400323821088, + "grad_norm": 0.4079270362854004, + "learning_rate": 9.16286235039376e-05, + "loss": 0.1799, + "step": 5209 + }, + { + "epoch": 1.054442420562639, + "grad_norm": 0.39470669627189636, + "learning_rate": 9.159692842153177e-05, + "loss": 0.1637, + "step": 5210 + }, + { + "epoch": 1.0546448087431695, + "grad_norm": 0.3241277039051056, + "learning_rate": 9.156523418926055e-05, + "loss": 0.2344, + "step": 5211 + }, + { + "epoch": 1.0548471969236997, + "grad_norm": 0.26196640729904175, + "learning_rate": 9.153354081033043e-05, + "loss": 0.1965, + "step": 5212 + }, + { + "epoch": 1.0550495851042299, + "grad_norm": 0.2696246802806854, + "learning_rate": 9.150184828794782e-05, + "loss": 0.1988, + "step": 5213 + }, + { + "epoch": 1.05525197328476, + "grad_norm": 0.30082598328590393, + "learning_rate": 9.147015662531903e-05, + "loss": 0.2245, + "step": 5214 + }, + { + "epoch": 1.0554543614652905, + "grad_norm": 0.2307879775762558, + "learning_rate": 9.143846582565027e-05, + "loss": 0.162, + "step": 5215 + }, + { + "epoch": 1.0556567496458207, + "grad_norm": 0.28145650029182434, + "learning_rate": 9.140677589214772e-05, + "loss": 0.1562, + "step": 5216 + }, + { + "epoch": 1.0558591378263509, + "grad_norm": 0.341371089220047, + "learning_rate": 9.137508682801742e-05, + "loss": 0.2095, + "step": 5217 + }, + { + "epoch": 1.0560615260068813, + "grad_norm": 0.2670517861843109, + "learning_rate": 9.134339863646533e-05, + "loss": 0.1889, + "step": 5218 + }, + { + "epoch": 1.0562639141874115, + "grad_norm": 0.3274596631526947, + "learning_rate": 9.13117113206973e-05, + "loss": 0.212, + "step": 5219 + }, + { + "epoch": 1.0564663023679417, + "grad_norm": 0.2960319221019745, + "learning_rate": 9.128002488391919e-05, + "loss": 0.2007, + "step": 5220 + }, + { + "epoch": 1.056668690548472, + "grad_norm": 0.40096405148506165, + "learning_rate": 9.124833932933665e-05, + "loss": 0.1928, + "step": 5221 + }, + { + "epoch": 1.0568710787290023, + "grad_norm": 0.28344669938087463, + "learning_rate": 9.121665466015533e-05, + "loss": 0.1538, + "step": 5222 + }, + { + "epoch": 1.0570734669095325, + "grad_norm": 0.2522047758102417, + "learning_rate": 9.118497087958071e-05, + "loss": 0.1903, + "step": 5223 + }, + { + "epoch": 1.0572758550900627, + "grad_norm": 0.2611517906188965, + "learning_rate": 9.11532879908183e-05, + "loss": 0.1681, + "step": 5224 + }, + { + "epoch": 1.057478243270593, + "grad_norm": 0.2570279538631439, + "learning_rate": 9.112160599707332e-05, + "loss": 0.1745, + "step": 5225 + }, + { + "epoch": 1.0576806314511233, + "grad_norm": 0.29664525389671326, + "learning_rate": 9.10899249015511e-05, + "loss": 0.2311, + "step": 5226 + }, + { + "epoch": 1.0578830196316535, + "grad_norm": 0.2817818820476532, + "learning_rate": 9.105824470745678e-05, + "loss": 0.186, + "step": 5227 + }, + { + "epoch": 1.0580854078121837, + "grad_norm": 0.2797265350818634, + "learning_rate": 9.102656541799543e-05, + "loss": 0.1979, + "step": 5228 + }, + { + "epoch": 1.058287795992714, + "grad_norm": 0.3641483187675476, + "learning_rate": 9.099488703637205e-05, + "loss": 0.1944, + "step": 5229 + }, + { + "epoch": 1.0584901841732444, + "grad_norm": 0.3078802227973938, + "learning_rate": 9.09632095657915e-05, + "loss": 0.1997, + "step": 5230 + }, + { + "epoch": 1.0586925723537746, + "grad_norm": 0.3237381875514984, + "learning_rate": 9.093153300945858e-05, + "loss": 0.1899, + "step": 5231 + }, + { + "epoch": 1.0588949605343048, + "grad_norm": 0.2978192865848541, + "learning_rate": 9.0899857370578e-05, + "loss": 0.1889, + "step": 5232 + }, + { + "epoch": 1.059097348714835, + "grad_norm": 0.28448426723480225, + "learning_rate": 9.086818265235437e-05, + "loss": 0.2004, + "step": 5233 + }, + { + "epoch": 1.0592997368953654, + "grad_norm": 0.34776198863983154, + "learning_rate": 9.083650885799218e-05, + "loss": 0.2632, + "step": 5234 + }, + { + "epoch": 1.0595021250758956, + "grad_norm": 0.22447577118873596, + "learning_rate": 9.080483599069589e-05, + "loss": 0.1518, + "step": 5235 + }, + { + "epoch": 1.0597045132564258, + "grad_norm": 0.2891514003276825, + "learning_rate": 9.077316405366981e-05, + "loss": 0.2074, + "step": 5236 + }, + { + "epoch": 1.059906901436956, + "grad_norm": 0.26560258865356445, + "learning_rate": 9.074149305011818e-05, + "loss": 0.1851, + "step": 5237 + }, + { + "epoch": 1.0601092896174864, + "grad_norm": 0.3098355531692505, + "learning_rate": 9.070982298324513e-05, + "loss": 0.1936, + "step": 5238 + }, + { + "epoch": 1.0603116777980166, + "grad_norm": 0.2618989050388336, + "learning_rate": 9.067815385625471e-05, + "loss": 0.2084, + "step": 5239 + }, + { + "epoch": 1.0605140659785468, + "grad_norm": 0.2665063738822937, + "learning_rate": 9.064648567235087e-05, + "loss": 0.189, + "step": 5240 + }, + { + "epoch": 1.060716454159077, + "grad_norm": 0.3155873715877533, + "learning_rate": 9.061481843473746e-05, + "loss": 0.2045, + "step": 5241 + }, + { + "epoch": 1.0609188423396074, + "grad_norm": 0.2748834490776062, + "learning_rate": 9.058315214661824e-05, + "loss": 0.2126, + "step": 5242 + }, + { + "epoch": 1.0611212305201376, + "grad_norm": 0.2678951025009155, + "learning_rate": 9.055148681119688e-05, + "loss": 0.1881, + "step": 5243 + }, + { + "epoch": 1.0613236187006678, + "grad_norm": 0.2735554873943329, + "learning_rate": 9.051982243167695e-05, + "loss": 0.2194, + "step": 5244 + }, + { + "epoch": 1.061526006881198, + "grad_norm": 0.36983054876327515, + "learning_rate": 9.04881590112619e-05, + "loss": 0.1863, + "step": 5245 + }, + { + "epoch": 1.0617283950617284, + "grad_norm": 0.3115123510360718, + "learning_rate": 9.045649655315515e-05, + "loss": 0.2507, + "step": 5246 + }, + { + "epoch": 1.0619307832422586, + "grad_norm": 0.33893606066703796, + "learning_rate": 9.04248350605599e-05, + "loss": 0.2123, + "step": 5247 + }, + { + "epoch": 1.0621331714227888, + "grad_norm": 0.2772587835788727, + "learning_rate": 9.039317453667938e-05, + "loss": 0.2104, + "step": 5248 + }, + { + "epoch": 1.0623355596033193, + "grad_norm": 0.3373522460460663, + "learning_rate": 9.036151498471665e-05, + "loss": 0.2273, + "step": 5249 + }, + { + "epoch": 1.0625379477838495, + "grad_norm": 0.3072078227996826, + "learning_rate": 9.03298564078747e-05, + "loss": 0.2511, + "step": 5250 + }, + { + "epoch": 1.0625379477838495, + "eval_loss": 0.2735154926776886, + "eval_runtime": 0.7369, + "eval_samples_per_second": 6.785, + "eval_steps_per_second": 1.357, + "step": 5250 + }, + { + "epoch": 1.0627403359643797, + "grad_norm": 0.2639048397541046, + "learning_rate": 9.029819880935642e-05, + "loss": 0.173, + "step": 5251 + }, + { + "epoch": 1.0629427241449099, + "grad_norm": 0.2672668695449829, + "learning_rate": 9.026654219236458e-05, + "loss": 0.1915, + "step": 5252 + }, + { + "epoch": 1.0631451123254403, + "grad_norm": 0.28084275126457214, + "learning_rate": 9.023488656010188e-05, + "loss": 0.2238, + "step": 5253 + }, + { + "epoch": 1.0633475005059705, + "grad_norm": 0.2745071053504944, + "learning_rate": 9.020323191577087e-05, + "loss": 0.1724, + "step": 5254 + }, + { + "epoch": 1.0635498886865007, + "grad_norm": 0.26804065704345703, + "learning_rate": 9.017157826257407e-05, + "loss": 0.1646, + "step": 5255 + }, + { + "epoch": 1.0637522768670309, + "grad_norm": 0.3335314691066742, + "learning_rate": 9.013992560371385e-05, + "loss": 0.1889, + "step": 5256 + }, + { + "epoch": 1.0639546650475613, + "grad_norm": 0.2998206913471222, + "learning_rate": 9.010827394239249e-05, + "loss": 0.2184, + "step": 5257 + }, + { + "epoch": 1.0641570532280915, + "grad_norm": 0.26737990975379944, + "learning_rate": 9.00766232818122e-05, + "loss": 0.2071, + "step": 5258 + }, + { + "epoch": 1.0643594414086217, + "grad_norm": 0.3056652545928955, + "learning_rate": 9.004497362517504e-05, + "loss": 0.204, + "step": 5259 + }, + { + "epoch": 1.064561829589152, + "grad_norm": 0.26440155506134033, + "learning_rate": 9.001332497568298e-05, + "loss": 0.163, + "step": 5260 + }, + { + "epoch": 1.0647642177696823, + "grad_norm": 0.25069424510002136, + "learning_rate": 8.998167733653791e-05, + "loss": 0.1881, + "step": 5261 + }, + { + "epoch": 1.0649666059502125, + "grad_norm": 0.2964242696762085, + "learning_rate": 8.99500307109416e-05, + "loss": 0.2169, + "step": 5262 + }, + { + "epoch": 1.0651689941307427, + "grad_norm": 0.26181793212890625, + "learning_rate": 8.991838510209575e-05, + "loss": 0.1768, + "step": 5263 + }, + { + "epoch": 1.065371382311273, + "grad_norm": 0.26684558391571045, + "learning_rate": 8.988674051320189e-05, + "loss": 0.1906, + "step": 5264 + }, + { + "epoch": 1.0655737704918034, + "grad_norm": 0.23875440657138824, + "learning_rate": 8.985509694746152e-05, + "loss": 0.1528, + "step": 5265 + }, + { + "epoch": 1.0657761586723336, + "grad_norm": 0.22852183878421783, + "learning_rate": 8.982345440807598e-05, + "loss": 0.1609, + "step": 5266 + }, + { + "epoch": 1.0659785468528638, + "grad_norm": 0.28481584787368774, + "learning_rate": 8.979181289824655e-05, + "loss": 0.2069, + "step": 5267 + }, + { + "epoch": 1.066180935033394, + "grad_norm": 0.26229703426361084, + "learning_rate": 8.976017242117438e-05, + "loss": 0.2085, + "step": 5268 + }, + { + "epoch": 1.0663833232139244, + "grad_norm": 0.2593900263309479, + "learning_rate": 8.972853298006054e-05, + "loss": 0.1716, + "step": 5269 + }, + { + "epoch": 1.0665857113944546, + "grad_norm": 0.30314525961875916, + "learning_rate": 8.969689457810593e-05, + "loss": 0.2148, + "step": 5270 + }, + { + "epoch": 1.0667880995749848, + "grad_norm": 0.27090978622436523, + "learning_rate": 8.966525721851143e-05, + "loss": 0.1875, + "step": 5271 + }, + { + "epoch": 1.0669904877555152, + "grad_norm": 0.2738669514656067, + "learning_rate": 8.963362090447775e-05, + "loss": 0.1887, + "step": 5272 + }, + { + "epoch": 1.0671928759360454, + "grad_norm": 0.2838186025619507, + "learning_rate": 8.960198563920553e-05, + "loss": 0.185, + "step": 5273 + }, + { + "epoch": 1.0673952641165756, + "grad_norm": 0.24561458826065063, + "learning_rate": 8.95703514258953e-05, + "loss": 0.1436, + "step": 5274 + }, + { + "epoch": 1.0675976522971058, + "grad_norm": 0.28942060470581055, + "learning_rate": 8.95387182677475e-05, + "loss": 0.1846, + "step": 5275 + }, + { + "epoch": 1.067800040477636, + "grad_norm": 0.30993714928627014, + "learning_rate": 8.950708616796238e-05, + "loss": 0.21, + "step": 5276 + }, + { + "epoch": 1.0680024286581664, + "grad_norm": 0.3060511350631714, + "learning_rate": 8.947545512974019e-05, + "loss": 0.1867, + "step": 5277 + }, + { + "epoch": 1.0682048168386966, + "grad_norm": 0.23082970082759857, + "learning_rate": 8.944382515628104e-05, + "loss": 0.1672, + "step": 5278 + }, + { + "epoch": 1.0684072050192268, + "grad_norm": 0.3195563554763794, + "learning_rate": 8.941219625078487e-05, + "loss": 0.237, + "step": 5279 + }, + { + "epoch": 1.0686095931997572, + "grad_norm": 0.2811078131198883, + "learning_rate": 8.93805684164516e-05, + "loss": 0.1512, + "step": 5280 + }, + { + "epoch": 1.0688119813802874, + "grad_norm": 0.37472841143608093, + "learning_rate": 8.9348941656481e-05, + "loss": 0.2245, + "step": 5281 + }, + { + "epoch": 1.0690143695608176, + "grad_norm": 0.24966472387313843, + "learning_rate": 8.931731597407268e-05, + "loss": 0.1916, + "step": 5282 + }, + { + "epoch": 1.0692167577413478, + "grad_norm": 0.30108532309532166, + "learning_rate": 8.928569137242628e-05, + "loss": 0.1838, + "step": 5283 + }, + { + "epoch": 1.0694191459218783, + "grad_norm": 0.30620646476745605, + "learning_rate": 8.925406785474119e-05, + "loss": 0.1921, + "step": 5284 + }, + { + "epoch": 1.0696215341024085, + "grad_norm": 0.2843058407306671, + "learning_rate": 8.922244542421676e-05, + "loss": 0.2085, + "step": 5285 + }, + { + "epoch": 1.0698239222829387, + "grad_norm": 0.27315425872802734, + "learning_rate": 8.919082408405221e-05, + "loss": 0.1745, + "step": 5286 + }, + { + "epoch": 1.0700263104634689, + "grad_norm": 0.2726389169692993, + "learning_rate": 8.915920383744667e-05, + "loss": 0.2096, + "step": 5287 + }, + { + "epoch": 1.0702286986439993, + "grad_norm": 0.276395320892334, + "learning_rate": 8.912758468759912e-05, + "loss": 0.202, + "step": 5288 + }, + { + "epoch": 1.0704310868245295, + "grad_norm": 0.30593031644821167, + "learning_rate": 8.909596663770849e-05, + "loss": 0.2287, + "step": 5289 + }, + { + "epoch": 1.0706334750050597, + "grad_norm": 0.4061230719089508, + "learning_rate": 8.906434969097351e-05, + "loss": 0.1944, + "step": 5290 + }, + { + "epoch": 1.0708358631855899, + "grad_norm": 0.3001694679260254, + "learning_rate": 8.903273385059293e-05, + "loss": 0.2023, + "step": 5291 + }, + { + "epoch": 1.0710382513661203, + "grad_norm": 0.24743273854255676, + "learning_rate": 8.900111911976524e-05, + "loss": 0.1624, + "step": 5292 + }, + { + "epoch": 1.0712406395466505, + "grad_norm": 0.28860440850257874, + "learning_rate": 8.896950550168888e-05, + "loss": 0.2338, + "step": 5293 + }, + { + "epoch": 1.0714430277271807, + "grad_norm": 0.30887338519096375, + "learning_rate": 8.893789299956223e-05, + "loss": 0.2008, + "step": 5294 + }, + { + "epoch": 1.071645415907711, + "grad_norm": 0.3015650808811188, + "learning_rate": 8.890628161658349e-05, + "loss": 0.2064, + "step": 5295 + }, + { + "epoch": 1.0718478040882413, + "grad_norm": 0.24830302596092224, + "learning_rate": 8.887467135595078e-05, + "loss": 0.1907, + "step": 5296 + }, + { + "epoch": 1.0720501922687715, + "grad_norm": 0.27147096395492554, + "learning_rate": 8.884306222086208e-05, + "loss": 0.1608, + "step": 5297 + }, + { + "epoch": 1.0722525804493017, + "grad_norm": 0.4186892807483673, + "learning_rate": 8.881145421451527e-05, + "loss": 0.2069, + "step": 5298 + }, + { + "epoch": 1.072454968629832, + "grad_norm": 0.28978657722473145, + "learning_rate": 8.877984734010812e-05, + "loss": 0.1996, + "step": 5299 + }, + { + "epoch": 1.0726573568103623, + "grad_norm": 0.25510936975479126, + "learning_rate": 8.874824160083829e-05, + "loss": 0.1706, + "step": 5300 + }, + { + "epoch": 1.0726573568103623, + "eval_loss": 0.27265626192092896, + "eval_runtime": 0.7384, + "eval_samples_per_second": 6.772, + "eval_steps_per_second": 1.354, + "step": 5300 + }, + { + "epoch": 1.0728597449908925, + "grad_norm": 0.2712719440460205, + "learning_rate": 8.871663699990331e-05, + "loss": 0.1873, + "step": 5301 + }, + { + "epoch": 1.0730621331714227, + "grad_norm": 0.27994102239608765, + "learning_rate": 8.86850335405006e-05, + "loss": 0.2115, + "step": 5302 + }, + { + "epoch": 1.0732645213519532, + "grad_norm": 0.294344037771225, + "learning_rate": 8.865343122582749e-05, + "loss": 0.2024, + "step": 5303 + }, + { + "epoch": 1.0734669095324834, + "grad_norm": 0.30034956336021423, + "learning_rate": 8.862183005908114e-05, + "loss": 0.1942, + "step": 5304 + }, + { + "epoch": 1.0736692977130136, + "grad_norm": 0.28707626461982727, + "learning_rate": 8.859023004345862e-05, + "loss": 0.2016, + "step": 5305 + }, + { + "epoch": 1.0738716858935438, + "grad_norm": 0.2606486976146698, + "learning_rate": 8.855863118215692e-05, + "loss": 0.2008, + "step": 5306 + }, + { + "epoch": 1.074074074074074, + "grad_norm": 0.2927470803260803, + "learning_rate": 8.852703347837285e-05, + "loss": 0.1894, + "step": 5307 + }, + { + "epoch": 1.0742764622546044, + "grad_norm": 0.2737131714820862, + "learning_rate": 8.849543693530315e-05, + "loss": 0.2034, + "step": 5308 + }, + { + "epoch": 1.0744788504351346, + "grad_norm": 0.25730380415916443, + "learning_rate": 8.846384155614443e-05, + "loss": 0.1926, + "step": 5309 + }, + { + "epoch": 1.0746812386156648, + "grad_norm": 0.3297762870788574, + "learning_rate": 8.843224734409317e-05, + "loss": 0.1842, + "step": 5310 + }, + { + "epoch": 1.0748836267961952, + "grad_norm": 0.2969495356082916, + "learning_rate": 8.840065430234576e-05, + "loss": 0.2472, + "step": 5311 + }, + { + "epoch": 1.0750860149767254, + "grad_norm": 0.25306811928749084, + "learning_rate": 8.836906243409843e-05, + "loss": 0.1767, + "step": 5312 + }, + { + "epoch": 1.0752884031572556, + "grad_norm": 0.25566038489341736, + "learning_rate": 8.833747174254736e-05, + "loss": 0.1653, + "step": 5313 + }, + { + "epoch": 1.0754907913377858, + "grad_norm": 0.27874088287353516, + "learning_rate": 8.830588223088846e-05, + "loss": 0.2032, + "step": 5314 + }, + { + "epoch": 1.0756931795183162, + "grad_norm": 0.251594215631485, + "learning_rate": 8.82742939023177e-05, + "loss": 0.1993, + "step": 5315 + }, + { + "epoch": 1.0758955676988464, + "grad_norm": 0.3001585900783539, + "learning_rate": 8.824270676003087e-05, + "loss": 0.196, + "step": 5316 + }, + { + "epoch": 1.0760979558793766, + "grad_norm": 0.23913423717021942, + "learning_rate": 8.821112080722359e-05, + "loss": 0.1715, + "step": 5317 + }, + { + "epoch": 1.0763003440599068, + "grad_norm": 0.3131200075149536, + "learning_rate": 8.817953604709141e-05, + "loss": 0.1991, + "step": 5318 + }, + { + "epoch": 1.0765027322404372, + "grad_norm": 0.30624499917030334, + "learning_rate": 8.814795248282974e-05, + "loss": 0.2247, + "step": 5319 + }, + { + "epoch": 1.0767051204209674, + "grad_norm": 0.30333131551742554, + "learning_rate": 8.811637011763388e-05, + "loss": 0.2114, + "step": 5320 + }, + { + "epoch": 1.0769075086014976, + "grad_norm": 0.25488388538360596, + "learning_rate": 8.8084788954699e-05, + "loss": 0.2014, + "step": 5321 + }, + { + "epoch": 1.0771098967820278, + "grad_norm": 0.2921813428401947, + "learning_rate": 8.805320899722014e-05, + "loss": 0.2073, + "step": 5322 + }, + { + "epoch": 1.0773122849625583, + "grad_norm": 0.3513261675834656, + "learning_rate": 8.802163024839224e-05, + "loss": 0.1927, + "step": 5323 + }, + { + "epoch": 1.0775146731430885, + "grad_norm": 0.29600194096565247, + "learning_rate": 8.799005271141011e-05, + "loss": 0.1911, + "step": 5324 + }, + { + "epoch": 1.0777170613236187, + "grad_norm": 0.33089059591293335, + "learning_rate": 8.795847638946841e-05, + "loss": 0.1839, + "step": 5325 + }, + { + "epoch": 1.0779194495041489, + "grad_norm": 0.35707876086235046, + "learning_rate": 8.792690128576175e-05, + "loss": 0.2012, + "step": 5326 + }, + { + "epoch": 1.0781218376846793, + "grad_norm": 0.2894890308380127, + "learning_rate": 8.78953274034845e-05, + "loss": 0.2285, + "step": 5327 + }, + { + "epoch": 1.0783242258652095, + "grad_norm": 0.2733427882194519, + "learning_rate": 8.786375474583104e-05, + "loss": 0.1767, + "step": 5328 + }, + { + "epoch": 1.0785266140457397, + "grad_norm": 0.29151982069015503, + "learning_rate": 8.783218331599552e-05, + "loss": 0.2081, + "step": 5329 + }, + { + "epoch": 1.0787290022262699, + "grad_norm": 0.2583785653114319, + "learning_rate": 8.7800613117172e-05, + "loss": 0.2054, + "step": 5330 + }, + { + "epoch": 1.0789313904068003, + "grad_norm": 0.24573227763175964, + "learning_rate": 8.776904415255448e-05, + "loss": 0.153, + "step": 5331 + }, + { + "epoch": 1.0791337785873305, + "grad_norm": 0.30387353897094727, + "learning_rate": 8.77374764253367e-05, + "loss": 0.1989, + "step": 5332 + }, + { + "epoch": 1.0793361667678607, + "grad_norm": 0.27814245223999023, + "learning_rate": 8.770590993871238e-05, + "loss": 0.1832, + "step": 5333 + }, + { + "epoch": 1.0795385549483911, + "grad_norm": 0.2869229018688202, + "learning_rate": 8.767434469587512e-05, + "loss": 0.1993, + "step": 5334 + }, + { + "epoch": 1.0797409431289213, + "grad_norm": 0.28225669264793396, + "learning_rate": 8.764278070001836e-05, + "loss": 0.2109, + "step": 5335 + }, + { + "epoch": 1.0799433313094515, + "grad_norm": 0.2703625559806824, + "learning_rate": 8.761121795433534e-05, + "loss": 0.1905, + "step": 5336 + }, + { + "epoch": 1.0801457194899817, + "grad_norm": 0.2503441870212555, + "learning_rate": 8.75796564620193e-05, + "loss": 0.1854, + "step": 5337 + }, + { + "epoch": 1.080348107670512, + "grad_norm": 0.29169875383377075, + "learning_rate": 8.754809622626328e-05, + "loss": 0.1762, + "step": 5338 + }, + { + "epoch": 1.0805504958510423, + "grad_norm": 0.37799257040023804, + "learning_rate": 8.751653725026025e-05, + "loss": 0.216, + "step": 5339 + }, + { + "epoch": 1.0807528840315725, + "grad_norm": 0.2747899293899536, + "learning_rate": 8.748497953720298e-05, + "loss": 0.1783, + "step": 5340 + }, + { + "epoch": 1.0809552722121027, + "grad_norm": 0.3033110201358795, + "learning_rate": 8.745342309028417e-05, + "loss": 0.2141, + "step": 5341 + }, + { + "epoch": 1.0811576603926332, + "grad_norm": 0.29245656728744507, + "learning_rate": 8.742186791269636e-05, + "loss": 0.2019, + "step": 5342 + }, + { + "epoch": 1.0813600485731634, + "grad_norm": 0.25983068346977234, + "learning_rate": 8.739031400763194e-05, + "loss": 0.1859, + "step": 5343 + }, + { + "epoch": 1.0815624367536936, + "grad_norm": 0.2650619149208069, + "learning_rate": 8.735876137828327e-05, + "loss": 0.2064, + "step": 5344 + }, + { + "epoch": 1.0817648249342238, + "grad_norm": 0.2759478688240051, + "learning_rate": 8.732721002784247e-05, + "loss": 0.2229, + "step": 5345 + }, + { + "epoch": 1.0819672131147542, + "grad_norm": 0.30478212237358093, + "learning_rate": 8.72956599595016e-05, + "loss": 0.2038, + "step": 5346 + }, + { + "epoch": 1.0821696012952844, + "grad_norm": 0.349378764629364, + "learning_rate": 8.726411117645255e-05, + "loss": 0.2141, + "step": 5347 + }, + { + "epoch": 1.0823719894758146, + "grad_norm": 0.3019539713859558, + "learning_rate": 8.723256368188708e-05, + "loss": 0.222, + "step": 5348 + }, + { + "epoch": 1.0825743776563448, + "grad_norm": 0.2606692612171173, + "learning_rate": 8.720101747899685e-05, + "loss": 0.172, + "step": 5349 + }, + { + "epoch": 1.0827767658368752, + "grad_norm": 0.29608097672462463, + "learning_rate": 8.716947257097339e-05, + "loss": 0.1961, + "step": 5350 + }, + { + "epoch": 1.0827767658368752, + "eval_loss": 0.26948341727256775, + "eval_runtime": 0.741, + "eval_samples_per_second": 6.748, + "eval_steps_per_second": 1.35, + "step": 5350 + }, + { + "epoch": 1.0829791540174054, + "grad_norm": 0.2929864823818207, + "learning_rate": 8.713792896100806e-05, + "loss": 0.2005, + "step": 5351 + }, + { + "epoch": 1.0831815421979356, + "grad_norm": 0.3245900869369507, + "learning_rate": 8.710638665229211e-05, + "loss": 0.2013, + "step": 5352 + }, + { + "epoch": 1.0833839303784658, + "grad_norm": 0.2754800021648407, + "learning_rate": 8.707484564801667e-05, + "loss": 0.1602, + "step": 5353 + }, + { + "epoch": 1.0835863185589962, + "grad_norm": 0.3620850741863251, + "learning_rate": 8.704330595137273e-05, + "loss": 0.2033, + "step": 5354 + }, + { + "epoch": 1.0837887067395264, + "grad_norm": 0.31015318632125854, + "learning_rate": 8.701176756555114e-05, + "loss": 0.2109, + "step": 5355 + }, + { + "epoch": 1.0839910949200566, + "grad_norm": 0.2776755094528198, + "learning_rate": 8.698023049374262e-05, + "loss": 0.2029, + "step": 5356 + }, + { + "epoch": 1.0841934831005868, + "grad_norm": 0.2785882353782654, + "learning_rate": 8.694869473913775e-05, + "loss": 0.1942, + "step": 5357 + }, + { + "epoch": 1.0843958712811173, + "grad_norm": 0.23001353442668915, + "learning_rate": 8.691716030492707e-05, + "loss": 0.1692, + "step": 5358 + }, + { + "epoch": 1.0845982594616475, + "grad_norm": 0.23214296996593475, + "learning_rate": 8.688562719430077e-05, + "loss": 0.1735, + "step": 5359 + }, + { + "epoch": 1.0848006476421777, + "grad_norm": 0.3181362450122833, + "learning_rate": 8.685409541044912e-05, + "loss": 0.1579, + "step": 5360 + }, + { + "epoch": 1.0850030358227079, + "grad_norm": 0.25155559182167053, + "learning_rate": 8.682256495656215e-05, + "loss": 0.1945, + "step": 5361 + }, + { + "epoch": 1.0852054240032383, + "grad_norm": 0.2986045181751251, + "learning_rate": 8.679103583582979e-05, + "loss": 0.1929, + "step": 5362 + }, + { + "epoch": 1.0854078121837685, + "grad_norm": 0.2835460603237152, + "learning_rate": 8.675950805144183e-05, + "loss": 0.1953, + "step": 5363 + }, + { + "epoch": 1.0856102003642987, + "grad_norm": 0.30347853899002075, + "learning_rate": 8.672798160658791e-05, + "loss": 0.221, + "step": 5364 + }, + { + "epoch": 1.085812588544829, + "grad_norm": 0.31518036127090454, + "learning_rate": 8.669645650445755e-05, + "loss": 0.2007, + "step": 5365 + }, + { + "epoch": 1.0860149767253593, + "grad_norm": 0.2516017556190491, + "learning_rate": 8.666493274824012e-05, + "loss": 0.1734, + "step": 5366 + }, + { + "epoch": 1.0862173649058895, + "grad_norm": 0.255487322807312, + "learning_rate": 8.663341034112487e-05, + "loss": 0.1795, + "step": 5367 + }, + { + "epoch": 1.0864197530864197, + "grad_norm": 0.2932477593421936, + "learning_rate": 8.660188928630092e-05, + "loss": 0.2065, + "step": 5368 + }, + { + "epoch": 1.08662214126695, + "grad_norm": 0.3519365191459656, + "learning_rate": 8.657036958695721e-05, + "loss": 0.1865, + "step": 5369 + }, + { + "epoch": 1.0868245294474803, + "grad_norm": 0.2964744567871094, + "learning_rate": 8.65388512462826e-05, + "loss": 0.2013, + "step": 5370 + }, + { + "epoch": 1.0870269176280105, + "grad_norm": 0.3263915181159973, + "learning_rate": 8.650733426746579e-05, + "loss": 0.2065, + "step": 5371 + }, + { + "epoch": 1.0872293058085407, + "grad_norm": 0.26859644055366516, + "learning_rate": 8.647581865369529e-05, + "loss": 0.1972, + "step": 5372 + }, + { + "epoch": 1.0874316939890711, + "grad_norm": 0.31503501534461975, + "learning_rate": 8.644430440815956e-05, + "loss": 0.2107, + "step": 5373 + }, + { + "epoch": 1.0876340821696013, + "grad_norm": 0.2556743025779724, + "learning_rate": 8.641279153404688e-05, + "loss": 0.1937, + "step": 5374 + }, + { + "epoch": 1.0878364703501315, + "grad_norm": 0.2752053439617157, + "learning_rate": 8.638128003454538e-05, + "loss": 0.1779, + "step": 5375 + }, + { + "epoch": 1.0880388585306617, + "grad_norm": 0.25501495599746704, + "learning_rate": 8.634976991284307e-05, + "loss": 0.1801, + "step": 5376 + }, + { + "epoch": 1.0882412467111922, + "grad_norm": 0.31966909766197205, + "learning_rate": 8.631826117212781e-05, + "loss": 0.2226, + "step": 5377 + }, + { + "epoch": 1.0884436348917224, + "grad_norm": 0.3165377974510193, + "learning_rate": 8.628675381558732e-05, + "loss": 0.2003, + "step": 5378 + }, + { + "epoch": 1.0886460230722526, + "grad_norm": 0.27411338686943054, + "learning_rate": 8.62552478464092e-05, + "loss": 0.207, + "step": 5379 + }, + { + "epoch": 1.0888484112527828, + "grad_norm": 0.30073752999305725, + "learning_rate": 8.62237432677809e-05, + "loss": 0.2094, + "step": 5380 + }, + { + "epoch": 1.0890507994333132, + "grad_norm": 0.2878848612308502, + "learning_rate": 8.619224008288969e-05, + "loss": 0.1836, + "step": 5381 + }, + { + "epoch": 1.0892531876138434, + "grad_norm": 0.3357694447040558, + "learning_rate": 8.616073829492273e-05, + "loss": 0.226, + "step": 5382 + }, + { + "epoch": 1.0894555757943736, + "grad_norm": 0.28499335050582886, + "learning_rate": 8.612923790706707e-05, + "loss": 0.2015, + "step": 5383 + }, + { + "epoch": 1.0896579639749038, + "grad_norm": 0.3176371455192566, + "learning_rate": 8.609773892250955e-05, + "loss": 0.1925, + "step": 5384 + }, + { + "epoch": 1.0898603521554342, + "grad_norm": 0.3828350901603699, + "learning_rate": 8.606624134443695e-05, + "loss": 0.2082, + "step": 5385 + }, + { + "epoch": 1.0900627403359644, + "grad_norm": 0.340249240398407, + "learning_rate": 8.603474517603584e-05, + "loss": 0.2003, + "step": 5386 + }, + { + "epoch": 1.0902651285164946, + "grad_norm": 0.3102370798587799, + "learning_rate": 8.600325042049266e-05, + "loss": 0.2237, + "step": 5387 + }, + { + "epoch": 1.0904675166970248, + "grad_norm": 0.25209981203079224, + "learning_rate": 8.597175708099377e-05, + "loss": 0.1913, + "step": 5388 + }, + { + "epoch": 1.0906699048775552, + "grad_norm": 0.30471372604370117, + "learning_rate": 8.594026516072528e-05, + "loss": 0.2305, + "step": 5389 + }, + { + "epoch": 1.0908722930580854, + "grad_norm": 0.35013729333877563, + "learning_rate": 8.590877466287323e-05, + "loss": 0.228, + "step": 5390 + }, + { + "epoch": 1.0910746812386156, + "grad_norm": 0.277339369058609, + "learning_rate": 8.587728559062352e-05, + "loss": 0.2104, + "step": 5391 + }, + { + "epoch": 1.0912770694191458, + "grad_norm": 0.2655515968799591, + "learning_rate": 8.584579794716184e-05, + "loss": 0.1828, + "step": 5392 + }, + { + "epoch": 1.0914794575996762, + "grad_norm": 0.2481672465801239, + "learning_rate": 8.581431173567383e-05, + "loss": 0.1907, + "step": 5393 + }, + { + "epoch": 1.0916818457802064, + "grad_norm": 0.42780938744544983, + "learning_rate": 8.57828269593449e-05, + "loss": 0.2154, + "step": 5394 + }, + { + "epoch": 1.0918842339607366, + "grad_norm": 0.2682367265224457, + "learning_rate": 8.575134362136034e-05, + "loss": 0.2119, + "step": 5395 + }, + { + "epoch": 1.092086622141267, + "grad_norm": 0.2849738299846649, + "learning_rate": 8.571986172490536e-05, + "loss": 0.2048, + "step": 5396 + }, + { + "epoch": 1.0922890103217973, + "grad_norm": 0.29217204451560974, + "learning_rate": 8.56883812731649e-05, + "loss": 0.2057, + "step": 5397 + }, + { + "epoch": 1.0924913985023275, + "grad_norm": 0.3060377538204193, + "learning_rate": 8.565690226932388e-05, + "loss": 0.2257, + "step": 5398 + }, + { + "epoch": 1.0926937866828577, + "grad_norm": 0.317339152097702, + "learning_rate": 8.562542471656698e-05, + "loss": 0.1812, + "step": 5399 + }, + { + "epoch": 1.092896174863388, + "grad_norm": 0.4128323197364807, + "learning_rate": 8.559394861807876e-05, + "loss": 0.2057, + "step": 5400 + }, + { + "epoch": 1.092896174863388, + "eval_loss": 0.26553845405578613, + "eval_runtime": 0.7388, + "eval_samples_per_second": 6.767, + "eval_steps_per_second": 1.353, + "step": 5400 + }, + { + "epoch": 1.0930985630439183, + "grad_norm": 0.29070258140563965, + "learning_rate": 8.556247397704364e-05, + "loss": 0.1732, + "step": 5401 + }, + { + "epoch": 1.0933009512244485, + "grad_norm": 0.2655215859413147, + "learning_rate": 8.553100079664598e-05, + "loss": 0.196, + "step": 5402 + }, + { + "epoch": 1.0935033394049787, + "grad_norm": 0.2547283172607422, + "learning_rate": 8.549952908006981e-05, + "loss": 0.1807, + "step": 5403 + }, + { + "epoch": 1.093705727585509, + "grad_norm": 0.26383882761001587, + "learning_rate": 8.546805883049912e-05, + "loss": 0.1459, + "step": 5404 + }, + { + "epoch": 1.0939081157660393, + "grad_norm": 0.2940872311592102, + "learning_rate": 8.543659005111776e-05, + "loss": 0.1863, + "step": 5405 + }, + { + "epoch": 1.0941105039465695, + "grad_norm": 0.33460089564323425, + "learning_rate": 8.54051227451094e-05, + "loss": 0.2038, + "step": 5406 + }, + { + "epoch": 1.0943128921270997, + "grad_norm": 0.3572755455970764, + "learning_rate": 8.53736569156576e-05, + "loss": 0.2042, + "step": 5407 + }, + { + "epoch": 1.0945152803076301, + "grad_norm": 0.3915098011493683, + "learning_rate": 8.534219256594569e-05, + "loss": 0.1748, + "step": 5408 + }, + { + "epoch": 1.0947176684881603, + "grad_norm": 0.31204935908317566, + "learning_rate": 8.531072969915696e-05, + "loss": 0.232, + "step": 5409 + }, + { + "epoch": 1.0949200566686905, + "grad_norm": 0.2606232464313507, + "learning_rate": 8.527926831847445e-05, + "loss": 0.1788, + "step": 5410 + }, + { + "epoch": 1.0951224448492207, + "grad_norm": 0.27161234617233276, + "learning_rate": 8.524780842708112e-05, + "loss": 0.1911, + "step": 5411 + }, + { + "epoch": 1.0953248330297511, + "grad_norm": 0.382242351770401, + "learning_rate": 8.521635002815973e-05, + "loss": 0.2008, + "step": 5412 + }, + { + "epoch": 1.0955272212102813, + "grad_norm": 0.31286850571632385, + "learning_rate": 8.518489312489293e-05, + "loss": 0.2099, + "step": 5413 + }, + { + "epoch": 1.0957296093908115, + "grad_norm": 0.28045061230659485, + "learning_rate": 8.515343772046318e-05, + "loss": 0.2031, + "step": 5414 + }, + { + "epoch": 1.0959319975713417, + "grad_norm": 0.32816460728645325, + "learning_rate": 8.512198381805282e-05, + "loss": 0.2097, + "step": 5415 + }, + { + "epoch": 1.0961343857518722, + "grad_norm": 0.27461788058280945, + "learning_rate": 8.509053142084402e-05, + "loss": 0.1847, + "step": 5416 + }, + { + "epoch": 1.0963367739324024, + "grad_norm": 0.2543027997016907, + "learning_rate": 8.505908053201882e-05, + "loss": 0.2017, + "step": 5417 + }, + { + "epoch": 1.0965391621129326, + "grad_norm": 0.29116228222846985, + "learning_rate": 8.502763115475908e-05, + "loss": 0.1862, + "step": 5418 + }, + { + "epoch": 1.0967415502934628, + "grad_norm": 0.5247349143028259, + "learning_rate": 8.499618329224652e-05, + "loss": 0.2204, + "step": 5419 + }, + { + "epoch": 1.0969439384739932, + "grad_norm": 0.27782142162323, + "learning_rate": 8.496473694766268e-05, + "loss": 0.1908, + "step": 5420 + }, + { + "epoch": 1.0971463266545234, + "grad_norm": 0.27946728467941284, + "learning_rate": 8.4933292124189e-05, + "loss": 0.2034, + "step": 5421 + }, + { + "epoch": 1.0973487148350536, + "grad_norm": 0.26838362216949463, + "learning_rate": 8.490184882500674e-05, + "loss": 0.2023, + "step": 5422 + }, + { + "epoch": 1.0975511030155838, + "grad_norm": 0.2386351376771927, + "learning_rate": 8.487040705329699e-05, + "loss": 0.182, + "step": 5423 + }, + { + "epoch": 1.0977534911961142, + "grad_norm": 0.31640326976776123, + "learning_rate": 8.483896681224072e-05, + "loss": 0.1941, + "step": 5424 + }, + { + "epoch": 1.0979558793766444, + "grad_norm": 0.3387574553489685, + "learning_rate": 8.480752810501868e-05, + "loss": 0.221, + "step": 5425 + }, + { + "epoch": 1.0981582675571746, + "grad_norm": 0.25791817903518677, + "learning_rate": 8.477609093481154e-05, + "loss": 0.1801, + "step": 5426 + }, + { + "epoch": 1.098360655737705, + "grad_norm": 0.3015289902687073, + "learning_rate": 8.474465530479976e-05, + "loss": 0.2132, + "step": 5427 + }, + { + "epoch": 1.0985630439182352, + "grad_norm": 0.3095368444919586, + "learning_rate": 8.47132212181637e-05, + "loss": 0.1777, + "step": 5428 + }, + { + "epoch": 1.0987654320987654, + "grad_norm": 0.2534477412700653, + "learning_rate": 8.468178867808352e-05, + "loss": 0.2022, + "step": 5429 + }, + { + "epoch": 1.0989678202792956, + "grad_norm": 0.3415544927120209, + "learning_rate": 8.465035768773921e-05, + "loss": 0.1755, + "step": 5430 + }, + { + "epoch": 1.099170208459826, + "grad_norm": 0.2486085444688797, + "learning_rate": 8.461892825031066e-05, + "loss": 0.2079, + "step": 5431 + }, + { + "epoch": 1.0993725966403562, + "grad_norm": 0.3206421136856079, + "learning_rate": 8.458750036897756e-05, + "loss": 0.2082, + "step": 5432 + }, + { + "epoch": 1.0995749848208864, + "grad_norm": 0.2875009775161743, + "learning_rate": 8.455607404691944e-05, + "loss": 0.1765, + "step": 5433 + }, + { + "epoch": 1.0997773730014166, + "grad_norm": 0.3375379741191864, + "learning_rate": 8.45246492873157e-05, + "loss": 0.2048, + "step": 5434 + }, + { + "epoch": 1.099979761181947, + "grad_norm": 0.3194500207901001, + "learning_rate": 8.449322609334558e-05, + "loss": 0.1952, + "step": 5435 + }, + { + "epoch": 1.1001821493624773, + "grad_norm": 0.308965802192688, + "learning_rate": 8.446180446818813e-05, + "loss": 0.243, + "step": 5436 + }, + { + "epoch": 1.1003845375430075, + "grad_norm": 0.3235142230987549, + "learning_rate": 8.443038441502225e-05, + "loss": 0.2185, + "step": 5437 + }, + { + "epoch": 1.1005869257235377, + "grad_norm": 0.23256078362464905, + "learning_rate": 8.43989659370267e-05, + "loss": 0.1578, + "step": 5438 + }, + { + "epoch": 1.100789313904068, + "grad_norm": 0.5195769667625427, + "learning_rate": 8.43675490373801e-05, + "loss": 0.2077, + "step": 5439 + }, + { + "epoch": 1.1009917020845983, + "grad_norm": 0.42865368723869324, + "learning_rate": 8.433613371926083e-05, + "loss": 0.21, + "step": 5440 + }, + { + "epoch": 1.1011940902651285, + "grad_norm": 0.30132371187210083, + "learning_rate": 8.430471998584721e-05, + "loss": 0.2243, + "step": 5441 + }, + { + "epoch": 1.1013964784456587, + "grad_norm": 0.26939135789871216, + "learning_rate": 8.427330784031732e-05, + "loss": 0.1697, + "step": 5442 + }, + { + "epoch": 1.101598866626189, + "grad_norm": 0.320512980222702, + "learning_rate": 8.424189728584915e-05, + "loss": 0.2202, + "step": 5443 + }, + { + "epoch": 1.1018012548067193, + "grad_norm": 0.2802683711051941, + "learning_rate": 8.421048832562044e-05, + "loss": 0.196, + "step": 5444 + }, + { + "epoch": 1.1020036429872495, + "grad_norm": 0.33519646525382996, + "learning_rate": 8.417908096280885e-05, + "loss": 0.2054, + "step": 5445 + }, + { + "epoch": 1.1022060311677797, + "grad_norm": 0.3033265769481659, + "learning_rate": 8.414767520059184e-05, + "loss": 0.2043, + "step": 5446 + }, + { + "epoch": 1.1024084193483101, + "grad_norm": 0.24827246367931366, + "learning_rate": 8.411627104214674e-05, + "loss": 0.2004, + "step": 5447 + }, + { + "epoch": 1.1026108075288403, + "grad_norm": 0.34880825877189636, + "learning_rate": 8.408486849065066e-05, + "loss": 0.2216, + "step": 5448 + }, + { + "epoch": 1.1028131957093705, + "grad_norm": 0.3037523031234741, + "learning_rate": 8.405346754928057e-05, + "loss": 0.2109, + "step": 5449 + }, + { + "epoch": 1.1030155838899007, + "grad_norm": 0.25012820959091187, + "learning_rate": 8.402206822121332e-05, + "loss": 0.1732, + "step": 5450 + }, + { + "epoch": 1.1030155838899007, + "eval_loss": 0.26805707812309265, + "eval_runtime": 0.7385, + "eval_samples_per_second": 6.77, + "eval_steps_per_second": 1.354, + "step": 5450 + }, + { + "epoch": 1.1032179720704312, + "grad_norm": 0.2542113959789276, + "learning_rate": 8.399067050962555e-05, + "loss": 0.2117, + "step": 5451 + }, + { + "epoch": 1.1034203602509614, + "grad_norm": 0.32532787322998047, + "learning_rate": 8.395927441769376e-05, + "loss": 0.1961, + "step": 5452 + }, + { + "epoch": 1.1036227484314916, + "grad_norm": 0.2972103953361511, + "learning_rate": 8.392787994859427e-05, + "loss": 0.2223, + "step": 5453 + }, + { + "epoch": 1.1038251366120218, + "grad_norm": 0.312966912984848, + "learning_rate": 8.389648710550324e-05, + "loss": 0.2092, + "step": 5454 + }, + { + "epoch": 1.1040275247925522, + "grad_norm": 0.2706752121448517, + "learning_rate": 8.386509589159666e-05, + "loss": 0.2111, + "step": 5455 + }, + { + "epoch": 1.1042299129730824, + "grad_norm": 0.3043646216392517, + "learning_rate": 8.38337063100504e-05, + "loss": 0.1995, + "step": 5456 + }, + { + "epoch": 1.1044323011536126, + "grad_norm": 0.24441584944725037, + "learning_rate": 8.38023183640401e-05, + "loss": 0.1391, + "step": 5457 + }, + { + "epoch": 1.104634689334143, + "grad_norm": 0.243491068482399, + "learning_rate": 8.377093205674124e-05, + "loss": 0.1765, + "step": 5458 + }, + { + "epoch": 1.1048370775146732, + "grad_norm": 0.2956346273422241, + "learning_rate": 8.373954739132922e-05, + "loss": 0.2106, + "step": 5459 + }, + { + "epoch": 1.1050394656952034, + "grad_norm": 0.2790488302707672, + "learning_rate": 8.370816437097915e-05, + "loss": 0.1816, + "step": 5460 + }, + { + "epoch": 1.1052418538757336, + "grad_norm": 0.25901350378990173, + "learning_rate": 8.367678299886608e-05, + "loss": 0.1998, + "step": 5461 + }, + { + "epoch": 1.105444242056264, + "grad_norm": 0.5830068588256836, + "learning_rate": 8.364540327816483e-05, + "loss": 0.2226, + "step": 5462 + }, + { + "epoch": 1.1056466302367942, + "grad_norm": 0.2617352604866028, + "learning_rate": 8.361402521205005e-05, + "loss": 0.1773, + "step": 5463 + }, + { + "epoch": 1.1058490184173244, + "grad_norm": 0.30582985281944275, + "learning_rate": 8.358264880369629e-05, + "loss": 0.1983, + "step": 5464 + }, + { + "epoch": 1.1060514065978546, + "grad_norm": 0.2862445116043091, + "learning_rate": 8.355127405627783e-05, + "loss": 0.1742, + "step": 5465 + }, + { + "epoch": 1.106253794778385, + "grad_norm": 0.26939576864242554, + "learning_rate": 8.351990097296888e-05, + "loss": 0.1604, + "step": 5466 + }, + { + "epoch": 1.1064561829589152, + "grad_norm": 0.42266708612442017, + "learning_rate": 8.348852955694342e-05, + "loss": 0.198, + "step": 5467 + }, + { + "epoch": 1.1066585711394454, + "grad_norm": 0.2866937816143036, + "learning_rate": 8.34571598113753e-05, + "loss": 0.2114, + "step": 5468 + }, + { + "epoch": 1.1068609593199756, + "grad_norm": 0.30896636843681335, + "learning_rate": 8.342579173943818e-05, + "loss": 0.2039, + "step": 5469 + }, + { + "epoch": 1.107063347500506, + "grad_norm": 0.25059643387794495, + "learning_rate": 8.339442534430552e-05, + "loss": 0.1659, + "step": 5470 + }, + { + "epoch": 1.1072657356810363, + "grad_norm": 0.3479415774345398, + "learning_rate": 8.336306062915066e-05, + "loss": 0.1989, + "step": 5471 + }, + { + "epoch": 1.1074681238615665, + "grad_norm": 0.34983018040657043, + "learning_rate": 8.333169759714676e-05, + "loss": 0.196, + "step": 5472 + }, + { + "epoch": 1.1076705120420967, + "grad_norm": 0.33293071389198303, + "learning_rate": 8.33003362514668e-05, + "loss": 0.2463, + "step": 5473 + }, + { + "epoch": 1.107872900222627, + "grad_norm": 0.3174991309642792, + "learning_rate": 8.32689765952836e-05, + "loss": 0.1945, + "step": 5474 + }, + { + "epoch": 1.1080752884031573, + "grad_norm": 0.3204096853733063, + "learning_rate": 8.323761863176977e-05, + "loss": 0.1979, + "step": 5475 + }, + { + "epoch": 1.1082776765836875, + "grad_norm": 0.3265916705131531, + "learning_rate": 8.32062623640978e-05, + "loss": 0.1879, + "step": 5476 + }, + { + "epoch": 1.1084800647642177, + "grad_norm": 0.2553459703922272, + "learning_rate": 8.317490779544e-05, + "loss": 0.1804, + "step": 5477 + }, + { + "epoch": 1.108682452944748, + "grad_norm": 0.26617133617401123, + "learning_rate": 8.314355492896849e-05, + "loss": 0.1855, + "step": 5478 + }, + { + "epoch": 1.1088848411252783, + "grad_norm": 0.2975890040397644, + "learning_rate": 8.311220376785521e-05, + "loss": 0.1883, + "step": 5479 + }, + { + "epoch": 1.1090872293058085, + "grad_norm": 0.27596038579940796, + "learning_rate": 8.308085431527197e-05, + "loss": 0.2003, + "step": 5480 + }, + { + "epoch": 1.1092896174863387, + "grad_norm": 0.32845911383628845, + "learning_rate": 8.304950657439033e-05, + "loss": 0.2082, + "step": 5481 + }, + { + "epoch": 1.1094920056668691, + "grad_norm": 0.30826571583747864, + "learning_rate": 8.301816054838178e-05, + "loss": 0.2075, + "step": 5482 + }, + { + "epoch": 1.1096943938473993, + "grad_norm": 0.33803990483283997, + "learning_rate": 8.298681624041755e-05, + "loss": 0.1956, + "step": 5483 + }, + { + "epoch": 1.1098967820279295, + "grad_norm": 0.312499463558197, + "learning_rate": 8.295547365366873e-05, + "loss": 0.1909, + "step": 5484 + }, + { + "epoch": 1.1100991702084597, + "grad_norm": 0.24040253460407257, + "learning_rate": 8.292413279130624e-05, + "loss": 0.1785, + "step": 5485 + }, + { + "epoch": 1.1103015583889901, + "grad_norm": 0.30531415343284607, + "learning_rate": 8.289279365650084e-05, + "loss": 0.211, + "step": 5486 + }, + { + "epoch": 1.1105039465695203, + "grad_norm": 0.2846146523952484, + "learning_rate": 8.286145625242305e-05, + "loss": 0.1942, + "step": 5487 + }, + { + "epoch": 1.1107063347500505, + "grad_norm": 0.3238934278488159, + "learning_rate": 8.283012058224329e-05, + "loss": 0.2314, + "step": 5488 + }, + { + "epoch": 1.110908722930581, + "grad_norm": 0.26682621240615845, + "learning_rate": 8.279878664913177e-05, + "loss": 0.1915, + "step": 5489 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.28587329387664795, + "learning_rate": 8.276745445625852e-05, + "loss": 0.2053, + "step": 5490 + }, + { + "epoch": 1.1113134992916414, + "grad_norm": 0.25702354311943054, + "learning_rate": 8.273612400679346e-05, + "loss": 0.1832, + "step": 5491 + }, + { + "epoch": 1.1115158874721716, + "grad_norm": 0.39831334352493286, + "learning_rate": 8.270479530390617e-05, + "loss": 0.1789, + "step": 5492 + }, + { + "epoch": 1.111718275652702, + "grad_norm": 0.28756701946258545, + "learning_rate": 8.267346835076624e-05, + "loss": 0.2076, + "step": 5493 + }, + { + "epoch": 1.1119206638332322, + "grad_norm": 0.2621087431907654, + "learning_rate": 8.264214315054295e-05, + "loss": 0.2131, + "step": 5494 + }, + { + "epoch": 1.1121230520137624, + "grad_norm": 0.30722442269325256, + "learning_rate": 8.261081970640549e-05, + "loss": 0.1999, + "step": 5495 + }, + { + "epoch": 1.1123254401942926, + "grad_norm": 0.2630216181278229, + "learning_rate": 8.257949802152282e-05, + "loss": 0.2014, + "step": 5496 + }, + { + "epoch": 1.112527828374823, + "grad_norm": 0.3075388967990875, + "learning_rate": 8.254817809906377e-05, + "loss": 0.214, + "step": 5497 + }, + { + "epoch": 1.1127302165553532, + "grad_norm": 0.2784331440925598, + "learning_rate": 8.251685994219693e-05, + "loss": 0.2134, + "step": 5498 + }, + { + "epoch": 1.1129326047358834, + "grad_norm": 0.28776970505714417, + "learning_rate": 8.248554355409076e-05, + "loss": 0.1863, + "step": 5499 + }, + { + "epoch": 1.1131349929164136, + "grad_norm": 0.3288789689540863, + "learning_rate": 8.24542289379135e-05, + "loss": 0.2347, + "step": 5500 + }, + { + "epoch": 1.1131349929164136, + "eval_loss": 0.266431599855423, + "eval_runtime": 0.7362, + "eval_samples_per_second": 6.792, + "eval_steps_per_second": 1.358, + "step": 5500 + }, + { + "epoch": 1.113337381096944, + "grad_norm": 0.2643654942512512, + "learning_rate": 8.242291609683326e-05, + "loss": 0.1786, + "step": 5501 + }, + { + "epoch": 1.1135397692774742, + "grad_norm": 0.30738750100135803, + "learning_rate": 8.239160503401794e-05, + "loss": 0.1953, + "step": 5502 + }, + { + "epoch": 1.1137421574580044, + "grad_norm": 0.2640698552131653, + "learning_rate": 8.236029575263525e-05, + "loss": 0.1898, + "step": 5503 + }, + { + "epoch": 1.1139445456385346, + "grad_norm": 0.27601543068885803, + "learning_rate": 8.232898825585275e-05, + "loss": 0.188, + "step": 5504 + }, + { + "epoch": 1.114146933819065, + "grad_norm": 0.2772660255432129, + "learning_rate": 8.22976825468378e-05, + "loss": 0.2217, + "step": 5505 + }, + { + "epoch": 1.1143493219995952, + "grad_norm": 0.27593713998794556, + "learning_rate": 8.226637862875758e-05, + "loss": 0.1725, + "step": 5506 + }, + { + "epoch": 1.1145517101801254, + "grad_norm": 0.2594548463821411, + "learning_rate": 8.22350765047791e-05, + "loss": 0.1799, + "step": 5507 + }, + { + "epoch": 1.1147540983606556, + "grad_norm": 0.3393990397453308, + "learning_rate": 8.220377617806916e-05, + "loss": 0.189, + "step": 5508 + }, + { + "epoch": 1.114956486541186, + "grad_norm": 0.28749755024909973, + "learning_rate": 8.217247765179442e-05, + "loss": 0.1873, + "step": 5509 + }, + { + "epoch": 1.1151588747217163, + "grad_norm": 0.27113446593284607, + "learning_rate": 8.214118092912133e-05, + "loss": 0.2105, + "step": 5510 + }, + { + "epoch": 1.1153612629022465, + "grad_norm": 0.2811520993709564, + "learning_rate": 8.210988601321616e-05, + "loss": 0.2098, + "step": 5511 + }, + { + "epoch": 1.1155636510827767, + "grad_norm": 0.29087939858436584, + "learning_rate": 8.207859290724501e-05, + "loss": 0.1824, + "step": 5512 + }, + { + "epoch": 1.115766039263307, + "grad_norm": 0.2858453392982483, + "learning_rate": 8.204730161437383e-05, + "loss": 0.212, + "step": 5513 + }, + { + "epoch": 1.1159684274438373, + "grad_norm": 0.31338831782341003, + "learning_rate": 8.201601213776824e-05, + "loss": 0.2128, + "step": 5514 + }, + { + "epoch": 1.1161708156243675, + "grad_norm": 0.32885226607322693, + "learning_rate": 8.198472448059385e-05, + "loss": 0.1949, + "step": 5515 + }, + { + "epoch": 1.1163732038048977, + "grad_norm": 0.27789217233657837, + "learning_rate": 8.1953438646016e-05, + "loss": 0.1884, + "step": 5516 + }, + { + "epoch": 1.116575591985428, + "grad_norm": 0.29066577553749084, + "learning_rate": 8.192215463719986e-05, + "loss": 0.1751, + "step": 5517 + }, + { + "epoch": 1.1167779801659583, + "grad_norm": 0.2519930899143219, + "learning_rate": 8.189087245731045e-05, + "loss": 0.1754, + "step": 5518 + }, + { + "epoch": 1.1169803683464885, + "grad_norm": 0.26689547300338745, + "learning_rate": 8.185959210951252e-05, + "loss": 0.1657, + "step": 5519 + }, + { + "epoch": 1.117182756527019, + "grad_norm": 0.2732199728488922, + "learning_rate": 8.182831359697071e-05, + "loss": 0.1926, + "step": 5520 + }, + { + "epoch": 1.1173851447075491, + "grad_norm": 0.2849332392215729, + "learning_rate": 8.179703692284948e-05, + "loss": 0.1994, + "step": 5521 + }, + { + "epoch": 1.1175875328880793, + "grad_norm": 0.30812686681747437, + "learning_rate": 8.176576209031304e-05, + "loss": 0.2105, + "step": 5522 + }, + { + "epoch": 1.1177899210686095, + "grad_norm": 0.301147997379303, + "learning_rate": 8.173448910252548e-05, + "loss": 0.1888, + "step": 5523 + }, + { + "epoch": 1.11799230924914, + "grad_norm": 0.2884552478790283, + "learning_rate": 8.170321796265064e-05, + "loss": 0.1824, + "step": 5524 + }, + { + "epoch": 1.1181946974296701, + "grad_norm": 0.28129085898399353, + "learning_rate": 8.16719486738522e-05, + "loss": 0.2102, + "step": 5525 + }, + { + "epoch": 1.1183970856102003, + "grad_norm": 0.2771812081336975, + "learning_rate": 8.16406812392937e-05, + "loss": 0.1954, + "step": 5526 + }, + { + "epoch": 1.1185994737907305, + "grad_norm": 0.4494684934616089, + "learning_rate": 8.160941566213843e-05, + "loss": 0.2242, + "step": 5527 + }, + { + "epoch": 1.118801861971261, + "grad_norm": 0.29533445835113525, + "learning_rate": 8.157815194554952e-05, + "loss": 0.2021, + "step": 5528 + }, + { + "epoch": 1.1190042501517912, + "grad_norm": 0.34723520278930664, + "learning_rate": 8.154689009268988e-05, + "loss": 0.2154, + "step": 5529 + }, + { + "epoch": 1.1192066383323214, + "grad_norm": 0.3085881173610687, + "learning_rate": 8.15156301067223e-05, + "loss": 0.2047, + "step": 5530 + }, + { + "epoch": 1.1194090265128516, + "grad_norm": 0.2965511977672577, + "learning_rate": 8.14843719908093e-05, + "loss": 0.1804, + "step": 5531 + }, + { + "epoch": 1.119611414693382, + "grad_norm": 0.2811489403247833, + "learning_rate": 8.145311574811325e-05, + "loss": 0.2255, + "step": 5532 + }, + { + "epoch": 1.1198138028739122, + "grad_norm": 0.2637896239757538, + "learning_rate": 8.142186138179635e-05, + "loss": 0.1785, + "step": 5533 + }, + { + "epoch": 1.1200161910544424, + "grad_norm": 0.28036361932754517, + "learning_rate": 8.139060889502056e-05, + "loss": 0.1853, + "step": 5534 + }, + { + "epoch": 1.1202185792349726, + "grad_norm": 0.2461838275194168, + "learning_rate": 8.135935829094772e-05, + "loss": 0.1899, + "step": 5535 + }, + { + "epoch": 1.120420967415503, + "grad_norm": 0.296796977519989, + "learning_rate": 8.132810957273944e-05, + "loss": 0.1901, + "step": 5536 + }, + { + "epoch": 1.1206233555960332, + "grad_norm": 0.25972190499305725, + "learning_rate": 8.129686274355709e-05, + "loss": 0.168, + "step": 5537 + }, + { + "epoch": 1.1208257437765634, + "grad_norm": 0.26876628398895264, + "learning_rate": 8.12656178065619e-05, + "loss": 0.224, + "step": 5538 + }, + { + "epoch": 1.1210281319570936, + "grad_norm": 0.35002103447914124, + "learning_rate": 8.123437476491492e-05, + "loss": 0.1862, + "step": 5539 + }, + { + "epoch": 1.121230520137624, + "grad_norm": 0.26477545499801636, + "learning_rate": 8.1203133621777e-05, + "loss": 0.1967, + "step": 5540 + }, + { + "epoch": 1.1214329083181542, + "grad_norm": 0.303501158952713, + "learning_rate": 8.117189438030879e-05, + "loss": 0.2196, + "step": 5541 + }, + { + "epoch": 1.1216352964986844, + "grad_norm": 0.2881081998348236, + "learning_rate": 8.114065704367074e-05, + "loss": 0.1848, + "step": 5542 + }, + { + "epoch": 1.1218376846792146, + "grad_norm": 0.33362969756126404, + "learning_rate": 8.110942161502313e-05, + "loss": 0.228, + "step": 5543 + }, + { + "epoch": 1.122040072859745, + "grad_norm": 0.28015798330307007, + "learning_rate": 8.107818809752602e-05, + "loss": 0.2227, + "step": 5544 + }, + { + "epoch": 1.1222424610402753, + "grad_norm": 0.29228246212005615, + "learning_rate": 8.104695649433928e-05, + "loss": 0.2313, + "step": 5545 + }, + { + "epoch": 1.1224448492208055, + "grad_norm": 0.2991606295108795, + "learning_rate": 8.101572680862264e-05, + "loss": 0.2007, + "step": 5546 + }, + { + "epoch": 1.1226472374013357, + "grad_norm": 0.31790077686309814, + "learning_rate": 8.098449904353554e-05, + "loss": 0.2082, + "step": 5547 + }, + { + "epoch": 1.122849625581866, + "grad_norm": 0.2826329171657562, + "learning_rate": 8.09532732022373e-05, + "loss": 0.1866, + "step": 5548 + }, + { + "epoch": 1.1230520137623963, + "grad_norm": 0.26081180572509766, + "learning_rate": 8.092204928788703e-05, + "loss": 0.203, + "step": 5549 + }, + { + "epoch": 1.1232544019429265, + "grad_norm": 0.2624530792236328, + "learning_rate": 8.089082730364363e-05, + "loss": 0.2001, + "step": 5550 + }, + { + "epoch": 1.1232544019429265, + "eval_loss": 0.2674531638622284, + "eval_runtime": 0.7399, + "eval_samples_per_second": 6.758, + "eval_steps_per_second": 1.352, + "step": 5550 + }, + { + "epoch": 1.123456790123457, + "grad_norm": 0.26963090896606445, + "learning_rate": 8.085960725266581e-05, + "loss": 0.1897, + "step": 5551 + }, + { + "epoch": 1.123659178303987, + "grad_norm": 0.26589468121528625, + "learning_rate": 8.08283891381121e-05, + "loss": 0.1715, + "step": 5552 + }, + { + "epoch": 1.1238615664845173, + "grad_norm": 0.24697373807430267, + "learning_rate": 8.079717296314079e-05, + "loss": 0.1569, + "step": 5553 + }, + { + "epoch": 1.1240639546650475, + "grad_norm": 0.26703882217407227, + "learning_rate": 8.076595873091001e-05, + "loss": 0.1884, + "step": 5554 + }, + { + "epoch": 1.124266342845578, + "grad_norm": 0.30867865681648254, + "learning_rate": 8.073474644457774e-05, + "loss": 0.2118, + "step": 5555 + }, + { + "epoch": 1.1244687310261081, + "grad_norm": 0.2832573354244232, + "learning_rate": 8.070353610730168e-05, + "loss": 0.212, + "step": 5556 + }, + { + "epoch": 1.1246711192066383, + "grad_norm": 0.31668218970298767, + "learning_rate": 8.067232772223934e-05, + "loss": 0.1914, + "step": 5557 + }, + { + "epoch": 1.1248735073871685, + "grad_norm": 0.3052870035171509, + "learning_rate": 8.064112129254814e-05, + "loss": 0.2067, + "step": 5558 + }, + { + "epoch": 1.125075895567699, + "grad_norm": 0.25671231746673584, + "learning_rate": 8.06099168213851e-05, + "loss": 0.1761, + "step": 5559 + }, + { + "epoch": 1.1252782837482291, + "grad_norm": 0.37939679622650146, + "learning_rate": 8.057871431190723e-05, + "loss": 0.2041, + "step": 5560 + }, + { + "epoch": 1.1254806719287593, + "grad_norm": 0.276947021484375, + "learning_rate": 8.054751376727125e-05, + "loss": 0.176, + "step": 5561 + }, + { + "epoch": 1.1256830601092895, + "grad_norm": 0.26697877049446106, + "learning_rate": 8.051631519063372e-05, + "loss": 0.1857, + "step": 5562 + }, + { + "epoch": 1.12588544828982, + "grad_norm": 0.33952033519744873, + "learning_rate": 8.048511858515099e-05, + "loss": 0.1875, + "step": 5563 + }, + { + "epoch": 1.1260878364703502, + "grad_norm": 0.35365763306617737, + "learning_rate": 8.045392395397919e-05, + "loss": 0.1963, + "step": 5564 + }, + { + "epoch": 1.1262902246508804, + "grad_norm": 0.30936864018440247, + "learning_rate": 8.042273130027425e-05, + "loss": 0.2162, + "step": 5565 + }, + { + "epoch": 1.1264926128314106, + "grad_norm": 0.23229964077472687, + "learning_rate": 8.039154062719195e-05, + "loss": 0.1731, + "step": 5566 + }, + { + "epoch": 1.126695001011941, + "grad_norm": 0.2607414424419403, + "learning_rate": 8.036035193788782e-05, + "loss": 0.1897, + "step": 5567 + }, + { + "epoch": 1.1268973891924712, + "grad_norm": 0.29121124744415283, + "learning_rate": 8.03291652355172e-05, + "loss": 0.2099, + "step": 5568 + }, + { + "epoch": 1.1270997773730014, + "grad_norm": 0.2816689610481262, + "learning_rate": 8.029798052323524e-05, + "loss": 0.1858, + "step": 5569 + }, + { + "epoch": 1.1273021655535316, + "grad_norm": 0.29074355959892273, + "learning_rate": 8.026679780419689e-05, + "loss": 0.1957, + "step": 5570 + }, + { + "epoch": 1.127504553734062, + "grad_norm": 0.2680456340312958, + "learning_rate": 8.023561708155687e-05, + "loss": 0.1999, + "step": 5571 + }, + { + "epoch": 1.1277069419145922, + "grad_norm": 0.371389776468277, + "learning_rate": 8.020443835846973e-05, + "loss": 0.2162, + "step": 5572 + }, + { + "epoch": 1.1279093300951224, + "grad_norm": 0.27187761664390564, + "learning_rate": 8.017326163808981e-05, + "loss": 0.1823, + "step": 5573 + }, + { + "epoch": 1.1281117182756528, + "grad_norm": 0.2521708011627197, + "learning_rate": 8.014208692357121e-05, + "loss": 0.1898, + "step": 5574 + }, + { + "epoch": 1.128314106456183, + "grad_norm": 0.26570048928260803, + "learning_rate": 8.011091421806792e-05, + "loss": 0.1737, + "step": 5575 + }, + { + "epoch": 1.1285164946367132, + "grad_norm": 0.29539164900779724, + "learning_rate": 8.007974352473362e-05, + "loss": 0.2042, + "step": 5576 + }, + { + "epoch": 1.1287188828172434, + "grad_norm": 0.3029521107673645, + "learning_rate": 8.004857484672186e-05, + "loss": 0.2217, + "step": 5577 + }, + { + "epoch": 1.1289212709977736, + "grad_norm": 0.22743040323257446, + "learning_rate": 8.001740818718595e-05, + "loss": 0.1799, + "step": 5578 + }, + { + "epoch": 1.129123659178304, + "grad_norm": 0.2918127179145813, + "learning_rate": 7.9986243549279e-05, + "loss": 0.2112, + "step": 5579 + }, + { + "epoch": 1.1293260473588342, + "grad_norm": 0.2503848671913147, + "learning_rate": 7.995508093615394e-05, + "loss": 0.1691, + "step": 5580 + }, + { + "epoch": 1.1295284355393644, + "grad_norm": 0.3075704276561737, + "learning_rate": 7.992392035096345e-05, + "loss": 0.1898, + "step": 5581 + }, + { + "epoch": 1.1297308237198949, + "grad_norm": 0.22540538012981415, + "learning_rate": 7.989276179686002e-05, + "loss": 0.1709, + "step": 5582 + }, + { + "epoch": 1.129933211900425, + "grad_norm": 0.3081169128417969, + "learning_rate": 7.9861605276996e-05, + "loss": 0.1968, + "step": 5583 + }, + { + "epoch": 1.1301356000809553, + "grad_norm": 0.2457832396030426, + "learning_rate": 7.983045079452344e-05, + "loss": 0.1791, + "step": 5584 + }, + { + "epoch": 1.1303379882614855, + "grad_norm": 0.3851803243160248, + "learning_rate": 7.979929835259422e-05, + "loss": 0.1885, + "step": 5585 + }, + { + "epoch": 1.1305403764420159, + "grad_norm": 0.29105520248413086, + "learning_rate": 7.976814795436004e-05, + "loss": 0.1779, + "step": 5586 + }, + { + "epoch": 1.130742764622546, + "grad_norm": 0.9472611546516418, + "learning_rate": 7.973699960297236e-05, + "loss": 0.1643, + "step": 5587 + }, + { + "epoch": 1.1309451528030763, + "grad_norm": 0.2521185576915741, + "learning_rate": 7.970585330158244e-05, + "loss": 0.1771, + "step": 5588 + }, + { + "epoch": 1.1311475409836065, + "grad_norm": 0.2751530706882477, + "learning_rate": 7.967470905334133e-05, + "loss": 0.1961, + "step": 5589 + }, + { + "epoch": 1.131349929164137, + "grad_norm": 0.29079005122184753, + "learning_rate": 7.96435668613999e-05, + "loss": 0.1781, + "step": 5590 + }, + { + "epoch": 1.131552317344667, + "grad_norm": 0.27955174446105957, + "learning_rate": 7.961242672890877e-05, + "loss": 0.1799, + "step": 5591 + }, + { + "epoch": 1.1317547055251973, + "grad_norm": 0.3343000113964081, + "learning_rate": 7.958128865901838e-05, + "loss": 0.1942, + "step": 5592 + }, + { + "epoch": 1.1319570937057275, + "grad_norm": 0.2868947982788086, + "learning_rate": 7.955015265487895e-05, + "loss": 0.2287, + "step": 5593 + }, + { + "epoch": 1.132159481886258, + "grad_norm": 0.33234843611717224, + "learning_rate": 7.95190187196405e-05, + "loss": 0.2211, + "step": 5594 + }, + { + "epoch": 1.1323618700667881, + "grad_norm": 0.3007453382015228, + "learning_rate": 7.948788685645284e-05, + "loss": 0.1778, + "step": 5595 + }, + { + "epoch": 1.1325642582473183, + "grad_norm": 0.30757924914360046, + "learning_rate": 7.945675706846555e-05, + "loss": 0.2153, + "step": 5596 + }, + { + "epoch": 1.1327666464278485, + "grad_norm": 0.2718496024608612, + "learning_rate": 7.942562935882803e-05, + "loss": 0.1933, + "step": 5597 + }, + { + "epoch": 1.132969034608379, + "grad_norm": 0.2909453511238098, + "learning_rate": 7.939450373068942e-05, + "loss": 0.1942, + "step": 5598 + }, + { + "epoch": 1.1331714227889091, + "grad_norm": 0.27906984090805054, + "learning_rate": 7.936338018719873e-05, + "loss": 0.1797, + "step": 5599 + }, + { + "epoch": 1.1333738109694393, + "grad_norm": 0.29126763343811035, + "learning_rate": 7.93322587315047e-05, + "loss": 0.1891, + "step": 5600 + }, + { + "epoch": 1.1333738109694393, + "eval_loss": 0.27354076504707336, + "eval_runtime": 0.7387, + "eval_samples_per_second": 6.768, + "eval_steps_per_second": 1.354, + "step": 5600 + }, + { + "epoch": 1.1335761991499695, + "grad_norm": 0.3037993311882019, + "learning_rate": 7.930113936675587e-05, + "loss": 0.1996, + "step": 5601 + }, + { + "epoch": 1.1337785873305, + "grad_norm": 0.2610785663127899, + "learning_rate": 7.927002209610058e-05, + "loss": 0.1559, + "step": 5602 + }, + { + "epoch": 1.1339809755110302, + "grad_norm": 0.31370899081230164, + "learning_rate": 7.923890692268692e-05, + "loss": 0.2117, + "step": 5603 + }, + { + "epoch": 1.1341833636915604, + "grad_norm": 0.3044915497303009, + "learning_rate": 7.92077938496628e-05, + "loss": 0.1878, + "step": 5604 + }, + { + "epoch": 1.1343857518720908, + "grad_norm": 0.26320895552635193, + "learning_rate": 7.917668288017595e-05, + "loss": 0.1882, + "step": 5605 + }, + { + "epoch": 1.134588140052621, + "grad_norm": 0.2812601923942566, + "learning_rate": 7.914557401737381e-05, + "loss": 0.1816, + "step": 5606 + }, + { + "epoch": 1.1347905282331512, + "grad_norm": 0.36465582251548767, + "learning_rate": 7.911446726440369e-05, + "loss": 0.234, + "step": 5607 + }, + { + "epoch": 1.1349929164136814, + "grad_norm": 0.39917242527008057, + "learning_rate": 7.908336262441261e-05, + "loss": 0.2317, + "step": 5608 + }, + { + "epoch": 1.1351953045942116, + "grad_norm": 0.2993834316730499, + "learning_rate": 7.905226010054741e-05, + "loss": 0.2286, + "step": 5609 + }, + { + "epoch": 1.135397692774742, + "grad_norm": 0.24275238811969757, + "learning_rate": 7.902115969595474e-05, + "loss": 0.1344, + "step": 5610 + }, + { + "epoch": 1.1356000809552722, + "grad_norm": 0.2810470461845398, + "learning_rate": 7.899006141378102e-05, + "loss": 0.186, + "step": 5611 + }, + { + "epoch": 1.1358024691358024, + "grad_norm": 0.24735689163208008, + "learning_rate": 7.895896525717241e-05, + "loss": 0.198, + "step": 5612 + }, + { + "epoch": 1.1360048573163328, + "grad_norm": 0.2500055134296417, + "learning_rate": 7.892787122927493e-05, + "loss": 0.1961, + "step": 5613 + }, + { + "epoch": 1.136207245496863, + "grad_norm": 0.2348107546567917, + "learning_rate": 7.889677933323431e-05, + "loss": 0.175, + "step": 5614 + }, + { + "epoch": 1.1364096336773932, + "grad_norm": 0.27956265211105347, + "learning_rate": 7.886568957219615e-05, + "loss": 0.1484, + "step": 5615 + }, + { + "epoch": 1.1366120218579234, + "grad_norm": 0.3258465528488159, + "learning_rate": 7.883460194930575e-05, + "loss": 0.2197, + "step": 5616 + }, + { + "epoch": 1.1368144100384538, + "grad_norm": 0.3220537602901459, + "learning_rate": 7.880351646770824e-05, + "loss": 0.2107, + "step": 5617 + }, + { + "epoch": 1.137016798218984, + "grad_norm": 0.30312204360961914, + "learning_rate": 7.877243313054851e-05, + "loss": 0.1828, + "step": 5618 + }, + { + "epoch": 1.1372191863995142, + "grad_norm": 0.3163522183895111, + "learning_rate": 7.874135194097128e-05, + "loss": 0.1994, + "step": 5619 + }, + { + "epoch": 1.1374215745800444, + "grad_norm": 0.2842673063278198, + "learning_rate": 7.871027290212097e-05, + "loss": 0.2129, + "step": 5620 + }, + { + "epoch": 1.1376239627605749, + "grad_norm": 0.3099961578845978, + "learning_rate": 7.867919601714186e-05, + "loss": 0.2129, + "step": 5621 + }, + { + "epoch": 1.137826350941105, + "grad_norm": 0.26963162422180176, + "learning_rate": 7.8648121289178e-05, + "loss": 0.1578, + "step": 5622 + }, + { + "epoch": 1.1380287391216353, + "grad_norm": 0.27318212389945984, + "learning_rate": 7.861704872137318e-05, + "loss": 0.184, + "step": 5623 + }, + { + "epoch": 1.1382311273021655, + "grad_norm": 0.26080846786499023, + "learning_rate": 7.858597831687102e-05, + "loss": 0.168, + "step": 5624 + }, + { + "epoch": 1.138433515482696, + "grad_norm": 0.3094097077846527, + "learning_rate": 7.855491007881485e-05, + "loss": 0.2037, + "step": 5625 + }, + { + "epoch": 1.138635903663226, + "grad_norm": 0.33787310123443604, + "learning_rate": 7.852384401034785e-05, + "loss": 0.2268, + "step": 5626 + }, + { + "epoch": 1.1388382918437563, + "grad_norm": 0.2730053663253784, + "learning_rate": 7.849278011461298e-05, + "loss": 0.1785, + "step": 5627 + }, + { + "epoch": 1.1390406800242865, + "grad_norm": 0.27871865034103394, + "learning_rate": 7.846171839475295e-05, + "loss": 0.2044, + "step": 5628 + }, + { + "epoch": 1.139243068204817, + "grad_norm": 0.2883583605289459, + "learning_rate": 7.843065885391025e-05, + "loss": 0.2119, + "step": 5629 + }, + { + "epoch": 1.139445456385347, + "grad_norm": 0.2682492733001709, + "learning_rate": 7.839960149522715e-05, + "loss": 0.1624, + "step": 5630 + }, + { + "epoch": 1.1396478445658773, + "grad_norm": 0.27851811051368713, + "learning_rate": 7.836854632184575e-05, + "loss": 0.2059, + "step": 5631 + }, + { + "epoch": 1.1398502327464075, + "grad_norm": 0.3172750771045685, + "learning_rate": 7.833749333690783e-05, + "loss": 0.1817, + "step": 5632 + }, + { + "epoch": 1.140052620926938, + "grad_norm": 0.3163038492202759, + "learning_rate": 7.830644254355504e-05, + "loss": 0.1772, + "step": 5633 + }, + { + "epoch": 1.1402550091074681, + "grad_norm": 0.3040738105773926, + "learning_rate": 7.827539394492878e-05, + "loss": 0.2129, + "step": 5634 + }, + { + "epoch": 1.1404573972879983, + "grad_norm": 0.29727524518966675, + "learning_rate": 7.824434754417018e-05, + "loss": 0.1933, + "step": 5635 + }, + { + "epoch": 1.1406597854685288, + "grad_norm": 0.2724418044090271, + "learning_rate": 7.821330334442023e-05, + "loss": 0.1936, + "step": 5636 + }, + { + "epoch": 1.140862173649059, + "grad_norm": 0.30874359607696533, + "learning_rate": 7.818226134881965e-05, + "loss": 0.1836, + "step": 5637 + }, + { + "epoch": 1.1410645618295892, + "grad_norm": 0.2828230559825897, + "learning_rate": 7.815122156050893e-05, + "loss": 0.1892, + "step": 5638 + }, + { + "epoch": 1.1412669500101194, + "grad_norm": 0.24108606576919556, + "learning_rate": 7.812018398262834e-05, + "loss": 0.179, + "step": 5639 + }, + { + "epoch": 1.1414693381906496, + "grad_norm": 0.30316025018692017, + "learning_rate": 7.808914861831797e-05, + "loss": 0.1994, + "step": 5640 + }, + { + "epoch": 1.14167172637118, + "grad_norm": 0.2653295695781708, + "learning_rate": 7.805811547071763e-05, + "loss": 0.1821, + "step": 5641 + }, + { + "epoch": 1.1418741145517102, + "grad_norm": 0.2590709328651428, + "learning_rate": 7.802708454296694e-05, + "loss": 0.1874, + "step": 5642 + }, + { + "epoch": 1.1420765027322404, + "grad_norm": 0.2897018790245056, + "learning_rate": 7.799605583820527e-05, + "loss": 0.1935, + "step": 5643 + }, + { + "epoch": 1.1422788909127708, + "grad_norm": 0.2743369936943054, + "learning_rate": 7.796502935957178e-05, + "loss": 0.1983, + "step": 5644 + }, + { + "epoch": 1.142481279093301, + "grad_norm": 0.3204341530799866, + "learning_rate": 7.793400511020541e-05, + "loss": 0.2016, + "step": 5645 + }, + { + "epoch": 1.1426836672738312, + "grad_norm": 0.2664974629878998, + "learning_rate": 7.790298309324489e-05, + "loss": 0.1804, + "step": 5646 + }, + { + "epoch": 1.1428860554543614, + "grad_norm": 0.3154468536376953, + "learning_rate": 7.787196331182869e-05, + "loss": 0.2191, + "step": 5647 + }, + { + "epoch": 1.1430884436348918, + "grad_norm": 0.2492353469133377, + "learning_rate": 7.784094576909503e-05, + "loss": 0.1782, + "step": 5648 + }, + { + "epoch": 1.143290831815422, + "grad_norm": 0.25519439578056335, + "learning_rate": 7.780993046818194e-05, + "loss": 0.1779, + "step": 5649 + }, + { + "epoch": 1.1434932199959522, + "grad_norm": 0.28019189834594727, + "learning_rate": 7.777891741222727e-05, + "loss": 0.1686, + "step": 5650 + }, + { + "epoch": 1.1434932199959522, + "eval_loss": 0.26935452222824097, + "eval_runtime": 0.7379, + "eval_samples_per_second": 6.776, + "eval_steps_per_second": 1.355, + "step": 5650 + }, + { + "epoch": 1.1436956081764824, + "grad_norm": 0.303603857755661, + "learning_rate": 7.774790660436858e-05, + "loss": 0.1917, + "step": 5651 + }, + { + "epoch": 1.1438979963570128, + "grad_norm": 0.27201592922210693, + "learning_rate": 7.77168980477432e-05, + "loss": 0.1909, + "step": 5652 + }, + { + "epoch": 1.144100384537543, + "grad_norm": 0.24054737389087677, + "learning_rate": 7.768589174548826e-05, + "loss": 0.1763, + "step": 5653 + }, + { + "epoch": 1.1443027727180732, + "grad_norm": 0.3936968743801117, + "learning_rate": 7.765488770074066e-05, + "loss": 0.1579, + "step": 5654 + }, + { + "epoch": 1.1445051608986034, + "grad_norm": 0.3285905420780182, + "learning_rate": 7.762388591663705e-05, + "loss": 0.2147, + "step": 5655 + }, + { + "epoch": 1.1447075490791339, + "grad_norm": 0.2840346693992615, + "learning_rate": 7.759288639631388e-05, + "loss": 0.2141, + "step": 5656 + }, + { + "epoch": 1.144909937259664, + "grad_norm": 0.3871752619743347, + "learning_rate": 7.756188914290736e-05, + "loss": 0.2267, + "step": 5657 + }, + { + "epoch": 1.1451123254401943, + "grad_norm": 0.2643132507801056, + "learning_rate": 7.753089415955343e-05, + "loss": 0.2084, + "step": 5658 + }, + { + "epoch": 1.1453147136207245, + "grad_norm": 0.22197052836418152, + "learning_rate": 7.749990144938788e-05, + "loss": 0.181, + "step": 5659 + }, + { + "epoch": 1.1455171018012549, + "grad_norm": 0.36441561579704285, + "learning_rate": 7.74689110155462e-05, + "loss": 0.2048, + "step": 5660 + }, + { + "epoch": 1.145719489981785, + "grad_norm": 0.2794022560119629, + "learning_rate": 7.743792286116372e-05, + "loss": 0.1842, + "step": 5661 + }, + { + "epoch": 1.1459218781623153, + "grad_norm": 0.2635161876678467, + "learning_rate": 7.740693698937542e-05, + "loss": 0.1906, + "step": 5662 + }, + { + "epoch": 1.1461242663428455, + "grad_norm": 0.30082035064697266, + "learning_rate": 7.73759534033162e-05, + "loss": 0.2073, + "step": 5663 + }, + { + "epoch": 1.146326654523376, + "grad_norm": 0.3124663829803467, + "learning_rate": 7.73449721061206e-05, + "loss": 0.2077, + "step": 5664 + }, + { + "epoch": 1.146529042703906, + "grad_norm": 0.26671409606933594, + "learning_rate": 7.731399310092303e-05, + "loss": 0.2005, + "step": 5665 + }, + { + "epoch": 1.1467314308844363, + "grad_norm": 0.27739301323890686, + "learning_rate": 7.728301639085758e-05, + "loss": 0.1975, + "step": 5666 + }, + { + "epoch": 1.1469338190649667, + "grad_norm": 0.3559306859970093, + "learning_rate": 7.725204197905818e-05, + "loss": 0.1811, + "step": 5667 + }, + { + "epoch": 1.147136207245497, + "grad_norm": 0.27497610449790955, + "learning_rate": 7.722106986865846e-05, + "loss": 0.1797, + "step": 5668 + }, + { + "epoch": 1.1473385954260271, + "grad_norm": 0.2708543837070465, + "learning_rate": 7.719010006279193e-05, + "loss": 0.2243, + "step": 5669 + }, + { + "epoch": 1.1475409836065573, + "grad_norm": 0.28557464480400085, + "learning_rate": 7.715913256459168e-05, + "loss": 0.1541, + "step": 5670 + }, + { + "epoch": 1.1477433717870875, + "grad_norm": 0.24130353331565857, + "learning_rate": 7.712816737719075e-05, + "loss": 0.1835, + "step": 5671 + }, + { + "epoch": 1.147945759967618, + "grad_norm": 0.32948946952819824, + "learning_rate": 7.709720450372184e-05, + "loss": 0.2035, + "step": 5672 + }, + { + "epoch": 1.1481481481481481, + "grad_norm": 0.2876203954219818, + "learning_rate": 7.706624394731746e-05, + "loss": 0.1851, + "step": 5673 + }, + { + "epoch": 1.1483505363286783, + "grad_norm": 0.281044065952301, + "learning_rate": 7.703528571110988e-05, + "loss": 0.1947, + "step": 5674 + }, + { + "epoch": 1.1485529245092088, + "grad_norm": 0.3992454707622528, + "learning_rate": 7.700432979823113e-05, + "loss": 0.2331, + "step": 5675 + }, + { + "epoch": 1.148755312689739, + "grad_norm": 0.2621214687824249, + "learning_rate": 7.6973376211813e-05, + "loss": 0.179, + "step": 5676 + }, + { + "epoch": 1.1489577008702692, + "grad_norm": 0.26856037974357605, + "learning_rate": 7.694242495498705e-05, + "loss": 0.1815, + "step": 5677 + }, + { + "epoch": 1.1491600890507994, + "grad_norm": 0.28116244077682495, + "learning_rate": 7.691147603088456e-05, + "loss": 0.1928, + "step": 5678 + }, + { + "epoch": 1.1493624772313298, + "grad_norm": 0.2723270654678345, + "learning_rate": 7.68805294426367e-05, + "loss": 0.1825, + "step": 5679 + }, + { + "epoch": 1.14956486541186, + "grad_norm": 0.29364144802093506, + "learning_rate": 7.684958519337429e-05, + "loss": 0.1983, + "step": 5680 + }, + { + "epoch": 1.1497672535923902, + "grad_norm": 0.3394298851490021, + "learning_rate": 7.681864328622792e-05, + "loss": 0.1917, + "step": 5681 + }, + { + "epoch": 1.1499696417729204, + "grad_norm": 0.2952880263328552, + "learning_rate": 7.678770372432799e-05, + "loss": 0.1874, + "step": 5682 + }, + { + "epoch": 1.1501720299534508, + "grad_norm": 0.2824990153312683, + "learning_rate": 7.675676651080464e-05, + "loss": 0.177, + "step": 5683 + }, + { + "epoch": 1.150374418133981, + "grad_norm": 0.29086270928382874, + "learning_rate": 7.672583164878775e-05, + "loss": 0.1867, + "step": 5684 + }, + { + "epoch": 1.1505768063145112, + "grad_norm": 0.27432793378829956, + "learning_rate": 7.669489914140701e-05, + "loss": 0.1921, + "step": 5685 + }, + { + "epoch": 1.1507791944950414, + "grad_norm": 0.32622668147087097, + "learning_rate": 7.666396899179183e-05, + "loss": 0.2174, + "step": 5686 + }, + { + "epoch": 1.1509815826755718, + "grad_norm": 0.3424051105976105, + "learning_rate": 7.663304120307141e-05, + "loss": 0.2197, + "step": 5687 + }, + { + "epoch": 1.151183970856102, + "grad_norm": 0.3257910907268524, + "learning_rate": 7.660211577837469e-05, + "loss": 0.2088, + "step": 5688 + }, + { + "epoch": 1.1513863590366322, + "grad_norm": 0.27886244654655457, + "learning_rate": 7.657119272083039e-05, + "loss": 0.1725, + "step": 5689 + }, + { + "epoch": 1.1515887472171624, + "grad_norm": 0.3835498094558716, + "learning_rate": 7.654027203356699e-05, + "loss": 0.2182, + "step": 5690 + }, + { + "epoch": 1.1517911353976928, + "grad_norm": 0.3601949214935303, + "learning_rate": 7.650935371971272e-05, + "loss": 0.1524, + "step": 5691 + }, + { + "epoch": 1.151993523578223, + "grad_norm": 0.3818325400352478, + "learning_rate": 7.647843778239554e-05, + "loss": 0.2032, + "step": 5692 + }, + { + "epoch": 1.1521959117587532, + "grad_norm": 0.24166607856750488, + "learning_rate": 7.64475242247432e-05, + "loss": 0.1826, + "step": 5693 + }, + { + "epoch": 1.1523982999392834, + "grad_norm": 0.2476678043603897, + "learning_rate": 7.641661304988322e-05, + "loss": 0.1766, + "step": 5694 + }, + { + "epoch": 1.1526006881198139, + "grad_norm": 0.282958447933197, + "learning_rate": 7.638570426094288e-05, + "loss": 0.2119, + "step": 5695 + }, + { + "epoch": 1.152803076300344, + "grad_norm": 0.251973420381546, + "learning_rate": 7.63547978610492e-05, + "loss": 0.1762, + "step": 5696 + }, + { + "epoch": 1.1530054644808743, + "grad_norm": 0.32014891505241394, + "learning_rate": 7.632389385332898e-05, + "loss": 0.1977, + "step": 5697 + }, + { + "epoch": 1.1532078526614047, + "grad_norm": 0.2665617763996124, + "learning_rate": 7.629299224090873e-05, + "loss": 0.1813, + "step": 5698 + }, + { + "epoch": 1.1534102408419349, + "grad_norm": 0.31495678424835205, + "learning_rate": 7.626209302691478e-05, + "loss": 0.2024, + "step": 5699 + }, + { + "epoch": 1.153612629022465, + "grad_norm": 0.26400405168533325, + "learning_rate": 7.623119621447317e-05, + "loss": 0.1825, + "step": 5700 + }, + { + "epoch": 1.153612629022465, + "eval_loss": 0.27036550641059875, + "eval_runtime": 0.7387, + "eval_samples_per_second": 6.769, + "eval_steps_per_second": 1.354, + "step": 5700 + }, + { + "epoch": 1.1538150172029953, + "grad_norm": 0.3164843022823334, + "learning_rate": 7.620030180670975e-05, + "loss": 0.1982, + "step": 5701 + }, + { + "epoch": 1.1540174053835255, + "grad_norm": 0.25917619466781616, + "learning_rate": 7.616940980675004e-05, + "loss": 0.1793, + "step": 5702 + }, + { + "epoch": 1.154219793564056, + "grad_norm": 0.2807494103908539, + "learning_rate": 7.613852021771939e-05, + "loss": 0.1985, + "step": 5703 + }, + { + "epoch": 1.154422181744586, + "grad_norm": 0.32781749963760376, + "learning_rate": 7.610763304274291e-05, + "loss": 0.2268, + "step": 5704 + }, + { + "epoch": 1.1546245699251163, + "grad_norm": 0.2813950479030609, + "learning_rate": 7.60767482849454e-05, + "loss": 0.1821, + "step": 5705 + }, + { + "epoch": 1.1548269581056467, + "grad_norm": 0.34443584084510803, + "learning_rate": 7.604586594745149e-05, + "loss": 0.2046, + "step": 5706 + }, + { + "epoch": 1.155029346286177, + "grad_norm": 0.3148466646671295, + "learning_rate": 7.601498603338548e-05, + "loss": 0.1884, + "step": 5707 + }, + { + "epoch": 1.1552317344667071, + "grad_norm": 0.29084309935569763, + "learning_rate": 7.598410854587155e-05, + "loss": 0.1952, + "step": 5708 + }, + { + "epoch": 1.1554341226472373, + "grad_norm": 0.2680000364780426, + "learning_rate": 7.595323348803351e-05, + "loss": 0.1603, + "step": 5709 + }, + { + "epoch": 1.1556365108277677, + "grad_norm": 0.32312873005867004, + "learning_rate": 7.592236086299499e-05, + "loss": 0.2157, + "step": 5710 + }, + { + "epoch": 1.155838899008298, + "grad_norm": 0.43009570240974426, + "learning_rate": 7.589149067387934e-05, + "loss": 0.2189, + "step": 5711 + }, + { + "epoch": 1.1560412871888281, + "grad_norm": 0.22385583817958832, + "learning_rate": 7.586062292380971e-05, + "loss": 0.1582, + "step": 5712 + }, + { + "epoch": 1.1562436753693583, + "grad_norm": 0.2577807605266571, + "learning_rate": 7.582975761590901e-05, + "loss": 0.1635, + "step": 5713 + }, + { + "epoch": 1.1564460635498888, + "grad_norm": 0.27913615107536316, + "learning_rate": 7.579889475329975e-05, + "loss": 0.1903, + "step": 5714 + }, + { + "epoch": 1.156648451730419, + "grad_norm": 0.34618043899536133, + "learning_rate": 7.576803433910439e-05, + "loss": 0.1793, + "step": 5715 + }, + { + "epoch": 1.1568508399109492, + "grad_norm": 0.2728598713874817, + "learning_rate": 7.573717637644508e-05, + "loss": 0.2004, + "step": 5716 + }, + { + "epoch": 1.1570532280914794, + "grad_norm": 0.2509997487068176, + "learning_rate": 7.570632086844364e-05, + "loss": 0.1848, + "step": 5717 + }, + { + "epoch": 1.1572556162720098, + "grad_norm": 0.2825949788093567, + "learning_rate": 7.567546781822177e-05, + "loss": 0.1845, + "step": 5718 + }, + { + "epoch": 1.15745800445254, + "grad_norm": 0.3017740547657013, + "learning_rate": 7.564461722890081e-05, + "loss": 0.1882, + "step": 5719 + }, + { + "epoch": 1.1576603926330702, + "grad_norm": 0.25365105271339417, + "learning_rate": 7.561376910360194e-05, + "loss": 0.174, + "step": 5720 + }, + { + "epoch": 1.1578627808136004, + "grad_norm": 0.6366367936134338, + "learning_rate": 7.558292344544603e-05, + "loss": 0.1798, + "step": 5721 + }, + { + "epoch": 1.1580651689941308, + "grad_norm": 0.28383052349090576, + "learning_rate": 7.555208025755372e-05, + "loss": 0.2105, + "step": 5722 + }, + { + "epoch": 1.158267557174661, + "grad_norm": 0.30744868516921997, + "learning_rate": 7.552123954304539e-05, + "loss": 0.1909, + "step": 5723 + }, + { + "epoch": 1.1584699453551912, + "grad_norm": 0.35757702589035034, + "learning_rate": 7.54904013050412e-05, + "loss": 0.2017, + "step": 5724 + }, + { + "epoch": 1.1586723335357214, + "grad_norm": 0.28952452540397644, + "learning_rate": 7.5459565546661e-05, + "loss": 0.177, + "step": 5725 + }, + { + "epoch": 1.1588747217162518, + "grad_norm": 0.3115183115005493, + "learning_rate": 7.54287322710245e-05, + "loss": 0.2081, + "step": 5726 + }, + { + "epoch": 1.159077109896782, + "grad_norm": 0.23546357452869415, + "learning_rate": 7.539790148125103e-05, + "loss": 0.1674, + "step": 5727 + }, + { + "epoch": 1.1592794980773122, + "grad_norm": 0.2618536353111267, + "learning_rate": 7.536707318045972e-05, + "loss": 0.1827, + "step": 5728 + }, + { + "epoch": 1.1594818862578427, + "grad_norm": 0.31117162108421326, + "learning_rate": 7.53362473717695e-05, + "loss": 0.1912, + "step": 5729 + }, + { + "epoch": 1.1596842744383729, + "grad_norm": 0.28290385007858276, + "learning_rate": 7.530542405829897e-05, + "loss": 0.1975, + "step": 5730 + }, + { + "epoch": 1.159886662618903, + "grad_norm": 0.27966201305389404, + "learning_rate": 7.527460324316652e-05, + "loss": 0.1771, + "step": 5731 + }, + { + "epoch": 1.1600890507994333, + "grad_norm": 0.30912506580352783, + "learning_rate": 7.524378492949027e-05, + "loss": 0.2024, + "step": 5732 + }, + { + "epoch": 1.1602914389799635, + "grad_norm": 0.2371935397386551, + "learning_rate": 7.52129691203881e-05, + "loss": 0.1673, + "step": 5733 + }, + { + "epoch": 1.1604938271604939, + "grad_norm": 0.2745453715324402, + "learning_rate": 7.518215581897763e-05, + "loss": 0.1787, + "step": 5734 + }, + { + "epoch": 1.160696215341024, + "grad_norm": 0.3203498125076294, + "learning_rate": 7.51513450283762e-05, + "loss": 0.1605, + "step": 5735 + }, + { + "epoch": 1.1608986035215543, + "grad_norm": 0.313037633895874, + "learning_rate": 7.5120536751701e-05, + "loss": 0.2123, + "step": 5736 + }, + { + "epoch": 1.1611009917020847, + "grad_norm": 0.308227002620697, + "learning_rate": 7.50897309920688e-05, + "loss": 0.1832, + "step": 5737 + }, + { + "epoch": 1.161303379882615, + "grad_norm": 0.34059271216392517, + "learning_rate": 7.505892775259624e-05, + "loss": 0.2515, + "step": 5738 + }, + { + "epoch": 1.161505768063145, + "grad_norm": 0.3082710802555084, + "learning_rate": 7.502812703639966e-05, + "loss": 0.2191, + "step": 5739 + }, + { + "epoch": 1.1617081562436753, + "grad_norm": 0.26456886529922485, + "learning_rate": 7.499732884659516e-05, + "loss": 0.1952, + "step": 5740 + }, + { + "epoch": 1.1619105444242057, + "grad_norm": 0.2999734878540039, + "learning_rate": 7.496653318629857e-05, + "loss": 0.2187, + "step": 5741 + }, + { + "epoch": 1.162112932604736, + "grad_norm": 0.248836487531662, + "learning_rate": 7.493574005862549e-05, + "loss": 0.1693, + "step": 5742 + }, + { + "epoch": 1.1623153207852661, + "grad_norm": 0.2821309268474579, + "learning_rate": 7.490494946669124e-05, + "loss": 0.1903, + "step": 5743 + }, + { + "epoch": 1.1625177089657963, + "grad_norm": 0.3294956684112549, + "learning_rate": 7.487416141361087e-05, + "loss": 0.1664, + "step": 5744 + }, + { + "epoch": 1.1627200971463267, + "grad_norm": 0.2846418619155884, + "learning_rate": 7.48433759024992e-05, + "loss": 0.1768, + "step": 5745 + }, + { + "epoch": 1.162922485326857, + "grad_norm": 0.29694685339927673, + "learning_rate": 7.481259293647081e-05, + "loss": 0.2119, + "step": 5746 + }, + { + "epoch": 1.1631248735073871, + "grad_norm": 0.31481388211250305, + "learning_rate": 7.478181251863998e-05, + "loss": 0.184, + "step": 5747 + }, + { + "epoch": 1.1633272616879173, + "grad_norm": 0.29571497440338135, + "learning_rate": 7.475103465212073e-05, + "loss": 0.1918, + "step": 5748 + }, + { + "epoch": 1.1635296498684478, + "grad_norm": 0.30913057923316956, + "learning_rate": 7.472025934002686e-05, + "loss": 0.2121, + "step": 5749 + }, + { + "epoch": 1.163732038048978, + "grad_norm": 0.40576112270355225, + "learning_rate": 7.468948658547191e-05, + "loss": 0.1983, + "step": 5750 + }, + { + "epoch": 1.163732038048978, + "eval_loss": 0.26926201581954956, + "eval_runtime": 0.7385, + "eval_samples_per_second": 6.77, + "eval_steps_per_second": 1.354, + "step": 5750 + }, + { + "epoch": 1.1639344262295082, + "grad_norm": 0.2734135091304779, + "learning_rate": 7.465871639156912e-05, + "loss": 0.1961, + "step": 5751 + }, + { + "epoch": 1.1641368144100386, + "grad_norm": 0.2998042404651642, + "learning_rate": 7.462794876143151e-05, + "loss": 0.2001, + "step": 5752 + }, + { + "epoch": 1.1643392025905688, + "grad_norm": 0.2595381736755371, + "learning_rate": 7.45971836981718e-05, + "loss": 0.1578, + "step": 5753 + }, + { + "epoch": 1.164541590771099, + "grad_norm": 0.2931159436702728, + "learning_rate": 7.45664212049025e-05, + "loss": 0.1983, + "step": 5754 + }, + { + "epoch": 1.1647439789516292, + "grad_norm": 0.26160353422164917, + "learning_rate": 7.453566128473584e-05, + "loss": 0.1849, + "step": 5755 + }, + { + "epoch": 1.1649463671321594, + "grad_norm": 0.35950490832328796, + "learning_rate": 7.450490394078377e-05, + "loss": 0.1843, + "step": 5756 + }, + { + "epoch": 1.1651487553126898, + "grad_norm": 0.26078858971595764, + "learning_rate": 7.4474149176158e-05, + "loss": 0.1849, + "step": 5757 + }, + { + "epoch": 1.16535114349322, + "grad_norm": 0.29962724447250366, + "learning_rate": 7.444339699397001e-05, + "loss": 0.2012, + "step": 5758 + }, + { + "epoch": 1.1655535316737502, + "grad_norm": 0.2402295470237732, + "learning_rate": 7.441264739733091e-05, + "loss": 0.1849, + "step": 5759 + }, + { + "epoch": 1.1657559198542806, + "grad_norm": 0.33564725518226624, + "learning_rate": 7.438190038935168e-05, + "loss": 0.2195, + "step": 5760 + }, + { + "epoch": 1.1659583080348108, + "grad_norm": 0.25888490676879883, + "learning_rate": 7.435115597314295e-05, + "loss": 0.1859, + "step": 5761 + }, + { + "epoch": 1.166160696215341, + "grad_norm": 0.3285493552684784, + "learning_rate": 7.432041415181513e-05, + "loss": 0.2098, + "step": 5762 + }, + { + "epoch": 1.1663630843958712, + "grad_norm": 0.3084118664264679, + "learning_rate": 7.428967492847836e-05, + "loss": 0.1942, + "step": 5763 + }, + { + "epoch": 1.1665654725764014, + "grad_norm": 0.2995312511920929, + "learning_rate": 7.425893830624248e-05, + "loss": 0.1952, + "step": 5764 + }, + { + "epoch": 1.1667678607569318, + "grad_norm": 0.2527577877044678, + "learning_rate": 7.422820428821716e-05, + "loss": 0.2032, + "step": 5765 + }, + { + "epoch": 1.166970248937462, + "grad_norm": 0.3025054931640625, + "learning_rate": 7.419747287751169e-05, + "loss": 0.2194, + "step": 5766 + }, + { + "epoch": 1.1671726371179922, + "grad_norm": 0.311558336019516, + "learning_rate": 7.416674407723518e-05, + "loss": 0.2092, + "step": 5767 + }, + { + "epoch": 1.1673750252985227, + "grad_norm": 0.2646612226963043, + "learning_rate": 7.413601789049644e-05, + "loss": 0.2167, + "step": 5768 + }, + { + "epoch": 1.1675774134790529, + "grad_norm": 0.284598708152771, + "learning_rate": 7.410529432040401e-05, + "loss": 0.1923, + "step": 5769 + }, + { + "epoch": 1.167779801659583, + "grad_norm": 0.2764895260334015, + "learning_rate": 7.40745733700662e-05, + "loss": 0.1709, + "step": 5770 + }, + { + "epoch": 1.1679821898401133, + "grad_norm": 0.271132230758667, + "learning_rate": 7.404385504259102e-05, + "loss": 0.1925, + "step": 5771 + }, + { + "epoch": 1.1681845780206437, + "grad_norm": 0.2631952464580536, + "learning_rate": 7.401313934108622e-05, + "loss": 0.1893, + "step": 5772 + }, + { + "epoch": 1.1683869662011739, + "grad_norm": 0.2543254494667053, + "learning_rate": 7.398242626865932e-05, + "loss": 0.15, + "step": 5773 + }, + { + "epoch": 1.168589354381704, + "grad_norm": 0.26711201667785645, + "learning_rate": 7.395171582841753e-05, + "loss": 0.1725, + "step": 5774 + }, + { + "epoch": 1.1687917425622343, + "grad_norm": 0.29442912340164185, + "learning_rate": 7.392100802346782e-05, + "loss": 0.2194, + "step": 5775 + }, + { + "epoch": 1.1689941307427647, + "grad_norm": 0.2915812134742737, + "learning_rate": 7.389030285691684e-05, + "loss": 0.2029, + "step": 5776 + }, + { + "epoch": 1.169196518923295, + "grad_norm": 0.3449213206768036, + "learning_rate": 7.385960033187109e-05, + "loss": 0.2279, + "step": 5777 + }, + { + "epoch": 1.169398907103825, + "grad_norm": 0.312273234128952, + "learning_rate": 7.382890045143667e-05, + "loss": 0.2489, + "step": 5778 + }, + { + "epoch": 1.1696012952843553, + "grad_norm": 0.28574004769325256, + "learning_rate": 7.379820321871951e-05, + "loss": 0.1972, + "step": 5779 + }, + { + "epoch": 1.1698036834648857, + "grad_norm": 0.2767406404018402, + "learning_rate": 7.376750863682522e-05, + "loss": 0.1987, + "step": 5780 + }, + { + "epoch": 1.170006071645416, + "grad_norm": 0.26038801670074463, + "learning_rate": 7.373681670885912e-05, + "loss": 0.1837, + "step": 5781 + }, + { + "epoch": 1.1702084598259461, + "grad_norm": 0.2696447968482971, + "learning_rate": 7.370612743792636e-05, + "loss": 0.1758, + "step": 5782 + }, + { + "epoch": 1.1704108480064765, + "grad_norm": 0.2397984117269516, + "learning_rate": 7.36754408271317e-05, + "loss": 0.1637, + "step": 5783 + }, + { + "epoch": 1.1706132361870067, + "grad_norm": 0.25606390833854675, + "learning_rate": 7.364475687957972e-05, + "loss": 0.1731, + "step": 5784 + }, + { + "epoch": 1.170815624367537, + "grad_norm": 0.267202764749527, + "learning_rate": 7.361407559837472e-05, + "loss": 0.1717, + "step": 5785 + }, + { + "epoch": 1.1710180125480671, + "grad_norm": 0.2734069228172302, + "learning_rate": 7.358339698662066e-05, + "loss": 0.2113, + "step": 5786 + }, + { + "epoch": 1.1712204007285973, + "grad_norm": 0.25181517004966736, + "learning_rate": 7.355272104742132e-05, + "loss": 0.1635, + "step": 5787 + }, + { + "epoch": 1.1714227889091278, + "grad_norm": 0.2656586170196533, + "learning_rate": 7.352204778388016e-05, + "loss": 0.1758, + "step": 5788 + }, + { + "epoch": 1.171625177089658, + "grad_norm": 0.2795346677303314, + "learning_rate": 7.349137719910037e-05, + "loss": 0.2048, + "step": 5789 + }, + { + "epoch": 1.1718275652701882, + "grad_norm": 0.3402021527290344, + "learning_rate": 7.346070929618487e-05, + "loss": 0.1918, + "step": 5790 + }, + { + "epoch": 1.1720299534507186, + "grad_norm": 0.3135707378387451, + "learning_rate": 7.343004407823635e-05, + "loss": 0.1981, + "step": 5791 + }, + { + "epoch": 1.1722323416312488, + "grad_norm": 0.3004648685455322, + "learning_rate": 7.339938154835717e-05, + "loss": 0.2041, + "step": 5792 + }, + { + "epoch": 1.172434729811779, + "grad_norm": 0.2892182171344757, + "learning_rate": 7.336872170964943e-05, + "loss": 0.1955, + "step": 5793 + }, + { + "epoch": 1.1726371179923092, + "grad_norm": 0.3182966411113739, + "learning_rate": 7.333806456521501e-05, + "loss": 0.2039, + "step": 5794 + }, + { + "epoch": 1.1728395061728394, + "grad_norm": 0.2533356547355652, + "learning_rate": 7.330741011815545e-05, + "loss": 0.1735, + "step": 5795 + }, + { + "epoch": 1.1730418943533698, + "grad_norm": 0.26306235790252686, + "learning_rate": 7.327675837157206e-05, + "loss": 0.1828, + "step": 5796 + }, + { + "epoch": 1.1732442825339, + "grad_norm": 0.31353071331977844, + "learning_rate": 7.324610932856584e-05, + "loss": 0.2047, + "step": 5797 + }, + { + "epoch": 1.1734466707144302, + "grad_norm": 0.2928347587585449, + "learning_rate": 7.321546299223756e-05, + "loss": 0.1931, + "step": 5798 + }, + { + "epoch": 1.1736490588949606, + "grad_norm": 0.2831988036632538, + "learning_rate": 7.318481936568768e-05, + "loss": 0.1872, + "step": 5799 + }, + { + "epoch": 1.1738514470754908, + "grad_norm": 0.2879961133003235, + "learning_rate": 7.315417845201641e-05, + "loss": 0.2032, + "step": 5800 + }, + { + "epoch": 1.1738514470754908, + "eval_loss": 0.2707487642765045, + "eval_runtime": 0.7405, + "eval_samples_per_second": 6.752, + "eval_steps_per_second": 1.35, + "step": 5800 + }, + { + "epoch": 1.174053835256021, + "grad_norm": 0.27901849150657654, + "learning_rate": 7.312354025432368e-05, + "loss": 0.1909, + "step": 5801 + }, + { + "epoch": 1.1742562234365512, + "grad_norm": 0.2636677026748657, + "learning_rate": 7.309290477570916e-05, + "loss": 0.1768, + "step": 5802 + }, + { + "epoch": 1.1744586116170816, + "grad_norm": 0.2820374071598053, + "learning_rate": 7.306227201927218e-05, + "loss": 0.2055, + "step": 5803 + }, + { + "epoch": 1.1746609997976118, + "grad_norm": 0.32560697197914124, + "learning_rate": 7.303164198811185e-05, + "loss": 0.1841, + "step": 5804 + }, + { + "epoch": 1.174863387978142, + "grad_norm": 0.293254554271698, + "learning_rate": 7.300101468532703e-05, + "loss": 0.2125, + "step": 5805 + }, + { + "epoch": 1.1750657761586722, + "grad_norm": 0.24610738456249237, + "learning_rate": 7.297039011401623e-05, + "loss": 0.2007, + "step": 5806 + }, + { + "epoch": 1.1752681643392027, + "grad_norm": 0.2838497757911682, + "learning_rate": 7.293976827727775e-05, + "loss": 0.1804, + "step": 5807 + }, + { + "epoch": 1.1754705525197329, + "grad_norm": 0.24666501581668854, + "learning_rate": 7.290914917820957e-05, + "loss": 0.1206, + "step": 5808 + }, + { + "epoch": 1.175672940700263, + "grad_norm": 0.26965487003326416, + "learning_rate": 7.287853281990941e-05, + "loss": 0.218, + "step": 5809 + }, + { + "epoch": 1.1758753288807933, + "grad_norm": 0.2837803065776825, + "learning_rate": 7.284791920547472e-05, + "loss": 0.1739, + "step": 5810 + }, + { + "epoch": 1.1760777170613237, + "grad_norm": 0.3394570052623749, + "learning_rate": 7.281730833800266e-05, + "loss": 0.2183, + "step": 5811 + }, + { + "epoch": 1.176280105241854, + "grad_norm": 0.28940433263778687, + "learning_rate": 7.278670022059012e-05, + "loss": 0.1727, + "step": 5812 + }, + { + "epoch": 1.176482493422384, + "grad_norm": 0.3111019432544708, + "learning_rate": 7.27560948563337e-05, + "loss": 0.1958, + "step": 5813 + }, + { + "epoch": 1.1766848816029145, + "grad_norm": 0.26451563835144043, + "learning_rate": 7.272549224832974e-05, + "loss": 0.1971, + "step": 5814 + }, + { + "epoch": 1.1768872697834447, + "grad_norm": 0.29414859414100647, + "learning_rate": 7.269489239967429e-05, + "loss": 0.1824, + "step": 5815 + }, + { + "epoch": 1.177089657963975, + "grad_norm": 0.2569786608219147, + "learning_rate": 7.266429531346313e-05, + "loss": 0.1561, + "step": 5816 + }, + { + "epoch": 1.177292046144505, + "grad_norm": 0.3070293962955475, + "learning_rate": 7.263370099279172e-05, + "loss": 0.2182, + "step": 5817 + }, + { + "epoch": 1.1774944343250353, + "grad_norm": 0.3086085617542267, + "learning_rate": 7.26031094407553e-05, + "loss": 0.2163, + "step": 5818 + }, + { + "epoch": 1.1776968225055657, + "grad_norm": 0.9129588603973389, + "learning_rate": 7.257252066044878e-05, + "loss": 0.1985, + "step": 5819 + }, + { + "epoch": 1.177899210686096, + "grad_norm": 0.34575098752975464, + "learning_rate": 7.254193465496683e-05, + "loss": 0.1907, + "step": 5820 + }, + { + "epoch": 1.1781015988666261, + "grad_norm": 0.3244081437587738, + "learning_rate": 7.251135142740384e-05, + "loss": 0.1941, + "step": 5821 + }, + { + "epoch": 1.1783039870471566, + "grad_norm": 0.2724374532699585, + "learning_rate": 7.248077098085386e-05, + "loss": 0.174, + "step": 5822 + }, + { + "epoch": 1.1785063752276868, + "grad_norm": 0.3696213662624359, + "learning_rate": 7.245019331841072e-05, + "loss": 0.1876, + "step": 5823 + }, + { + "epoch": 1.178708763408217, + "grad_norm": 0.26547983288764954, + "learning_rate": 7.241961844316793e-05, + "loss": 0.1784, + "step": 5824 + }, + { + "epoch": 1.1789111515887472, + "grad_norm": 0.32802996039390564, + "learning_rate": 7.238904635821882e-05, + "loss": 0.2381, + "step": 5825 + }, + { + "epoch": 1.1791135397692774, + "grad_norm": 0.29037603735923767, + "learning_rate": 7.235847706665621e-05, + "loss": 0.2041, + "step": 5826 + }, + { + "epoch": 1.1793159279498078, + "grad_norm": 0.2806302607059479, + "learning_rate": 7.232791057157287e-05, + "loss": 0.1926, + "step": 5827 + }, + { + "epoch": 1.179518316130338, + "grad_norm": 0.3043530583381653, + "learning_rate": 7.229734687606118e-05, + "loss": 0.2091, + "step": 5828 + }, + { + "epoch": 1.1797207043108682, + "grad_norm": 0.30080434679985046, + "learning_rate": 7.226678598321324e-05, + "loss": 0.1938, + "step": 5829 + }, + { + "epoch": 1.1799230924913986, + "grad_norm": 0.3069448471069336, + "learning_rate": 7.223622789612088e-05, + "loss": 0.1975, + "step": 5830 + }, + { + "epoch": 1.1801254806719288, + "grad_norm": 0.27904027700424194, + "learning_rate": 7.220567261787567e-05, + "loss": 0.1966, + "step": 5831 + }, + { + "epoch": 1.180327868852459, + "grad_norm": 0.33528608083724976, + "learning_rate": 7.217512015156886e-05, + "loss": 0.1983, + "step": 5832 + }, + { + "epoch": 1.1805302570329892, + "grad_norm": 0.3564962148666382, + "learning_rate": 7.214457050029144e-05, + "loss": 0.1881, + "step": 5833 + }, + { + "epoch": 1.1807326452135196, + "grad_norm": 0.2963548004627228, + "learning_rate": 7.211402366713408e-05, + "loss": 0.2, + "step": 5834 + }, + { + "epoch": 1.1809350333940498, + "grad_norm": 0.26449206471443176, + "learning_rate": 7.208347965518723e-05, + "loss": 0.1658, + "step": 5835 + }, + { + "epoch": 1.18113742157458, + "grad_norm": 0.2890670597553253, + "learning_rate": 7.205293846754095e-05, + "loss": 0.1917, + "step": 5836 + }, + { + "epoch": 1.1813398097551102, + "grad_norm": 0.2905254662036896, + "learning_rate": 7.202240010728514e-05, + "loss": 0.2009, + "step": 5837 + }, + { + "epoch": 1.1815421979356406, + "grad_norm": 0.2885470986366272, + "learning_rate": 7.19918645775093e-05, + "loss": 0.1886, + "step": 5838 + }, + { + "epoch": 1.1817445861161708, + "grad_norm": 0.2656796872615814, + "learning_rate": 7.196133188130272e-05, + "loss": 0.1747, + "step": 5839 + }, + { + "epoch": 1.181946974296701, + "grad_norm": 0.28297480940818787, + "learning_rate": 7.19308020217544e-05, + "loss": 0.184, + "step": 5840 + }, + { + "epoch": 1.1821493624772312, + "grad_norm": 0.2764962911605835, + "learning_rate": 7.190027500195297e-05, + "loss": 0.179, + "step": 5841 + }, + { + "epoch": 1.1823517506577617, + "grad_norm": 0.28837940096855164, + "learning_rate": 7.186975082498689e-05, + "loss": 0.2043, + "step": 5842 + }, + { + "epoch": 1.1825541388382919, + "grad_norm": 0.41320478916168213, + "learning_rate": 7.183922949394424e-05, + "loss": 0.2055, + "step": 5843 + }, + { + "epoch": 1.182756527018822, + "grad_norm": 0.32843437790870667, + "learning_rate": 7.180871101191287e-05, + "loss": 0.2397, + "step": 5844 + }, + { + "epoch": 1.1829589151993525, + "grad_norm": 0.2618347704410553, + "learning_rate": 7.17781953819803e-05, + "loss": 0.1995, + "step": 5845 + }, + { + "epoch": 1.1831613033798827, + "grad_norm": 0.30906206369400024, + "learning_rate": 7.174768260723382e-05, + "loss": 0.1996, + "step": 5846 + }, + { + "epoch": 1.1833636915604129, + "grad_norm": 0.30363285541534424, + "learning_rate": 7.171717269076036e-05, + "loss": 0.2062, + "step": 5847 + }, + { + "epoch": 1.183566079740943, + "grad_norm": 0.3194428086280823, + "learning_rate": 7.168666563564661e-05, + "loss": 0.1685, + "step": 5848 + }, + { + "epoch": 1.1837684679214733, + "grad_norm": 0.2629626393318176, + "learning_rate": 7.16561614449789e-05, + "loss": 0.193, + "step": 5849 + }, + { + "epoch": 1.1839708561020037, + "grad_norm": 0.34626123309135437, + "learning_rate": 7.16256601218434e-05, + "loss": 0.2522, + "step": 5850 + }, + { + "epoch": 1.1839708561020037, + "eval_loss": 0.2733302414417267, + "eval_runtime": 0.7396, + "eval_samples_per_second": 6.761, + "eval_steps_per_second": 1.352, + "step": 5850 + }, + { + "epoch": 1.184173244282534, + "grad_norm": 0.3396178185939789, + "learning_rate": 7.159516166932587e-05, + "loss": 0.2486, + "step": 5851 + }, + { + "epoch": 1.184375632463064, + "grad_norm": 0.2949715256690979, + "learning_rate": 7.156466609051182e-05, + "loss": 0.1871, + "step": 5852 + }, + { + "epoch": 1.1845780206435945, + "grad_norm": 0.3098287284374237, + "learning_rate": 7.153417338848651e-05, + "loss": 0.1727, + "step": 5853 + }, + { + "epoch": 1.1847804088241247, + "grad_norm": 0.2525773048400879, + "learning_rate": 7.150368356633484e-05, + "loss": 0.1943, + "step": 5854 + }, + { + "epoch": 1.184982797004655, + "grad_norm": 0.31998711824417114, + "learning_rate": 7.147319662714145e-05, + "loss": 0.2131, + "step": 5855 + }, + { + "epoch": 1.1851851851851851, + "grad_norm": 0.36343008279800415, + "learning_rate": 7.144271257399068e-05, + "loss": 0.1942, + "step": 5856 + }, + { + "epoch": 1.1853875733657153, + "grad_norm": 0.32088908553123474, + "learning_rate": 7.141223140996663e-05, + "loss": 0.1923, + "step": 5857 + }, + { + "epoch": 1.1855899615462457, + "grad_norm": 0.27131387591362, + "learning_rate": 7.138175313815302e-05, + "loss": 0.176, + "step": 5858 + }, + { + "epoch": 1.185792349726776, + "grad_norm": 0.24319404363632202, + "learning_rate": 7.135127776163336e-05, + "loss": 0.1786, + "step": 5859 + }, + { + "epoch": 1.1859947379073061, + "grad_norm": 0.26026079058647156, + "learning_rate": 7.132080528349078e-05, + "loss": 0.199, + "step": 5860 + }, + { + "epoch": 1.1861971260878366, + "grad_norm": 0.2787209153175354, + "learning_rate": 7.129033570680821e-05, + "loss": 0.2177, + "step": 5861 + }, + { + "epoch": 1.1863995142683668, + "grad_norm": 0.28949227929115295, + "learning_rate": 7.125986903466823e-05, + "loss": 0.2146, + "step": 5862 + }, + { + "epoch": 1.186601902448897, + "grad_norm": 0.2977891266345978, + "learning_rate": 7.122940527015314e-05, + "loss": 0.198, + "step": 5863 + }, + { + "epoch": 1.1868042906294272, + "grad_norm": 0.33024102449417114, + "learning_rate": 7.119894441634494e-05, + "loss": 0.198, + "step": 5864 + }, + { + "epoch": 1.1870066788099576, + "grad_norm": 0.29550671577453613, + "learning_rate": 7.116848647632532e-05, + "loss": 0.2051, + "step": 5865 + }, + { + "epoch": 1.1872090669904878, + "grad_norm": 0.2810512185096741, + "learning_rate": 7.113803145317573e-05, + "loss": 0.1662, + "step": 5866 + }, + { + "epoch": 1.187411455171018, + "grad_norm": 0.2707330882549286, + "learning_rate": 7.110757934997726e-05, + "loss": 0.1954, + "step": 5867 + }, + { + "epoch": 1.1876138433515482, + "grad_norm": 0.2579241394996643, + "learning_rate": 7.107713016981075e-05, + "loss": 0.1824, + "step": 5868 + }, + { + "epoch": 1.1878162315320786, + "grad_norm": 0.23849055171012878, + "learning_rate": 7.104668391575677e-05, + "loss": 0.2018, + "step": 5869 + }, + { + "epoch": 1.1880186197126088, + "grad_norm": 0.23894186317920685, + "learning_rate": 7.101624059089547e-05, + "loss": 0.15, + "step": 5870 + }, + { + "epoch": 1.188221007893139, + "grad_norm": 0.29739347100257874, + "learning_rate": 7.098580019830681e-05, + "loss": 0.2244, + "step": 5871 + }, + { + "epoch": 1.1884233960736692, + "grad_norm": 0.32754260301589966, + "learning_rate": 7.095536274107046e-05, + "loss": 0.1911, + "step": 5872 + }, + { + "epoch": 1.1886257842541996, + "grad_norm": 0.3321670889854431, + "learning_rate": 7.092492822226573e-05, + "loss": 0.1978, + "step": 5873 + }, + { + "epoch": 1.1888281724347298, + "grad_norm": 0.2944977283477783, + "learning_rate": 7.089449664497169e-05, + "loss": 0.1975, + "step": 5874 + }, + { + "epoch": 1.18903056061526, + "grad_norm": 0.30659884214401245, + "learning_rate": 7.086406801226709e-05, + "loss": 0.1958, + "step": 5875 + }, + { + "epoch": 1.1892329487957904, + "grad_norm": 0.2836208939552307, + "learning_rate": 7.083364232723035e-05, + "loss": 0.1985, + "step": 5876 + }, + { + "epoch": 1.1894353369763206, + "grad_norm": 0.3166012763977051, + "learning_rate": 7.080321959293964e-05, + "loss": 0.2182, + "step": 5877 + }, + { + "epoch": 1.1896377251568508, + "grad_norm": 0.28273382782936096, + "learning_rate": 7.077279981247282e-05, + "loss": 0.1937, + "step": 5878 + }, + { + "epoch": 1.189840113337381, + "grad_norm": 0.3097810447216034, + "learning_rate": 7.074238298890744e-05, + "loss": 0.2273, + "step": 5879 + }, + { + "epoch": 1.1900425015179112, + "grad_norm": 0.3035329580307007, + "learning_rate": 7.071196912532075e-05, + "loss": 0.2155, + "step": 5880 + }, + { + "epoch": 1.1902448896984417, + "grad_norm": 0.2822204828262329, + "learning_rate": 7.06815582247897e-05, + "loss": 0.1871, + "step": 5881 + }, + { + "epoch": 1.1904472778789719, + "grad_norm": 0.3719690442085266, + "learning_rate": 7.065115029039097e-05, + "loss": 0.2051, + "step": 5882 + }, + { + "epoch": 1.190649666059502, + "grad_norm": 0.28150033950805664, + "learning_rate": 7.062074532520089e-05, + "loss": 0.217, + "step": 5883 + }, + { + "epoch": 1.1908520542400325, + "grad_norm": 0.2478693574666977, + "learning_rate": 7.059034333229552e-05, + "loss": 0.1606, + "step": 5884 + }, + { + "epoch": 1.1910544424205627, + "grad_norm": 0.35104045271873474, + "learning_rate": 7.055994431475064e-05, + "loss": 0.1931, + "step": 5885 + }, + { + "epoch": 1.1912568306010929, + "grad_norm": 0.31746405363082886, + "learning_rate": 7.052954827564167e-05, + "loss": 0.2154, + "step": 5886 + }, + { + "epoch": 1.191459218781623, + "grad_norm": 0.298566997051239, + "learning_rate": 7.049915521804377e-05, + "loss": 0.2133, + "step": 5887 + }, + { + "epoch": 1.1916616069621533, + "grad_norm": 0.2977025508880615, + "learning_rate": 7.04687651450318e-05, + "loss": 0.1913, + "step": 5888 + }, + { + "epoch": 1.1918639951426837, + "grad_norm": 0.2290477752685547, + "learning_rate": 7.043837805968027e-05, + "loss": 0.1734, + "step": 5889 + }, + { + "epoch": 1.192066383323214, + "grad_norm": 0.2559657394886017, + "learning_rate": 7.040799396506351e-05, + "loss": 0.1614, + "step": 5890 + }, + { + "epoch": 1.192268771503744, + "grad_norm": 0.28855013847351074, + "learning_rate": 7.037761286425543e-05, + "loss": 0.2001, + "step": 5891 + }, + { + "epoch": 1.1924711596842745, + "grad_norm": 0.28611060976982117, + "learning_rate": 7.034723476032965e-05, + "loss": 0.2088, + "step": 5892 + }, + { + "epoch": 1.1926735478648047, + "grad_norm": 0.2739086151123047, + "learning_rate": 7.031685965635948e-05, + "loss": 0.1936, + "step": 5893 + }, + { + "epoch": 1.192875936045335, + "grad_norm": 0.36974024772644043, + "learning_rate": 7.0286487555418e-05, + "loss": 0.2255, + "step": 5894 + }, + { + "epoch": 1.1930783242258651, + "grad_norm": 0.33781471848487854, + "learning_rate": 7.025611846057794e-05, + "loss": 0.2057, + "step": 5895 + }, + { + "epoch": 1.1932807124063955, + "grad_norm": 0.23322023451328278, + "learning_rate": 7.022575237491171e-05, + "loss": 0.1592, + "step": 5896 + }, + { + "epoch": 1.1934831005869257, + "grad_norm": 0.33305227756500244, + "learning_rate": 7.019538930149144e-05, + "loss": 0.2014, + "step": 5897 + }, + { + "epoch": 1.193685488767456, + "grad_norm": 0.3045668303966522, + "learning_rate": 7.016502924338892e-05, + "loss": 0.1735, + "step": 5898 + }, + { + "epoch": 1.1938878769479861, + "grad_norm": 0.26250389218330383, + "learning_rate": 7.013467220367571e-05, + "loss": 0.1937, + "step": 5899 + }, + { + "epoch": 1.1940902651285166, + "grad_norm": 0.2677670419216156, + "learning_rate": 7.010431818542297e-05, + "loss": 0.1967, + "step": 5900 + }, + { + "epoch": 1.1940902651285166, + "eval_loss": 0.27170634269714355, + "eval_runtime": 0.7392, + "eval_samples_per_second": 6.764, + "eval_steps_per_second": 1.353, + "step": 5900 + }, + { + "epoch": 1.1942926533090468, + "grad_norm": 0.32133355736732483, + "learning_rate": 7.007396719170165e-05, + "loss": 0.1986, + "step": 5901 + }, + { + "epoch": 1.194495041489577, + "grad_norm": 0.2636141777038574, + "learning_rate": 7.004361922558232e-05, + "loss": 0.1639, + "step": 5902 + }, + { + "epoch": 1.1946974296701072, + "grad_norm": 0.27497488260269165, + "learning_rate": 7.001327429013525e-05, + "loss": 0.2042, + "step": 5903 + }, + { + "epoch": 1.1948998178506376, + "grad_norm": 0.29585936665534973, + "learning_rate": 6.998293238843044e-05, + "loss": 0.1888, + "step": 5904 + }, + { + "epoch": 1.1951022060311678, + "grad_norm": 0.2776990830898285, + "learning_rate": 6.995259352353758e-05, + "loss": 0.2039, + "step": 5905 + }, + { + "epoch": 1.195304594211698, + "grad_norm": 0.2712746262550354, + "learning_rate": 6.992225769852601e-05, + "loss": 0.17, + "step": 5906 + }, + { + "epoch": 1.1955069823922284, + "grad_norm": 0.3972209095954895, + "learning_rate": 6.989192491646481e-05, + "loss": 0.2051, + "step": 5907 + }, + { + "epoch": 1.1957093705727586, + "grad_norm": 0.2815692126750946, + "learning_rate": 6.986159518042273e-05, + "loss": 0.1901, + "step": 5908 + }, + { + "epoch": 1.1959117587532888, + "grad_norm": 0.32967308163642883, + "learning_rate": 6.983126849346821e-05, + "loss": 0.2278, + "step": 5909 + }, + { + "epoch": 1.196114146933819, + "grad_norm": 0.2739197313785553, + "learning_rate": 6.980094485866938e-05, + "loss": 0.1795, + "step": 5910 + }, + { + "epoch": 1.1963165351143492, + "grad_norm": 0.2594100832939148, + "learning_rate": 6.977062427909408e-05, + "loss": 0.1745, + "step": 5911 + }, + { + "epoch": 1.1965189232948796, + "grad_norm": 0.27367496490478516, + "learning_rate": 6.974030675780982e-05, + "loss": 0.1788, + "step": 5912 + }, + { + "epoch": 1.1967213114754098, + "grad_norm": 0.35172849893569946, + "learning_rate": 6.970999229788381e-05, + "loss": 0.2273, + "step": 5913 + }, + { + "epoch": 1.19692369965594, + "grad_norm": 0.3972116708755493, + "learning_rate": 6.967968090238297e-05, + "loss": 0.1738, + "step": 5914 + }, + { + "epoch": 1.1971260878364705, + "grad_norm": 0.24987083673477173, + "learning_rate": 6.964937257437386e-05, + "loss": 0.1734, + "step": 5915 + }, + { + "epoch": 1.1973284760170007, + "grad_norm": 0.27676859498023987, + "learning_rate": 6.961906731692276e-05, + "loss": 0.1973, + "step": 5916 + }, + { + "epoch": 1.1975308641975309, + "grad_norm": 0.2857378423213959, + "learning_rate": 6.958876513309565e-05, + "loss": 0.1837, + "step": 5917 + }, + { + "epoch": 1.197733252378061, + "grad_norm": 0.36634954810142517, + "learning_rate": 6.955846602595817e-05, + "loss": 0.2126, + "step": 5918 + }, + { + "epoch": 1.1979356405585915, + "grad_norm": 0.2530911862850189, + "learning_rate": 6.952816999857567e-05, + "loss": 0.1965, + "step": 5919 + }, + { + "epoch": 1.1981380287391217, + "grad_norm": 0.27030566334724426, + "learning_rate": 6.949787705401321e-05, + "loss": 0.1827, + "step": 5920 + }, + { + "epoch": 1.1983404169196519, + "grad_norm": 0.2770173251628876, + "learning_rate": 6.946758719533549e-05, + "loss": 0.1862, + "step": 5921 + }, + { + "epoch": 1.198542805100182, + "grad_norm": 0.2795921564102173, + "learning_rate": 6.943730042560692e-05, + "loss": 0.2097, + "step": 5922 + }, + { + "epoch": 1.1987451932807125, + "grad_norm": 0.3575659990310669, + "learning_rate": 6.940701674789162e-05, + "loss": 0.2033, + "step": 5923 + }, + { + "epoch": 1.1989475814612427, + "grad_norm": 0.30576780438423157, + "learning_rate": 6.937673616525334e-05, + "loss": 0.1951, + "step": 5924 + }, + { + "epoch": 1.199149969641773, + "grad_norm": 0.3140013515949249, + "learning_rate": 6.934645868075558e-05, + "loss": 0.1944, + "step": 5925 + }, + { + "epoch": 1.199352357822303, + "grad_norm": 0.3479853570461273, + "learning_rate": 6.931618429746147e-05, + "loss": 0.1831, + "step": 5926 + }, + { + "epoch": 1.1995547460028335, + "grad_norm": 0.26819106936454773, + "learning_rate": 6.928591301843389e-05, + "loss": 0.1886, + "step": 5927 + }, + { + "epoch": 1.1997571341833637, + "grad_norm": 0.2444477528333664, + "learning_rate": 6.925564484673534e-05, + "loss": 0.1746, + "step": 5928 + }, + { + "epoch": 1.199959522363894, + "grad_norm": 0.2941974103450775, + "learning_rate": 6.922537978542804e-05, + "loss": 0.1963, + "step": 5929 + }, + { + "epoch": 1.2001619105444241, + "grad_norm": 0.28198763728141785, + "learning_rate": 6.91951178375739e-05, + "loss": 0.192, + "step": 5930 + }, + { + "epoch": 1.2003642987249545, + "grad_norm": 0.3067132830619812, + "learning_rate": 6.916485900623453e-05, + "loss": 0.2116, + "step": 5931 + }, + { + "epoch": 1.2005666869054847, + "grad_norm": 0.2618827521800995, + "learning_rate": 6.913460329447116e-05, + "loss": 0.1794, + "step": 5932 + }, + { + "epoch": 1.200769075086015, + "grad_norm": 0.2466832846403122, + "learning_rate": 6.910435070534475e-05, + "loss": 0.1855, + "step": 5933 + }, + { + "epoch": 1.2009714632665451, + "grad_norm": 0.28418102860450745, + "learning_rate": 6.907410124191598e-05, + "loss": 0.1869, + "step": 5934 + }, + { + "epoch": 1.2011738514470756, + "grad_norm": 0.26387548446655273, + "learning_rate": 6.904385490724512e-05, + "loss": 0.1716, + "step": 5935 + }, + { + "epoch": 1.2013762396276058, + "grad_norm": 0.3512365221977234, + "learning_rate": 6.901361170439223e-05, + "loss": 0.2098, + "step": 5936 + }, + { + "epoch": 1.201578627808136, + "grad_norm": 0.28574854135513306, + "learning_rate": 6.898337163641695e-05, + "loss": 0.1929, + "step": 5937 + }, + { + "epoch": 1.2017810159886664, + "grad_norm": 0.28817689418792725, + "learning_rate": 6.895313470637868e-05, + "loss": 0.2125, + "step": 5938 + }, + { + "epoch": 1.2019834041691966, + "grad_norm": 0.2598557770252228, + "learning_rate": 6.892290091733646e-05, + "loss": 0.1669, + "step": 5939 + }, + { + "epoch": 1.2021857923497268, + "grad_norm": 0.29402410984039307, + "learning_rate": 6.889267027234905e-05, + "loss": 0.2113, + "step": 5940 + }, + { + "epoch": 1.202388180530257, + "grad_norm": 0.283035546541214, + "learning_rate": 6.886244277447485e-05, + "loss": 0.1857, + "step": 5941 + }, + { + "epoch": 1.2025905687107872, + "grad_norm": 0.2820853888988495, + "learning_rate": 6.883221842677196e-05, + "loss": 0.1733, + "step": 5942 + }, + { + "epoch": 1.2027929568913176, + "grad_norm": 0.3180203139781952, + "learning_rate": 6.880199723229817e-05, + "loss": 0.1972, + "step": 5943 + }, + { + "epoch": 1.2029953450718478, + "grad_norm": 0.31403082609176636, + "learning_rate": 6.877177919411095e-05, + "loss": 0.2062, + "step": 5944 + }, + { + "epoch": 1.203197733252378, + "grad_norm": 0.2898111045360565, + "learning_rate": 6.874156431526743e-05, + "loss": 0.189, + "step": 5945 + }, + { + "epoch": 1.2034001214329084, + "grad_norm": 0.3071668744087219, + "learning_rate": 6.871135259882445e-05, + "loss": 0.1931, + "step": 5946 + }, + { + "epoch": 1.2036025096134386, + "grad_norm": 0.33244362473487854, + "learning_rate": 6.868114404783849e-05, + "loss": 0.2055, + "step": 5947 + }, + { + "epoch": 1.2038048977939688, + "grad_norm": 0.3338499963283539, + "learning_rate": 6.865093866536576e-05, + "loss": 0.208, + "step": 5948 + }, + { + "epoch": 1.204007285974499, + "grad_norm": 0.27435195446014404, + "learning_rate": 6.862073645446211e-05, + "loss": 0.2154, + "step": 5949 + }, + { + "epoch": 1.2042096741550294, + "grad_norm": 0.3393666744232178, + "learning_rate": 6.859053741818311e-05, + "loss": 0.2009, + "step": 5950 + }, + { + "epoch": 1.2042096741550294, + "eval_loss": 0.2669513523578644, + "eval_runtime": 0.7388, + "eval_samples_per_second": 6.768, + "eval_steps_per_second": 1.354, + "step": 5950 + }, + { + "epoch": 1.2044120623355596, + "grad_norm": 0.31124457716941833, + "learning_rate": 6.856034155958394e-05, + "loss": 0.1991, + "step": 5951 + }, + { + "epoch": 1.2046144505160898, + "grad_norm": 0.2741898000240326, + "learning_rate": 6.853014888171952e-05, + "loss": 0.193, + "step": 5952 + }, + { + "epoch": 1.20481683869662, + "grad_norm": 0.2568211257457733, + "learning_rate": 6.849995938764442e-05, + "loss": 0.1955, + "step": 5953 + }, + { + "epoch": 1.2050192268771505, + "grad_norm": 0.3031553626060486, + "learning_rate": 6.846977308041292e-05, + "loss": 0.1921, + "step": 5954 + }, + { + "epoch": 1.2052216150576807, + "grad_norm": 0.48870640993118286, + "learning_rate": 6.843958996307892e-05, + "loss": 0.2202, + "step": 5955 + }, + { + "epoch": 1.2054240032382109, + "grad_norm": 0.2706666588783264, + "learning_rate": 6.840941003869606e-05, + "loss": 0.1782, + "step": 5956 + }, + { + "epoch": 1.205626391418741, + "grad_norm": 0.2780493199825287, + "learning_rate": 6.83792333103176e-05, + "loss": 0.2277, + "step": 5957 + }, + { + "epoch": 1.2058287795992715, + "grad_norm": 0.3681378960609436, + "learning_rate": 6.834905978099655e-05, + "loss": 0.1998, + "step": 5958 + }, + { + "epoch": 1.2060311677798017, + "grad_norm": 0.3750686049461365, + "learning_rate": 6.83188894537855e-05, + "loss": 0.2168, + "step": 5959 + }, + { + "epoch": 1.2062335559603319, + "grad_norm": 0.31940484046936035, + "learning_rate": 6.82887223317368e-05, + "loss": 0.1864, + "step": 5960 + }, + { + "epoch": 1.206435944140862, + "grad_norm": 0.24824610352516174, + "learning_rate": 6.825855841790242e-05, + "loss": 0.1506, + "step": 5961 + }, + { + "epoch": 1.2066383323213925, + "grad_norm": 0.2760399580001831, + "learning_rate": 6.822839771533405e-05, + "loss": 0.1842, + "step": 5962 + }, + { + "epoch": 1.2068407205019227, + "grad_norm": 0.2778237462043762, + "learning_rate": 6.8198240227083e-05, + "loss": 0.2034, + "step": 5963 + }, + { + "epoch": 1.207043108682453, + "grad_norm": 0.27293694019317627, + "learning_rate": 6.816808595620034e-05, + "loss": 0.2083, + "step": 5964 + }, + { + "epoch": 1.207245496862983, + "grad_norm": 0.25446030497550964, + "learning_rate": 6.813793490573672e-05, + "loss": 0.1804, + "step": 5965 + }, + { + "epoch": 1.2074478850435135, + "grad_norm": 0.25617870688438416, + "learning_rate": 6.810778707874253e-05, + "loss": 0.1838, + "step": 5966 + }, + { + "epoch": 1.2076502732240437, + "grad_norm": 0.330093652009964, + "learning_rate": 6.80776424782678e-05, + "loss": 0.2248, + "step": 5967 + }, + { + "epoch": 1.207852661404574, + "grad_norm": 0.3035315275192261, + "learning_rate": 6.804750110736224e-05, + "loss": 0.2039, + "step": 5968 + }, + { + "epoch": 1.2080550495851043, + "grad_norm": 0.2813340425491333, + "learning_rate": 6.801736296907524e-05, + "loss": 0.2072, + "step": 5969 + }, + { + "epoch": 1.2082574377656345, + "grad_norm": 0.30979204177856445, + "learning_rate": 6.798722806645588e-05, + "loss": 0.2057, + "step": 5970 + }, + { + "epoch": 1.2084598259461647, + "grad_norm": 0.273605078458786, + "learning_rate": 6.795709640255286e-05, + "loss": 0.1723, + "step": 5971 + }, + { + "epoch": 1.208662214126695, + "grad_norm": 0.2786939740180969, + "learning_rate": 6.792696798041463e-05, + "loss": 0.1982, + "step": 5972 + }, + { + "epoch": 1.2088646023072251, + "grad_norm": 0.2765547037124634, + "learning_rate": 6.789684280308922e-05, + "loss": 0.1726, + "step": 5973 + }, + { + "epoch": 1.2090669904877556, + "grad_norm": 0.2661789357662201, + "learning_rate": 6.78667208736244e-05, + "loss": 0.1907, + "step": 5974 + }, + { + "epoch": 1.2092693786682858, + "grad_norm": 0.276862770318985, + "learning_rate": 6.78366021950676e-05, + "loss": 0.1961, + "step": 5975 + }, + { + "epoch": 1.209471766848816, + "grad_norm": 0.36788874864578247, + "learning_rate": 6.78064867704659e-05, + "loss": 0.2106, + "step": 5976 + }, + { + "epoch": 1.2096741550293464, + "grad_norm": 0.3060303032398224, + "learning_rate": 6.777637460286607e-05, + "loss": 0.1976, + "step": 5977 + }, + { + "epoch": 1.2098765432098766, + "grad_norm": 0.4363291263580322, + "learning_rate": 6.774626569531453e-05, + "loss": 0.1675, + "step": 5978 + }, + { + "epoch": 1.2100789313904068, + "grad_norm": 0.26958876848220825, + "learning_rate": 6.771616005085739e-05, + "loss": 0.1882, + "step": 5979 + }, + { + "epoch": 1.210281319570937, + "grad_norm": 0.34765639901161194, + "learning_rate": 6.768605767254048e-05, + "loss": 0.2025, + "step": 5980 + }, + { + "epoch": 1.2104837077514674, + "grad_norm": 0.2938627302646637, + "learning_rate": 6.765595856340914e-05, + "loss": 0.172, + "step": 5981 + }, + { + "epoch": 1.2106860959319976, + "grad_norm": 0.3988211452960968, + "learning_rate": 6.762586272650854e-05, + "loss": 0.2233, + "step": 5982 + }, + { + "epoch": 1.2108884841125278, + "grad_norm": 0.30394846200942993, + "learning_rate": 6.759577016488343e-05, + "loss": 0.2043, + "step": 5983 + }, + { + "epoch": 1.211090872293058, + "grad_norm": 0.26461684703826904, + "learning_rate": 6.756568088157829e-05, + "loss": 0.2121, + "step": 5984 + }, + { + "epoch": 1.2112932604735884, + "grad_norm": 0.3332306146621704, + "learning_rate": 6.753559487963723e-05, + "loss": 0.1988, + "step": 5985 + }, + { + "epoch": 1.2114956486541186, + "grad_norm": 0.2860182523727417, + "learning_rate": 6.750551216210404e-05, + "loss": 0.2115, + "step": 5986 + }, + { + "epoch": 1.2116980368346488, + "grad_norm": 0.30926713347435, + "learning_rate": 6.747543273202216e-05, + "loss": 0.1919, + "step": 5987 + }, + { + "epoch": 1.211900425015179, + "grad_norm": 0.30893874168395996, + "learning_rate": 6.744535659243473e-05, + "loss": 0.1935, + "step": 5988 + }, + { + "epoch": 1.2121028131957094, + "grad_norm": 0.2799331843852997, + "learning_rate": 6.741528374638453e-05, + "loss": 0.1952, + "step": 5989 + }, + { + "epoch": 1.2123052013762396, + "grad_norm": 0.2534390687942505, + "learning_rate": 6.7385214196914e-05, + "loss": 0.1648, + "step": 5990 + }, + { + "epoch": 1.2125075895567698, + "grad_norm": 0.2808387279510498, + "learning_rate": 6.735514794706528e-05, + "loss": 0.1927, + "step": 5991 + }, + { + "epoch": 1.2127099777373, + "grad_norm": 0.29062768816947937, + "learning_rate": 6.732508499988015e-05, + "loss": 0.2046, + "step": 5992 + }, + { + "epoch": 1.2129123659178305, + "grad_norm": 0.2915301024913788, + "learning_rate": 6.729502535840007e-05, + "loss": 0.1989, + "step": 5993 + }, + { + "epoch": 1.2131147540983607, + "grad_norm": 0.27135908603668213, + "learning_rate": 6.726496902566616e-05, + "loss": 0.1998, + "step": 5994 + }, + { + "epoch": 1.2133171422788909, + "grad_norm": 0.28507882356643677, + "learning_rate": 6.723491600471919e-05, + "loss": 0.1819, + "step": 5995 + }, + { + "epoch": 1.213519530459421, + "grad_norm": 0.2728084623813629, + "learning_rate": 6.720486629859963e-05, + "loss": 0.2034, + "step": 5996 + }, + { + "epoch": 1.2137219186399515, + "grad_norm": 0.2556982934474945, + "learning_rate": 6.717481991034757e-05, + "loss": 0.196, + "step": 5997 + }, + { + "epoch": 1.2139243068204817, + "grad_norm": 0.2836724519729614, + "learning_rate": 6.71447768430028e-05, + "loss": 0.1905, + "step": 5998 + }, + { + "epoch": 1.214126695001012, + "grad_norm": 0.3173321783542633, + "learning_rate": 6.711473709960478e-05, + "loss": 0.2118, + "step": 5999 + }, + { + "epoch": 1.2143290831815423, + "grad_norm": 0.28240132331848145, + "learning_rate": 6.708470068319258e-05, + "loss": 0.1941, + "step": 6000 + }, + { + "epoch": 1.2143290831815423, + "eval_loss": 0.2702457010746002, + "eval_runtime": 0.7379, + "eval_samples_per_second": 6.776, + "eval_steps_per_second": 1.355, + "step": 6000 + }, + { + "epoch": 1.2145314713620725, + "grad_norm": 0.25992351770401, + "learning_rate": 6.7054667596805e-05, + "loss": 0.1922, + "step": 6001 + }, + { + "epoch": 1.2147338595426027, + "grad_norm": 0.2810695469379425, + "learning_rate": 6.702463784348045e-05, + "loss": 0.1942, + "step": 6002 + }, + { + "epoch": 1.214936247723133, + "grad_norm": 0.3523179292678833, + "learning_rate": 6.69946114262571e-05, + "loss": 0.2278, + "step": 6003 + }, + { + "epoch": 1.2151386359036631, + "grad_norm": 0.2615192234516144, + "learning_rate": 6.696458834817258e-05, + "loss": 0.1566, + "step": 6004 + }, + { + "epoch": 1.2153410240841935, + "grad_norm": 0.27877670526504517, + "learning_rate": 6.693456861226438e-05, + "loss": 0.2057, + "step": 6005 + }, + { + "epoch": 1.2155434122647237, + "grad_norm": 0.3264453113079071, + "learning_rate": 6.690455222156959e-05, + "loss": 0.1961, + "step": 6006 + }, + { + "epoch": 1.215745800445254, + "grad_norm": 0.27684032917022705, + "learning_rate": 6.687453917912492e-05, + "loss": 0.1956, + "step": 6007 + }, + { + "epoch": 1.2159481886257844, + "grad_norm": 0.27163904905319214, + "learning_rate": 6.68445294879668e-05, + "loss": 0.1715, + "step": 6008 + }, + { + "epoch": 1.2161505768063146, + "grad_norm": 0.2859479784965515, + "learning_rate": 6.68145231511313e-05, + "loss": 0.1967, + "step": 6009 + }, + { + "epoch": 1.2163529649868448, + "grad_norm": 0.3354541063308716, + "learning_rate": 6.678452017165413e-05, + "loss": 0.2147, + "step": 6010 + }, + { + "epoch": 1.216555353167375, + "grad_norm": 0.3036941587924957, + "learning_rate": 6.675452055257067e-05, + "loss": 0.2056, + "step": 6011 + }, + { + "epoch": 1.2167577413479054, + "grad_norm": 0.2504706084728241, + "learning_rate": 6.6724524296916e-05, + "loss": 0.1774, + "step": 6012 + }, + { + "epoch": 1.2169601295284356, + "grad_norm": 0.2577364444732666, + "learning_rate": 6.669453140772477e-05, + "loss": 0.1485, + "step": 6013 + }, + { + "epoch": 1.2171625177089658, + "grad_norm": 0.28054022789001465, + "learning_rate": 6.666454188803142e-05, + "loss": 0.1875, + "step": 6014 + }, + { + "epoch": 1.217364905889496, + "grad_norm": 0.28038644790649414, + "learning_rate": 6.663455574086992e-05, + "loss": 0.2117, + "step": 6015 + }, + { + "epoch": 1.2175672940700264, + "grad_norm": 0.28910353779792786, + "learning_rate": 6.660457296927398e-05, + "loss": 0.1783, + "step": 6016 + }, + { + "epoch": 1.2177696822505566, + "grad_norm": 0.2723628580570221, + "learning_rate": 6.657459357627693e-05, + "loss": 0.2085, + "step": 6017 + }, + { + "epoch": 1.2179720704310868, + "grad_norm": 0.2672482430934906, + "learning_rate": 6.654461756491177e-05, + "loss": 0.171, + "step": 6018 + }, + { + "epoch": 1.218174458611617, + "grad_norm": 0.25246506929397583, + "learning_rate": 6.651464493821116e-05, + "loss": 0.1909, + "step": 6019 + }, + { + "epoch": 1.2183768467921474, + "grad_norm": 0.23635558784008026, + "learning_rate": 6.648467569920742e-05, + "loss": 0.1799, + "step": 6020 + }, + { + "epoch": 1.2185792349726776, + "grad_norm": 0.2640747129917145, + "learning_rate": 6.645470985093253e-05, + "loss": 0.1684, + "step": 6021 + }, + { + "epoch": 1.2187816231532078, + "grad_norm": 0.26772499084472656, + "learning_rate": 6.642474739641811e-05, + "loss": 0.1854, + "step": 6022 + }, + { + "epoch": 1.218984011333738, + "grad_norm": 0.2757259011268616, + "learning_rate": 6.639478833869543e-05, + "loss": 0.2053, + "step": 6023 + }, + { + "epoch": 1.2191863995142684, + "grad_norm": 0.25957462191581726, + "learning_rate": 6.636483268079545e-05, + "loss": 0.1799, + "step": 6024 + }, + { + "epoch": 1.2193887876947986, + "grad_norm": 0.34773412346839905, + "learning_rate": 6.633488042574882e-05, + "loss": 0.2232, + "step": 6025 + }, + { + "epoch": 1.2195911758753288, + "grad_norm": 0.24219773709774017, + "learning_rate": 6.630493157658571e-05, + "loss": 0.1838, + "step": 6026 + }, + { + "epoch": 1.219793564055859, + "grad_norm": 0.34960097074508667, + "learning_rate": 6.627498613633606e-05, + "loss": 0.214, + "step": 6027 + }, + { + "epoch": 1.2199959522363895, + "grad_norm": 0.3201826214790344, + "learning_rate": 6.624504410802944e-05, + "loss": 0.2323, + "step": 6028 + }, + { + "epoch": 1.2201983404169197, + "grad_norm": 0.28623464703559875, + "learning_rate": 6.621510549469507e-05, + "loss": 0.1956, + "step": 6029 + }, + { + "epoch": 1.2204007285974499, + "grad_norm": 0.25482234358787537, + "learning_rate": 6.618517029936182e-05, + "loss": 0.1587, + "step": 6030 + }, + { + "epoch": 1.2206031167779803, + "grad_norm": 0.29422393441200256, + "learning_rate": 6.615523852505825e-05, + "loss": 0.1794, + "step": 6031 + }, + { + "epoch": 1.2208055049585105, + "grad_norm": 0.27444425225257874, + "learning_rate": 6.612531017481248e-05, + "loss": 0.2016, + "step": 6032 + }, + { + "epoch": 1.2210078931390407, + "grad_norm": 0.3157899081707001, + "learning_rate": 6.60953852516524e-05, + "loss": 0.2259, + "step": 6033 + }, + { + "epoch": 1.2212102813195709, + "grad_norm": 0.3079804480075836, + "learning_rate": 6.606546375860548e-05, + "loss": 0.2183, + "step": 6034 + }, + { + "epoch": 1.221412669500101, + "grad_norm": 0.3177958130836487, + "learning_rate": 6.603554569869888e-05, + "loss": 0.1983, + "step": 6035 + }, + { + "epoch": 1.2216150576806315, + "grad_norm": 0.28156325221061707, + "learning_rate": 6.600563107495937e-05, + "loss": 0.205, + "step": 6036 + }, + { + "epoch": 1.2218174458611617, + "grad_norm": 0.3072798252105713, + "learning_rate": 6.59757198904134e-05, + "loss": 0.1835, + "step": 6037 + }, + { + "epoch": 1.222019834041692, + "grad_norm": 0.30911877751350403, + "learning_rate": 6.594581214808708e-05, + "loss": 0.1635, + "step": 6038 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.2736305296421051, + "learning_rate": 6.591590785100617e-05, + "loss": 0.1832, + "step": 6039 + }, + { + "epoch": 1.2224246104027525, + "grad_norm": 0.27624863386154175, + "learning_rate": 6.588600700219608e-05, + "loss": 0.2023, + "step": 6040 + }, + { + "epoch": 1.2226269985832827, + "grad_norm": 0.2884620130062103, + "learning_rate": 6.585610960468182e-05, + "loss": 0.1468, + "step": 6041 + }, + { + "epoch": 1.222829386763813, + "grad_norm": 0.2596905529499054, + "learning_rate": 6.58262156614881e-05, + "loss": 0.1543, + "step": 6042 + }, + { + "epoch": 1.2230317749443433, + "grad_norm": 0.26053115725517273, + "learning_rate": 6.579632517563934e-05, + "loss": 0.1909, + "step": 6043 + }, + { + "epoch": 1.2232341631248735, + "grad_norm": 0.2973617911338806, + "learning_rate": 6.576643815015949e-05, + "loss": 0.1814, + "step": 6044 + }, + { + "epoch": 1.2234365513054037, + "grad_norm": 0.2856094539165497, + "learning_rate": 6.573655458807222e-05, + "loss": 0.2318, + "step": 6045 + }, + { + "epoch": 1.223638939485934, + "grad_norm": 0.29081985354423523, + "learning_rate": 6.570667449240083e-05, + "loss": 0.2022, + "step": 6046 + }, + { + "epoch": 1.2238413276664644, + "grad_norm": 0.36599013209342957, + "learning_rate": 6.567679786616834e-05, + "loss": 0.2136, + "step": 6047 + }, + { + "epoch": 1.2240437158469946, + "grad_norm": 0.36280548572540283, + "learning_rate": 6.564692471239723e-05, + "loss": 0.1944, + "step": 6048 + }, + { + "epoch": 1.2242461040275248, + "grad_norm": 0.30856937170028687, + "learning_rate": 6.561705503410982e-05, + "loss": 0.2411, + "step": 6049 + }, + { + "epoch": 1.224448492208055, + "grad_norm": 0.29295429587364197, + "learning_rate": 6.558718883432802e-05, + "loss": 0.218, + "step": 6050 + }, + { + "epoch": 1.224448492208055, + "eval_loss": 0.26702526211738586, + "eval_runtime": 0.7381, + "eval_samples_per_second": 6.774, + "eval_steps_per_second": 1.355, + "step": 6050 + }, + { + "epoch": 1.2246508803885854, + "grad_norm": 0.2865926921367645, + "learning_rate": 6.555732611607335e-05, + "loss": 0.1894, + "step": 6051 + }, + { + "epoch": 1.2248532685691156, + "grad_norm": 0.2995022237300873, + "learning_rate": 6.552746688236702e-05, + "loss": 0.1862, + "step": 6052 + }, + { + "epoch": 1.2250556567496458, + "grad_norm": 0.27584776282310486, + "learning_rate": 6.549761113622988e-05, + "loss": 0.1876, + "step": 6053 + }, + { + "epoch": 1.225258044930176, + "grad_norm": 0.35745367407798767, + "learning_rate": 6.54677588806824e-05, + "loss": 0.2175, + "step": 6054 + }, + { + "epoch": 1.2254604331107064, + "grad_norm": 0.24347814917564392, + "learning_rate": 6.543791011874476e-05, + "loss": 0.1802, + "step": 6055 + }, + { + "epoch": 1.2256628212912366, + "grad_norm": 0.35814881324768066, + "learning_rate": 6.54080648534367e-05, + "loss": 0.1824, + "step": 6056 + }, + { + "epoch": 1.2258652094717668, + "grad_norm": 0.23016677796840668, + "learning_rate": 6.537822308777769e-05, + "loss": 0.1863, + "step": 6057 + }, + { + "epoch": 1.226067597652297, + "grad_norm": 0.3142828047275543, + "learning_rate": 6.534838482478675e-05, + "loss": 0.2092, + "step": 6058 + }, + { + "epoch": 1.2262699858328274, + "grad_norm": 0.26234740018844604, + "learning_rate": 6.531855006748267e-05, + "loss": 0.179, + "step": 6059 + }, + { + "epoch": 1.2264723740133576, + "grad_norm": 0.31807130575180054, + "learning_rate": 6.528871881888376e-05, + "loss": 0.223, + "step": 6060 + }, + { + "epoch": 1.2266747621938878, + "grad_norm": 0.3054802417755127, + "learning_rate": 6.525889108200808e-05, + "loss": 0.2306, + "step": 6061 + }, + { + "epoch": 1.2268771503744182, + "grad_norm": 0.2600635886192322, + "learning_rate": 6.522906685987326e-05, + "loss": 0.1949, + "step": 6062 + }, + { + "epoch": 1.2270795385549484, + "grad_norm": 0.27494141459465027, + "learning_rate": 6.51992461554966e-05, + "loss": 0.1904, + "step": 6063 + }, + { + "epoch": 1.2272819267354786, + "grad_norm": 0.2199246734380722, + "learning_rate": 6.516942897189506e-05, + "loss": 0.1731, + "step": 6064 + }, + { + "epoch": 1.2274843149160088, + "grad_norm": 0.27572178840637207, + "learning_rate": 6.513961531208523e-05, + "loss": 0.1338, + "step": 6065 + }, + { + "epoch": 1.227686703096539, + "grad_norm": 0.2560636103153229, + "learning_rate": 6.510980517908334e-05, + "loss": 0.1797, + "step": 6066 + }, + { + "epoch": 1.2278890912770695, + "grad_norm": 0.27987346053123474, + "learning_rate": 6.507999857590525e-05, + "loss": 0.1868, + "step": 6067 + }, + { + "epoch": 1.2280914794575997, + "grad_norm": 0.2953600585460663, + "learning_rate": 6.50501955055665e-05, + "loss": 0.2177, + "step": 6068 + }, + { + "epoch": 1.2282938676381299, + "grad_norm": 0.29752638936042786, + "learning_rate": 6.502039597108226e-05, + "loss": 0.2291, + "step": 6069 + }, + { + "epoch": 1.2284962558186603, + "grad_norm": 0.3324909806251526, + "learning_rate": 6.49905999754673e-05, + "loss": 0.205, + "step": 6070 + }, + { + "epoch": 1.2286986439991905, + "grad_norm": 0.2881925404071808, + "learning_rate": 6.496080752173607e-05, + "loss": 0.203, + "step": 6071 + }, + { + "epoch": 1.2289010321797207, + "grad_norm": 0.44888433814048767, + "learning_rate": 6.49310186129027e-05, + "loss": 0.1765, + "step": 6072 + }, + { + "epoch": 1.2291034203602509, + "grad_norm": 0.283622682094574, + "learning_rate": 6.490123325198089e-05, + "loss": 0.2107, + "step": 6073 + }, + { + "epoch": 1.2293058085407813, + "grad_norm": 0.25115031003952026, + "learning_rate": 6.4871451441984e-05, + "loss": 0.1669, + "step": 6074 + }, + { + "epoch": 1.2295081967213115, + "grad_norm": 0.29004356265068054, + "learning_rate": 6.484167318592505e-05, + "loss": 0.176, + "step": 6075 + }, + { + "epoch": 1.2297105849018417, + "grad_norm": 0.2971007227897644, + "learning_rate": 6.48118984868167e-05, + "loss": 0.2017, + "step": 6076 + }, + { + "epoch": 1.229912973082372, + "grad_norm": 0.30012375116348267, + "learning_rate": 6.478212734767124e-05, + "loss": 0.1948, + "step": 6077 + }, + { + "epoch": 1.2301153612629023, + "grad_norm": 0.2942335307598114, + "learning_rate": 6.47523597715006e-05, + "loss": 0.2029, + "step": 6078 + }, + { + "epoch": 1.2303177494434325, + "grad_norm": 0.2728146016597748, + "learning_rate": 6.472259576131635e-05, + "loss": 0.1736, + "step": 6079 + }, + { + "epoch": 1.2305201376239627, + "grad_norm": 0.29079195857048035, + "learning_rate": 6.469283532012969e-05, + "loss": 0.1831, + "step": 6080 + }, + { + "epoch": 1.230722525804493, + "grad_norm": 0.25617703795433044, + "learning_rate": 6.466307845095148e-05, + "loss": 0.1744, + "step": 6081 + }, + { + "epoch": 1.2309249139850234, + "grad_norm": 0.26472562551498413, + "learning_rate": 6.463332515679221e-05, + "loss": 0.1943, + "step": 6082 + }, + { + "epoch": 1.2311273021655535, + "grad_norm": 0.2645573616027832, + "learning_rate": 6.4603575440662e-05, + "loss": 0.1982, + "step": 6083 + }, + { + "epoch": 1.2313296903460837, + "grad_norm": 0.3020835220813751, + "learning_rate": 6.457382930557062e-05, + "loss": 0.181, + "step": 6084 + }, + { + "epoch": 1.231532078526614, + "grad_norm": 0.261520653963089, + "learning_rate": 6.454408675452747e-05, + "loss": 0.1818, + "step": 6085 + }, + { + "epoch": 1.2317344667071444, + "grad_norm": 0.2937348186969757, + "learning_rate": 6.451434779054158e-05, + "loss": 0.2025, + "step": 6086 + }, + { + "epoch": 1.2319368548876746, + "grad_norm": 0.2838839292526245, + "learning_rate": 6.448461241662163e-05, + "loss": 0.2023, + "step": 6087 + }, + { + "epoch": 1.2321392430682048, + "grad_norm": 0.3361426293849945, + "learning_rate": 6.445488063577595e-05, + "loss": 0.2433, + "step": 6088 + }, + { + "epoch": 1.232341631248735, + "grad_norm": 0.2763873338699341, + "learning_rate": 6.442515245101247e-05, + "loss": 0.1605, + "step": 6089 + }, + { + "epoch": 1.2325440194292654, + "grad_norm": 0.2895985543727875, + "learning_rate": 6.439542786533879e-05, + "loss": 0.1764, + "step": 6090 + }, + { + "epoch": 1.2327464076097956, + "grad_norm": 0.3270489573478699, + "learning_rate": 6.436570688176211e-05, + "loss": 0.2119, + "step": 6091 + }, + { + "epoch": 1.2329487957903258, + "grad_norm": 0.2826992869377136, + "learning_rate": 6.433598950328934e-05, + "loss": 0.2041, + "step": 6092 + }, + { + "epoch": 1.2331511839708562, + "grad_norm": 0.24582330882549286, + "learning_rate": 6.430627573292689e-05, + "loss": 0.1679, + "step": 6093 + }, + { + "epoch": 1.2333535721513864, + "grad_norm": 0.2923988401889801, + "learning_rate": 6.427656557368095e-05, + "loss": 0.1922, + "step": 6094 + }, + { + "epoch": 1.2335559603319166, + "grad_norm": 0.3192385733127594, + "learning_rate": 6.424685902855725e-05, + "loss": 0.1708, + "step": 6095 + }, + { + "epoch": 1.2337583485124468, + "grad_norm": 0.27581390738487244, + "learning_rate": 6.421715610056121e-05, + "loss": 0.175, + "step": 6096 + }, + { + "epoch": 1.233960736692977, + "grad_norm": 0.3236446976661682, + "learning_rate": 6.418745679269785e-05, + "loss": 0.1985, + "step": 6097 + }, + { + "epoch": 1.2341631248735074, + "grad_norm": 0.29190582036972046, + "learning_rate": 6.415776110797184e-05, + "loss": 0.1825, + "step": 6098 + }, + { + "epoch": 1.2343655130540376, + "grad_norm": 0.31546276807785034, + "learning_rate": 6.412806904938746e-05, + "loss": 0.1647, + "step": 6099 + }, + { + "epoch": 1.2345679012345678, + "grad_norm": 0.24743930995464325, + "learning_rate": 6.409838061994867e-05, + "loss": 0.1971, + "step": 6100 + }, + { + "epoch": 1.2345679012345678, + "eval_loss": 0.26864010095596313, + "eval_runtime": 0.7356, + "eval_samples_per_second": 6.797, + "eval_steps_per_second": 1.359, + "step": 6100 + }, + { + "epoch": 1.2347702894150983, + "grad_norm": 0.2839614450931549, + "learning_rate": 6.4068695822659e-05, + "loss": 0.1969, + "step": 6101 + }, + { + "epoch": 1.2349726775956285, + "grad_norm": 0.26908546686172485, + "learning_rate": 6.403901466052167e-05, + "loss": 0.1676, + "step": 6102 + }, + { + "epoch": 1.2351750657761587, + "grad_norm": 0.32045114040374756, + "learning_rate": 6.400933713653949e-05, + "loss": 0.2132, + "step": 6103 + }, + { + "epoch": 1.2353774539566889, + "grad_norm": 0.31629490852355957, + "learning_rate": 6.397966325371496e-05, + "loss": 0.1936, + "step": 6104 + }, + { + "epoch": 1.2355798421372193, + "grad_norm": 0.2909006178379059, + "learning_rate": 6.394999301505013e-05, + "loss": 0.2212, + "step": 6105 + }, + { + "epoch": 1.2357822303177495, + "grad_norm": 0.29362747073173523, + "learning_rate": 6.392032642354673e-05, + "loss": 0.1745, + "step": 6106 + }, + { + "epoch": 1.2359846184982797, + "grad_norm": 0.24971556663513184, + "learning_rate": 6.389066348220613e-05, + "loss": 0.1852, + "step": 6107 + }, + { + "epoch": 1.2361870066788099, + "grad_norm": 0.26508629322052, + "learning_rate": 6.386100419402931e-05, + "loss": 0.1669, + "step": 6108 + }, + { + "epoch": 1.2363893948593403, + "grad_norm": 0.3305191695690155, + "learning_rate": 6.383134856201689e-05, + "loss": 0.1902, + "step": 6109 + }, + { + "epoch": 1.2365917830398705, + "grad_norm": 0.27122199535369873, + "learning_rate": 6.38016965891691e-05, + "loss": 0.1783, + "step": 6110 + }, + { + "epoch": 1.2367941712204007, + "grad_norm": 0.271418958902359, + "learning_rate": 6.377204827848584e-05, + "loss": 0.1913, + "step": 6111 + }, + { + "epoch": 1.236996559400931, + "grad_norm": 0.3329184651374817, + "learning_rate": 6.374240363296657e-05, + "loss": 0.2314, + "step": 6112 + }, + { + "epoch": 1.2371989475814613, + "grad_norm": 0.270542174577713, + "learning_rate": 6.371276265561047e-05, + "loss": 0.1918, + "step": 6113 + }, + { + "epoch": 1.2374013357619915, + "grad_norm": 0.24374881386756897, + "learning_rate": 6.368312534941632e-05, + "loss": 0.1699, + "step": 6114 + }, + { + "epoch": 1.2376037239425217, + "grad_norm": 0.24941346049308777, + "learning_rate": 6.365349171738244e-05, + "loss": 0.156, + "step": 6115 + }, + { + "epoch": 1.2378061121230521, + "grad_norm": 0.2745361924171448, + "learning_rate": 6.362386176250689e-05, + "loss": 0.2191, + "step": 6116 + }, + { + "epoch": 1.2380085003035823, + "grad_norm": 0.25560203194618225, + "learning_rate": 6.359423548778733e-05, + "loss": 0.1756, + "step": 6117 + }, + { + "epoch": 1.2382108884841125, + "grad_norm": 0.3089434802532196, + "learning_rate": 6.356461289622102e-05, + "loss": 0.1757, + "step": 6118 + }, + { + "epoch": 1.2384132766646427, + "grad_norm": 0.30926546454429626, + "learning_rate": 6.353499399080485e-05, + "loss": 0.2051, + "step": 6119 + }, + { + "epoch": 1.238615664845173, + "grad_norm": 0.26371267437934875, + "learning_rate": 6.350537877453537e-05, + "loss": 0.1658, + "step": 6120 + }, + { + "epoch": 1.2388180530257034, + "grad_norm": 0.30429723858833313, + "learning_rate": 6.347576725040874e-05, + "loss": 0.192, + "step": 6121 + }, + { + "epoch": 1.2390204412062336, + "grad_norm": 0.2675933837890625, + "learning_rate": 6.344615942142071e-05, + "loss": 0.1864, + "step": 6122 + }, + { + "epoch": 1.2392228293867638, + "grad_norm": 0.275258868932724, + "learning_rate": 6.341655529056675e-05, + "loss": 0.1937, + "step": 6123 + }, + { + "epoch": 1.2394252175672942, + "grad_norm": 0.26812008023262024, + "learning_rate": 6.338695486084184e-05, + "loss": 0.1943, + "step": 6124 + }, + { + "epoch": 1.2396276057478244, + "grad_norm": 0.29009515047073364, + "learning_rate": 6.335735813524066e-05, + "loss": 0.1872, + "step": 6125 + }, + { + "epoch": 1.2398299939283546, + "grad_norm": 0.2780061364173889, + "learning_rate": 6.33277651167575e-05, + "loss": 0.1845, + "step": 6126 + }, + { + "epoch": 1.2400323821088848, + "grad_norm": 0.25830531120300293, + "learning_rate": 6.329817580838628e-05, + "loss": 0.1902, + "step": 6127 + }, + { + "epoch": 1.240234770289415, + "grad_norm": 0.3139815926551819, + "learning_rate": 6.32685902131205e-05, + "loss": 0.2267, + "step": 6128 + }, + { + "epoch": 1.2404371584699454, + "grad_norm": 0.2877956032752991, + "learning_rate": 6.323900833395338e-05, + "loss": 0.2039, + "step": 6129 + }, + { + "epoch": 1.2406395466504756, + "grad_norm": 0.2266833633184433, + "learning_rate": 6.320943017387764e-05, + "loss": 0.1606, + "step": 6130 + }, + { + "epoch": 1.2408419348310058, + "grad_norm": 0.24923524260520935, + "learning_rate": 6.317985573588572e-05, + "loss": 0.1749, + "step": 6131 + }, + { + "epoch": 1.2410443230115362, + "grad_norm": 0.26781517267227173, + "learning_rate": 6.315028502296965e-05, + "loss": 0.2144, + "step": 6132 + }, + { + "epoch": 1.2412467111920664, + "grad_norm": 0.4893890619277954, + "learning_rate": 6.312071803812107e-05, + "loss": 0.1791, + "step": 6133 + }, + { + "epoch": 1.2414490993725966, + "grad_norm": 0.28728941082954407, + "learning_rate": 6.309115478433129e-05, + "loss": 0.1983, + "step": 6134 + }, + { + "epoch": 1.2416514875531268, + "grad_norm": 0.24946346879005432, + "learning_rate": 6.306159526459118e-05, + "loss": 0.1551, + "step": 6135 + }, + { + "epoch": 1.2418538757336572, + "grad_norm": 0.2808363437652588, + "learning_rate": 6.303203948189131e-05, + "loss": 0.156, + "step": 6136 + }, + { + "epoch": 1.2420562639141874, + "grad_norm": 0.251274973154068, + "learning_rate": 6.300248743922172e-05, + "loss": 0.189, + "step": 6137 + }, + { + "epoch": 1.2422586520947176, + "grad_norm": 0.26637983322143555, + "learning_rate": 6.297293913957227e-05, + "loss": 0.2127, + "step": 6138 + }, + { + "epoch": 1.2424610402752478, + "grad_norm": 0.2673642635345459, + "learning_rate": 6.29433945859323e-05, + "loss": 0.1819, + "step": 6139 + }, + { + "epoch": 1.2426634284557783, + "grad_norm": 0.33593377470970154, + "learning_rate": 6.291385378129085e-05, + "loss": 0.1829, + "step": 6140 + }, + { + "epoch": 1.2428658166363085, + "grad_norm": 0.23823416233062744, + "learning_rate": 6.288431672863654e-05, + "loss": 0.1773, + "step": 6141 + }, + { + "epoch": 1.2430682048168387, + "grad_norm": 0.4076298177242279, + "learning_rate": 6.28547834309576e-05, + "loss": 0.2245, + "step": 6142 + }, + { + "epoch": 1.2432705929973689, + "grad_norm": 0.28459057211875916, + "learning_rate": 6.282525389124192e-05, + "loss": 0.1853, + "step": 6143 + }, + { + "epoch": 1.2434729811778993, + "grad_norm": 0.27502378821372986, + "learning_rate": 6.279572811247698e-05, + "loss": 0.1984, + "step": 6144 + }, + { + "epoch": 1.2436753693584295, + "grad_norm": 0.2636633515357971, + "learning_rate": 6.276620609764988e-05, + "loss": 0.1823, + "step": 6145 + }, + { + "epoch": 1.2438777575389597, + "grad_norm": 0.28661084175109863, + "learning_rate": 6.273668784974737e-05, + "loss": 0.2045, + "step": 6146 + }, + { + "epoch": 1.24408014571949, + "grad_norm": 0.3029300570487976, + "learning_rate": 6.270717337175578e-05, + "loss": 0.1976, + "step": 6147 + }, + { + "epoch": 1.2442825339000203, + "grad_norm": 0.3371375501155853, + "learning_rate": 6.267766266666107e-05, + "loss": 0.19, + "step": 6148 + }, + { + "epoch": 1.2444849220805505, + "grad_norm": 0.2589677572250366, + "learning_rate": 6.264815573744884e-05, + "loss": 0.1714, + "step": 6149 + }, + { + "epoch": 1.2446873102610807, + "grad_norm": 0.36157044768333435, + "learning_rate": 6.261865258710428e-05, + "loss": 0.2119, + "step": 6150 + }, + { + "epoch": 1.2446873102610807, + "eval_loss": 0.2650595009326935, + "eval_runtime": 0.737, + "eval_samples_per_second": 6.784, + "eval_steps_per_second": 1.357, + "step": 6150 + }, + { + "epoch": 1.244889698441611, + "grad_norm": 0.31364962458610535, + "learning_rate": 6.258915321861223e-05, + "loss": 0.1856, + "step": 6151 + }, + { + "epoch": 1.2450920866221413, + "grad_norm": 0.2639737129211426, + "learning_rate": 6.255965763495709e-05, + "loss": 0.1939, + "step": 6152 + }, + { + "epoch": 1.2452944748026715, + "grad_norm": 0.2909432053565979, + "learning_rate": 6.253016583912295e-05, + "loss": 0.1955, + "step": 6153 + }, + { + "epoch": 1.2454968629832017, + "grad_norm": 0.2598069906234741, + "learning_rate": 6.250067783409345e-05, + "loss": 0.2062, + "step": 6154 + }, + { + "epoch": 1.2456992511637321, + "grad_norm": 0.35393086075782776, + "learning_rate": 6.24711936228519e-05, + "loss": 0.194, + "step": 6155 + }, + { + "epoch": 1.2459016393442623, + "grad_norm": 0.27285313606262207, + "learning_rate": 6.244171320838118e-05, + "loss": 0.1832, + "step": 6156 + }, + { + "epoch": 1.2461040275247925, + "grad_norm": 0.30949246883392334, + "learning_rate": 6.241223659366383e-05, + "loss": 0.1989, + "step": 6157 + }, + { + "epoch": 1.2463064157053227, + "grad_norm": 0.32277756929397583, + "learning_rate": 6.238276378168202e-05, + "loss": 0.2191, + "step": 6158 + }, + { + "epoch": 1.246508803885853, + "grad_norm": 0.2878342568874359, + "learning_rate": 6.235329477541743e-05, + "loss": 0.1834, + "step": 6159 + }, + { + "epoch": 1.2467111920663834, + "grad_norm": 0.2992894947528839, + "learning_rate": 6.232382957785143e-05, + "loss": 0.193, + "step": 6160 + }, + { + "epoch": 1.2469135802469136, + "grad_norm": 0.26729676127433777, + "learning_rate": 6.229436819196503e-05, + "loss": 0.1742, + "step": 6161 + }, + { + "epoch": 1.2471159684274438, + "grad_norm": 0.2526571452617645, + "learning_rate": 6.226491062073882e-05, + "loss": 0.1429, + "step": 6162 + }, + { + "epoch": 1.2473183566079742, + "grad_norm": 0.2803010642528534, + "learning_rate": 6.2235456867153e-05, + "loss": 0.2015, + "step": 6163 + }, + { + "epoch": 1.2475207447885044, + "grad_norm": 0.307064950466156, + "learning_rate": 6.220600693418739e-05, + "loss": 0.1896, + "step": 6164 + }, + { + "epoch": 1.2477231329690346, + "grad_norm": 0.28746598958969116, + "learning_rate": 6.217656082482143e-05, + "loss": 0.1931, + "step": 6165 + }, + { + "epoch": 1.2479255211495648, + "grad_norm": 0.31549063324928284, + "learning_rate": 6.214711854203417e-05, + "loss": 0.2117, + "step": 6166 + }, + { + "epoch": 1.2481279093300952, + "grad_norm": 0.31217721104621887, + "learning_rate": 6.211768008880427e-05, + "loss": 0.2061, + "step": 6167 + }, + { + "epoch": 1.2483302975106254, + "grad_norm": 0.287009060382843, + "learning_rate": 6.208824546811001e-05, + "loss": 0.2023, + "step": 6168 + }, + { + "epoch": 1.2485326856911556, + "grad_norm": 0.32352015376091003, + "learning_rate": 6.205881468292927e-05, + "loss": 0.2042, + "step": 6169 + }, + { + "epoch": 1.2487350738716858, + "grad_norm": 0.3012515604496002, + "learning_rate": 6.202938773623954e-05, + "loss": 0.1969, + "step": 6170 + }, + { + "epoch": 1.2489374620522162, + "grad_norm": 0.26024118065834045, + "learning_rate": 6.199996463101795e-05, + "loss": 0.1842, + "step": 6171 + }, + { + "epoch": 1.2491398502327464, + "grad_norm": 0.2766067087650299, + "learning_rate": 6.19705453702412e-05, + "loss": 0.1998, + "step": 6172 + }, + { + "epoch": 1.2493422384132766, + "grad_norm": 0.2818344235420227, + "learning_rate": 6.194112995688563e-05, + "loss": 0.1755, + "step": 6173 + }, + { + "epoch": 1.2495446265938068, + "grad_norm": 0.2871859073638916, + "learning_rate": 6.19117183939272e-05, + "loss": 0.1693, + "step": 6174 + }, + { + "epoch": 1.2497470147743373, + "grad_norm": 0.2709919214248657, + "learning_rate": 6.188231068434143e-05, + "loss": 0.198, + "step": 6175 + }, + { + "epoch": 1.2499494029548675, + "grad_norm": 0.2757877707481384, + "learning_rate": 6.18529068311035e-05, + "loss": 0.2063, + "step": 6176 + }, + { + "epoch": 1.2501517911353976, + "grad_norm": 0.33978694677352905, + "learning_rate": 6.18235068371882e-05, + "loss": 0.1982, + "step": 6177 + }, + { + "epoch": 1.250354179315928, + "grad_norm": 0.25776079297065735, + "learning_rate": 6.179411070556989e-05, + "loss": 0.1908, + "step": 6178 + }, + { + "epoch": 1.2505565674964583, + "grad_norm": 0.2702656388282776, + "learning_rate": 6.176471843922256e-05, + "loss": 0.1746, + "step": 6179 + }, + { + "epoch": 1.2507589556769885, + "grad_norm": 0.3064190745353699, + "learning_rate": 6.173533004111982e-05, + "loss": 0.199, + "step": 6180 + }, + { + "epoch": 1.2509613438575187, + "grad_norm": 0.2589879631996155, + "learning_rate": 6.170594551423493e-05, + "loss": 0.1843, + "step": 6181 + }, + { + "epoch": 1.2511637320380489, + "grad_norm": 0.2778613269329071, + "learning_rate": 6.167656486154061e-05, + "loss": 0.1809, + "step": 6182 + }, + { + "epoch": 1.2513661202185793, + "grad_norm": 0.27029654383659363, + "learning_rate": 6.164718808600933e-05, + "loss": 0.1613, + "step": 6183 + }, + { + "epoch": 1.2515685083991095, + "grad_norm": 0.2589007616043091, + "learning_rate": 6.161781519061314e-05, + "loss": 0.1769, + "step": 6184 + }, + { + "epoch": 1.2517708965796397, + "grad_norm": 0.3096887767314911, + "learning_rate": 6.158844617832367e-05, + "loss": 0.2047, + "step": 6185 + }, + { + "epoch": 1.2519732847601701, + "grad_norm": 0.35860517621040344, + "learning_rate": 6.155908105211216e-05, + "loss": 0.1758, + "step": 6186 + }, + { + "epoch": 1.2521756729407003, + "grad_norm": 0.3288900554180145, + "learning_rate": 6.152971981494948e-05, + "loss": 0.2195, + "step": 6187 + }, + { + "epoch": 1.2523780611212305, + "grad_norm": 0.2885974049568176, + "learning_rate": 6.150036246980609e-05, + "loss": 0.1957, + "step": 6188 + }, + { + "epoch": 1.2525804493017607, + "grad_norm": 0.27418920397758484, + "learning_rate": 6.147100901965203e-05, + "loss": 0.1887, + "step": 6189 + }, + { + "epoch": 1.252782837482291, + "grad_norm": 0.2852184772491455, + "learning_rate": 6.144165946745701e-05, + "loss": 0.1942, + "step": 6190 + }, + { + "epoch": 1.2529852256628213, + "grad_norm": 0.2928641140460968, + "learning_rate": 6.14123138161903e-05, + "loss": 0.1887, + "step": 6191 + }, + { + "epoch": 1.2531876138433515, + "grad_norm": 0.3721943497657776, + "learning_rate": 6.138297206882077e-05, + "loss": 0.214, + "step": 6192 + }, + { + "epoch": 1.2533900020238817, + "grad_norm": 0.32005420327186584, + "learning_rate": 6.135363422831695e-05, + "loss": 0.186, + "step": 6193 + }, + { + "epoch": 1.2535923902044122, + "grad_norm": 0.34797215461730957, + "learning_rate": 6.132430029764688e-05, + "loss": 0.2279, + "step": 6194 + }, + { + "epoch": 1.2537947783849424, + "grad_norm": 0.2843010425567627, + "learning_rate": 6.129497027977829e-05, + "loss": 0.1889, + "step": 6195 + }, + { + "epoch": 1.2539971665654726, + "grad_norm": 0.30852001905441284, + "learning_rate": 6.126564417767849e-05, + "loss": 0.2335, + "step": 6196 + }, + { + "epoch": 1.2541995547460028, + "grad_norm": 0.2834334075450897, + "learning_rate": 6.123632199431436e-05, + "loss": 0.1958, + "step": 6197 + }, + { + "epoch": 1.254401942926533, + "grad_norm": 0.2763972580432892, + "learning_rate": 6.120700373265245e-05, + "loss": 0.1835, + "step": 6198 + }, + { + "epoch": 1.2546043311070634, + "grad_norm": 0.27830395102500916, + "learning_rate": 6.117768939565883e-05, + "loss": 0.1904, + "step": 6199 + }, + { + "epoch": 1.2548067192875936, + "grad_norm": 0.2894650101661682, + "learning_rate": 6.114837898629926e-05, + "loss": 0.207, + "step": 6200 + }, + { + "epoch": 1.2548067192875936, + "eval_loss": 0.2662354111671448, + "eval_runtime": 0.737, + "eval_samples_per_second": 6.784, + "eval_steps_per_second": 1.357, + "step": 6200 + }, + { + "epoch": 1.255009107468124, + "grad_norm": 0.29689645767211914, + "learning_rate": 6.111907250753903e-05, + "loss": 0.2057, + "step": 6201 + }, + { + "epoch": 1.2552114956486542, + "grad_norm": 0.31450599431991577, + "learning_rate": 6.108976996234307e-05, + "loss": 0.2192, + "step": 6202 + }, + { + "epoch": 1.2554138838291844, + "grad_norm": 0.293605238199234, + "learning_rate": 6.106047135367594e-05, + "loss": 0.1811, + "step": 6203 + }, + { + "epoch": 1.2556162720097146, + "grad_norm": 0.28915974497795105, + "learning_rate": 6.103117668450171e-05, + "loss": 0.1885, + "step": 6204 + }, + { + "epoch": 1.2558186601902448, + "grad_norm": 0.337788462638855, + "learning_rate": 6.100188595778411e-05, + "loss": 0.1903, + "step": 6205 + }, + { + "epoch": 1.2560210483707752, + "grad_norm": 0.316983699798584, + "learning_rate": 6.097259917648649e-05, + "loss": 0.2047, + "step": 6206 + }, + { + "epoch": 1.2562234365513054, + "grad_norm": 0.28668999671936035, + "learning_rate": 6.0943316343571776e-05, + "loss": 0.1867, + "step": 6207 + }, + { + "epoch": 1.2564258247318356, + "grad_norm": 0.2853834927082062, + "learning_rate": 6.091403746200251e-05, + "loss": 0.1839, + "step": 6208 + }, + { + "epoch": 1.256628212912366, + "grad_norm": 0.29834699630737305, + "learning_rate": 6.088476253474078e-05, + "loss": 0.196, + "step": 6209 + }, + { + "epoch": 1.2568306010928962, + "grad_norm": 0.23479576408863068, + "learning_rate": 6.085549156474837e-05, + "loss": 0.1569, + "step": 6210 + }, + { + "epoch": 1.2570329892734264, + "grad_norm": 0.25744327902793884, + "learning_rate": 6.0826224554986574e-05, + "loss": 0.1651, + "step": 6211 + }, + { + "epoch": 1.2572353774539566, + "grad_norm": 0.2955494821071625, + "learning_rate": 6.079696150841634e-05, + "loss": 0.1895, + "step": 6212 + }, + { + "epoch": 1.2574377656344868, + "grad_norm": 0.2956841289997101, + "learning_rate": 6.076770242799818e-05, + "loss": 0.1915, + "step": 6213 + }, + { + "epoch": 1.2576401538150173, + "grad_norm": 0.2864188849925995, + "learning_rate": 6.0738447316692225e-05, + "loss": 0.188, + "step": 6214 + }, + { + "epoch": 1.2578425419955475, + "grad_norm": 0.3533359169960022, + "learning_rate": 6.0709196177458214e-05, + "loss": 0.2335, + "step": 6215 + }, + { + "epoch": 1.2580449301760777, + "grad_norm": 0.3068154752254486, + "learning_rate": 6.067994901325546e-05, + "loss": 0.1835, + "step": 6216 + }, + { + "epoch": 1.258247318356608, + "grad_norm": 0.28471609950065613, + "learning_rate": 6.0650705827042874e-05, + "loss": 0.1961, + "step": 6217 + }, + { + "epoch": 1.2584497065371383, + "grad_norm": 0.2817171514034271, + "learning_rate": 6.062146662177899e-05, + "loss": 0.2149, + "step": 6218 + }, + { + "epoch": 1.2586520947176685, + "grad_norm": 0.2808842957019806, + "learning_rate": 6.0592231400421914e-05, + "loss": 0.1907, + "step": 6219 + }, + { + "epoch": 1.2588544828981987, + "grad_norm": 0.29851841926574707, + "learning_rate": 6.056300016592937e-05, + "loss": 0.199, + "step": 6220 + }, + { + "epoch": 1.2590568710787289, + "grad_norm": 0.3412727415561676, + "learning_rate": 6.053377292125867e-05, + "loss": 0.2199, + "step": 6221 + }, + { + "epoch": 1.2592592592592593, + "grad_norm": 0.27769625186920166, + "learning_rate": 6.0504549669366706e-05, + "loss": 0.1949, + "step": 6222 + }, + { + "epoch": 1.2594616474397895, + "grad_norm": 0.26295939087867737, + "learning_rate": 6.047533041320998e-05, + "loss": 0.1899, + "step": 6223 + }, + { + "epoch": 1.2596640356203197, + "grad_norm": 0.30268317461013794, + "learning_rate": 6.0446115155744576e-05, + "loss": 0.1826, + "step": 6224 + }, + { + "epoch": 1.2598664238008501, + "grad_norm": 0.34082984924316406, + "learning_rate": 6.041690389992627e-05, + "loss": 0.2085, + "step": 6225 + }, + { + "epoch": 1.2600688119813803, + "grad_norm": 0.2618637979030609, + "learning_rate": 6.0387696648710246e-05, + "loss": 0.1653, + "step": 6226 + }, + { + "epoch": 1.2602712001619105, + "grad_norm": 0.27268511056900024, + "learning_rate": 6.035849340505142e-05, + "loss": 0.1629, + "step": 6227 + }, + { + "epoch": 1.2604735883424407, + "grad_norm": 0.29634785652160645, + "learning_rate": 6.0329294171904295e-05, + "loss": 0.189, + "step": 6228 + }, + { + "epoch": 1.260675976522971, + "grad_norm": 0.27574586868286133, + "learning_rate": 6.03000989522229e-05, + "loss": 0.1727, + "step": 6229 + }, + { + "epoch": 1.2608783647035013, + "grad_norm": 0.3131870627403259, + "learning_rate": 6.027090774896095e-05, + "loss": 0.1839, + "step": 6230 + }, + { + "epoch": 1.2610807528840315, + "grad_norm": 0.3620019853115082, + "learning_rate": 6.024172056507167e-05, + "loss": 0.1879, + "step": 6231 + }, + { + "epoch": 1.261283141064562, + "grad_norm": 0.2766468822956085, + "learning_rate": 6.021253740350793e-05, + "loss": 0.1889, + "step": 6232 + }, + { + "epoch": 1.2614855292450922, + "grad_norm": 0.2821180820465088, + "learning_rate": 6.0183358267222167e-05, + "loss": 0.1848, + "step": 6233 + }, + { + "epoch": 1.2616879174256224, + "grad_norm": 0.4128684997558594, + "learning_rate": 6.015418315916642e-05, + "loss": 0.1691, + "step": 6234 + }, + { + "epoch": 1.2618903056061526, + "grad_norm": 0.4056648015975952, + "learning_rate": 6.012501208229233e-05, + "loss": 0.2035, + "step": 6235 + }, + { + "epoch": 1.2620926937866828, + "grad_norm": 0.24870654940605164, + "learning_rate": 6.009584503955111e-05, + "loss": 0.1644, + "step": 6236 + }, + { + "epoch": 1.2622950819672132, + "grad_norm": 0.3007480800151825, + "learning_rate": 6.0066682033893586e-05, + "loss": 0.2186, + "step": 6237 + }, + { + "epoch": 1.2624974701477434, + "grad_norm": 0.302317351102829, + "learning_rate": 6.003752306827015e-05, + "loss": 0.2275, + "step": 6238 + }, + { + "epoch": 1.2626998583282736, + "grad_norm": 0.3214733302593231, + "learning_rate": 6.0008368145630814e-05, + "loss": 0.1849, + "step": 6239 + }, + { + "epoch": 1.262902246508804, + "grad_norm": 0.30345696210861206, + "learning_rate": 5.997921726892516e-05, + "loss": 0.1862, + "step": 6240 + }, + { + "epoch": 1.2631046346893342, + "grad_norm": 0.31955331563949585, + "learning_rate": 5.995007044110237e-05, + "loss": 0.238, + "step": 6241 + }, + { + "epoch": 1.2633070228698644, + "grad_norm": 0.3335302472114563, + "learning_rate": 5.992092766511121e-05, + "loss": 0.1993, + "step": 6242 + }, + { + "epoch": 1.2635094110503946, + "grad_norm": 0.27500903606414795, + "learning_rate": 5.989178894390004e-05, + "loss": 0.1757, + "step": 6243 + }, + { + "epoch": 1.2637117992309248, + "grad_norm": 0.2906457781791687, + "learning_rate": 5.9862654280416816e-05, + "loss": 0.2086, + "step": 6244 + }, + { + "epoch": 1.2639141874114552, + "grad_norm": 0.298608660697937, + "learning_rate": 5.9833523677609084e-05, + "loss": 0.2096, + "step": 6245 + }, + { + "epoch": 1.2641165755919854, + "grad_norm": 0.30972978472709656, + "learning_rate": 5.9804397138423965e-05, + "loss": 0.201, + "step": 6246 + }, + { + "epoch": 1.2643189637725156, + "grad_norm": 0.27621757984161377, + "learning_rate": 5.977527466580819e-05, + "loss": 0.1929, + "step": 6247 + }, + { + "epoch": 1.264521351953046, + "grad_norm": 0.37198740243911743, + "learning_rate": 5.974615626270803e-05, + "loss": 0.2281, + "step": 6248 + }, + { + "epoch": 1.2647237401335762, + "grad_norm": 0.3002682328224182, + "learning_rate": 5.9717041932069393e-05, + "loss": 0.1646, + "step": 6249 + }, + { + "epoch": 1.2649261283141064, + "grad_norm": 0.26520946621894836, + "learning_rate": 5.96879316768378e-05, + "loss": 0.1953, + "step": 6250 + }, + { + "epoch": 1.2649261283141064, + "eval_loss": 0.2648610472679138, + "eval_runtime": 0.7416, + "eval_samples_per_second": 6.742, + "eval_steps_per_second": 1.348, + "step": 6250 + }, + { + "epoch": 1.2651285164946366, + "grad_norm": 0.3371294140815735, + "learning_rate": 5.965882549995825e-05, + "loss": 0.2109, + "step": 6251 + }, + { + "epoch": 1.2653309046751668, + "grad_norm": 0.2856069803237915, + "learning_rate": 5.962972340437547e-05, + "loss": 0.2024, + "step": 6252 + }, + { + "epoch": 1.2655332928556973, + "grad_norm": 0.3305569589138031, + "learning_rate": 5.960062539303366e-05, + "loss": 0.2167, + "step": 6253 + }, + { + "epoch": 1.2657356810362275, + "grad_norm": 0.27187907695770264, + "learning_rate": 5.957153146887666e-05, + "loss": 0.1833, + "step": 6254 + }, + { + "epoch": 1.2659380692167577, + "grad_norm": 0.29358017444610596, + "learning_rate": 5.954244163484792e-05, + "loss": 0.1918, + "step": 6255 + }, + { + "epoch": 1.266140457397288, + "grad_norm": 0.28766417503356934, + "learning_rate": 5.95133558938904e-05, + "loss": 0.1937, + "step": 6256 + }, + { + "epoch": 1.2663428455778183, + "grad_norm": 0.2951597571372986, + "learning_rate": 5.9484274248946715e-05, + "loss": 0.1819, + "step": 6257 + }, + { + "epoch": 1.2665452337583485, + "grad_norm": 0.29034483432769775, + "learning_rate": 5.9455196702959035e-05, + "loss": 0.2177, + "step": 6258 + }, + { + "epoch": 1.2667476219388787, + "grad_norm": 0.2750239074230194, + "learning_rate": 5.942612325886912e-05, + "loss": 0.1965, + "step": 6259 + }, + { + "epoch": 1.266950010119409, + "grad_norm": 0.27870023250579834, + "learning_rate": 5.9397053919618317e-05, + "loss": 0.1724, + "step": 6260 + }, + { + "epoch": 1.2671523982999393, + "grad_norm": 0.27458661794662476, + "learning_rate": 5.9367988688147556e-05, + "loss": 0.1918, + "step": 6261 + }, + { + "epoch": 1.2673547864804695, + "grad_norm": 0.27289730310440063, + "learning_rate": 5.933892756739736e-05, + "loss": 0.2136, + "step": 6262 + }, + { + "epoch": 1.267557174661, + "grad_norm": 0.3496069610118866, + "learning_rate": 5.930987056030781e-05, + "loss": 0.202, + "step": 6263 + }, + { + "epoch": 1.2677595628415301, + "grad_norm": 0.29557445645332336, + "learning_rate": 5.9280817669818615e-05, + "loss": 0.2219, + "step": 6264 + }, + { + "epoch": 1.2679619510220603, + "grad_norm": 0.24535717070102692, + "learning_rate": 5.925176889886901e-05, + "loss": 0.1737, + "step": 6265 + }, + { + "epoch": 1.2681643392025905, + "grad_norm": 0.2717592716217041, + "learning_rate": 5.922272425039786e-05, + "loss": 0.1855, + "step": 6266 + }, + { + "epoch": 1.2683667273831207, + "grad_norm": 0.2716783285140991, + "learning_rate": 5.919368372734361e-05, + "loss": 0.1665, + "step": 6267 + }, + { + "epoch": 1.2685691155636512, + "grad_norm": 0.2683229446411133, + "learning_rate": 5.9164647332644266e-05, + "loss": 0.1919, + "step": 6268 + }, + { + "epoch": 1.2687715037441814, + "grad_norm": 0.2556900084018707, + "learning_rate": 5.913561506923741e-05, + "loss": 0.1909, + "step": 6269 + }, + { + "epoch": 1.2689738919247116, + "grad_norm": 0.30283623933792114, + "learning_rate": 5.9106586940060275e-05, + "loss": 0.2169, + "step": 6270 + }, + { + "epoch": 1.269176280105242, + "grad_norm": 0.2764374911785126, + "learning_rate": 5.907756294804955e-05, + "loss": 0.1926, + "step": 6271 + }, + { + "epoch": 1.2693786682857722, + "grad_norm": 0.29008007049560547, + "learning_rate": 5.904854309614162e-05, + "loss": 0.2117, + "step": 6272 + }, + { + "epoch": 1.2695810564663024, + "grad_norm": 0.27948057651519775, + "learning_rate": 5.901952738727239e-05, + "loss": 0.1912, + "step": 6273 + }, + { + "epoch": 1.2697834446468326, + "grad_norm": 0.2950563430786133, + "learning_rate": 5.899051582437738e-05, + "loss": 0.1747, + "step": 6274 + }, + { + "epoch": 1.2699858328273628, + "grad_norm": 0.3165188729763031, + "learning_rate": 5.8961508410391674e-05, + "loss": 0.2518, + "step": 6275 + }, + { + "epoch": 1.2701882210078932, + "grad_norm": 0.24708828330039978, + "learning_rate": 5.893250514824994e-05, + "loss": 0.186, + "step": 6276 + }, + { + "epoch": 1.2703906091884234, + "grad_norm": 0.32647305727005005, + "learning_rate": 5.8903506040886415e-05, + "loss": 0.2121, + "step": 6277 + }, + { + "epoch": 1.2705929973689536, + "grad_norm": 0.3253696858882904, + "learning_rate": 5.887451109123492e-05, + "loss": 0.2455, + "step": 6278 + }, + { + "epoch": 1.270795385549484, + "grad_norm": 0.3270389437675476, + "learning_rate": 5.8845520302228876e-05, + "loss": 0.229, + "step": 6279 + }, + { + "epoch": 1.2709977737300142, + "grad_norm": 0.2574213743209839, + "learning_rate": 5.8816533676801265e-05, + "loss": 0.2018, + "step": 6280 + }, + { + "epoch": 1.2712001619105444, + "grad_norm": 0.27948006987571716, + "learning_rate": 5.878755121788464e-05, + "loss": 0.2016, + "step": 6281 + }, + { + "epoch": 1.2714025500910746, + "grad_norm": 0.2876220941543579, + "learning_rate": 5.8758572928411136e-05, + "loss": 0.1904, + "step": 6282 + }, + { + "epoch": 1.2716049382716048, + "grad_norm": 0.26333025097846985, + "learning_rate": 5.872959881131248e-05, + "loss": 0.19, + "step": 6283 + }, + { + "epoch": 1.2718073264521352, + "grad_norm": 0.31372183561325073, + "learning_rate": 5.870062886951999e-05, + "loss": 0.2082, + "step": 6284 + }, + { + "epoch": 1.2720097146326654, + "grad_norm": 0.26160863041877747, + "learning_rate": 5.86716631059645e-05, + "loss": 0.1935, + "step": 6285 + }, + { + "epoch": 1.2722121028131956, + "grad_norm": 0.29009556770324707, + "learning_rate": 5.864270152357649e-05, + "loss": 0.2208, + "step": 6286 + }, + { + "epoch": 1.272414490993726, + "grad_norm": 0.29613327980041504, + "learning_rate": 5.8613744125285996e-05, + "loss": 0.179, + "step": 6287 + }, + { + "epoch": 1.2726168791742563, + "grad_norm": 0.29994410276412964, + "learning_rate": 5.85847909140226e-05, + "loss": 0.2183, + "step": 6288 + }, + { + "epoch": 1.2728192673547865, + "grad_norm": 0.28694406151771545, + "learning_rate": 5.855584189271549e-05, + "loss": 0.2246, + "step": 6289 + }, + { + "epoch": 1.2730216555353167, + "grad_norm": 0.26828375458717346, + "learning_rate": 5.852689706429344e-05, + "loss": 0.2023, + "step": 6290 + }, + { + "epoch": 1.273224043715847, + "grad_norm": 0.2522238492965698, + "learning_rate": 5.8497956431684766e-05, + "loss": 0.1794, + "step": 6291 + }, + { + "epoch": 1.2734264318963773, + "grad_norm": 0.306525856256485, + "learning_rate": 5.84690199978174e-05, + "loss": 0.2097, + "step": 6292 + }, + { + "epoch": 1.2736288200769075, + "grad_norm": 0.29265809059143066, + "learning_rate": 5.84400877656188e-05, + "loss": 0.206, + "step": 6293 + }, + { + "epoch": 1.273831208257438, + "grad_norm": 0.298153817653656, + "learning_rate": 5.841115973801603e-05, + "loss": 0.2021, + "step": 6294 + }, + { + "epoch": 1.274033596437968, + "grad_norm": 0.30544334650039673, + "learning_rate": 5.8382235917935745e-05, + "loss": 0.1921, + "step": 6295 + }, + { + "epoch": 1.2742359846184983, + "grad_norm": 0.2744888365268707, + "learning_rate": 5.835331630830414e-05, + "loss": 0.1753, + "step": 6296 + }, + { + "epoch": 1.2744383727990285, + "grad_norm": 0.2987912595272064, + "learning_rate": 5.832440091204698e-05, + "loss": 0.1775, + "step": 6297 + }, + { + "epoch": 1.2746407609795587, + "grad_norm": 0.31008341908454895, + "learning_rate": 5.829548973208965e-05, + "loss": 0.171, + "step": 6298 + }, + { + "epoch": 1.2748431491600891, + "grad_norm": 0.2730657458305359, + "learning_rate": 5.826658277135706e-05, + "loss": 0.21, + "step": 6299 + }, + { + "epoch": 1.2750455373406193, + "grad_norm": 0.308075875043869, + "learning_rate": 5.823768003277372e-05, + "loss": 0.2053, + "step": 6300 + }, + { + "epoch": 1.2750455373406193, + "eval_loss": 0.26606449484825134, + "eval_runtime": 0.7383, + "eval_samples_per_second": 6.772, + "eval_steps_per_second": 1.354, + "step": 6300 + }, + { + "epoch": 1.2752479255211495, + "grad_norm": 0.30432766675949097, + "learning_rate": 5.820878151926371e-05, + "loss": 0.229, + "step": 6301 + }, + { + "epoch": 1.27545031370168, + "grad_norm": 0.3013366162776947, + "learning_rate": 5.8179887233750674e-05, + "loss": 0.2356, + "step": 6302 + }, + { + "epoch": 1.2756527018822101, + "grad_norm": 0.3118283748626709, + "learning_rate": 5.815099717915784e-05, + "loss": 0.1861, + "step": 6303 + }, + { + "epoch": 1.2758550900627403, + "grad_norm": 0.29927361011505127, + "learning_rate": 5.812211135840799e-05, + "loss": 0.1997, + "step": 6304 + }, + { + "epoch": 1.2760574782432705, + "grad_norm": 0.289941668510437, + "learning_rate": 5.809322977442349e-05, + "loss": 0.2175, + "step": 6305 + }, + { + "epoch": 1.2762598664238007, + "grad_norm": 0.2943812608718872, + "learning_rate": 5.806435243012629e-05, + "loss": 0.2035, + "step": 6306 + }, + { + "epoch": 1.2764622546043312, + "grad_norm": 0.2723061740398407, + "learning_rate": 5.803547932843787e-05, + "loss": 0.2055, + "step": 6307 + }, + { + "epoch": 1.2766646427848614, + "grad_norm": 0.2959640622138977, + "learning_rate": 5.8006610472279336e-05, + "loss": 0.2127, + "step": 6308 + }, + { + "epoch": 1.2768670309653916, + "grad_norm": 0.2741999328136444, + "learning_rate": 5.797774586457132e-05, + "loss": 0.1693, + "step": 6309 + }, + { + "epoch": 1.277069419145922, + "grad_norm": 0.3455542325973511, + "learning_rate": 5.794888550823403e-05, + "loss": 0.2402, + "step": 6310 + }, + { + "epoch": 1.2772718073264522, + "grad_norm": 0.2634921669960022, + "learning_rate": 5.7920029406187284e-05, + "loss": 0.1774, + "step": 6311 + }, + { + "epoch": 1.2774741955069824, + "grad_norm": 0.3226475715637207, + "learning_rate": 5.789117756135042e-05, + "loss": 0.1888, + "step": 6312 + }, + { + "epoch": 1.2776765836875126, + "grad_norm": 0.2682722210884094, + "learning_rate": 5.786232997664236e-05, + "loss": 0.2034, + "step": 6313 + }, + { + "epoch": 1.2778789718680428, + "grad_norm": 0.26142412424087524, + "learning_rate": 5.7833486654981606e-05, + "loss": 0.2068, + "step": 6314 + }, + { + "epoch": 1.2780813600485732, + "grad_norm": 0.2496640533208847, + "learning_rate": 5.780464759928623e-05, + "loss": 0.1919, + "step": 6315 + }, + { + "epoch": 1.2782837482291034, + "grad_norm": 0.28327369689941406, + "learning_rate": 5.7775812812473864e-05, + "loss": 0.1949, + "step": 6316 + }, + { + "epoch": 1.2784861364096336, + "grad_norm": 0.2788563668727875, + "learning_rate": 5.774698229746169e-05, + "loss": 0.2042, + "step": 6317 + }, + { + "epoch": 1.278688524590164, + "grad_norm": 0.2682736814022064, + "learning_rate": 5.77181560571665e-05, + "loss": 0.19, + "step": 6318 + }, + { + "epoch": 1.2788909127706942, + "grad_norm": 0.29064783453941345, + "learning_rate": 5.7689334094504635e-05, + "loss": 0.2215, + "step": 6319 + }, + { + "epoch": 1.2790933009512244, + "grad_norm": 0.2953173518180847, + "learning_rate": 5.766051641239196e-05, + "loss": 0.1892, + "step": 6320 + }, + { + "epoch": 1.2792956891317546, + "grad_norm": 0.2516275942325592, + "learning_rate": 5.7631703013743984e-05, + "loss": 0.1574, + "step": 6321 + }, + { + "epoch": 1.279498077312285, + "grad_norm": 0.2933824360370636, + "learning_rate": 5.7602893901475744e-05, + "loss": 0.2212, + "step": 6322 + }, + { + "epoch": 1.2797004654928152, + "grad_norm": 0.3145001232624054, + "learning_rate": 5.757408907850181e-05, + "loss": 0.2046, + "step": 6323 + }, + { + "epoch": 1.2799028536733454, + "grad_norm": 0.2757995128631592, + "learning_rate": 5.754528854773639e-05, + "loss": 0.1678, + "step": 6324 + }, + { + "epoch": 1.2801052418538759, + "grad_norm": 0.3153713047504425, + "learning_rate": 5.7516492312093195e-05, + "loss": 0.1897, + "step": 6325 + }, + { + "epoch": 1.280307630034406, + "grad_norm": 0.29023733735084534, + "learning_rate": 5.748770037448552e-05, + "loss": 0.2178, + "step": 6326 + }, + { + "epoch": 1.2805100182149363, + "grad_norm": 0.24412359297275543, + "learning_rate": 5.745891273782626e-05, + "loss": 0.1861, + "step": 6327 + }, + { + "epoch": 1.2807124063954665, + "grad_norm": 0.28133055567741394, + "learning_rate": 5.7430129405027835e-05, + "loss": 0.2106, + "step": 6328 + }, + { + "epoch": 1.2809147945759967, + "grad_norm": 0.3459916412830353, + "learning_rate": 5.740135037900223e-05, + "loss": 0.2096, + "step": 6329 + }, + { + "epoch": 1.281117182756527, + "grad_norm": 0.29903319478034973, + "learning_rate": 5.737257566266101e-05, + "loss": 0.2135, + "step": 6330 + }, + { + "epoch": 1.2813195709370573, + "grad_norm": 0.33177486062049866, + "learning_rate": 5.73438052589153e-05, + "loss": 0.216, + "step": 6331 + }, + { + "epoch": 1.2815219591175875, + "grad_norm": 0.3481799066066742, + "learning_rate": 5.731503917067578e-05, + "loss": 0.2489, + "step": 6332 + }, + { + "epoch": 1.281724347298118, + "grad_norm": 0.2963894009590149, + "learning_rate": 5.728627740085273e-05, + "loss": 0.1812, + "step": 6333 + }, + { + "epoch": 1.281926735478648, + "grad_norm": 0.24134975671768188, + "learning_rate": 5.725751995235592e-05, + "loss": 0.1348, + "step": 6334 + }, + { + "epoch": 1.2821291236591783, + "grad_norm": 0.27999383211135864, + "learning_rate": 5.722876682809476e-05, + "loss": 0.1701, + "step": 6335 + }, + { + "epoch": 1.2823315118397085, + "grad_norm": 0.2429206371307373, + "learning_rate": 5.720001803097821e-05, + "loss": 0.1919, + "step": 6336 + }, + { + "epoch": 1.2825339000202387, + "grad_norm": 0.27424222230911255, + "learning_rate": 5.717127356391472e-05, + "loss": 0.213, + "step": 6337 + }, + { + "epoch": 1.2827362882007691, + "grad_norm": 0.2957404553890228, + "learning_rate": 5.714253342981235e-05, + "loss": 0.1927, + "step": 6338 + }, + { + "epoch": 1.2829386763812993, + "grad_norm": 0.2850241959095001, + "learning_rate": 5.711379763157876e-05, + "loss": 0.1778, + "step": 6339 + }, + { + "epoch": 1.2831410645618295, + "grad_norm": 0.30980175733566284, + "learning_rate": 5.708506617212113e-05, + "loss": 0.2397, + "step": 6340 + }, + { + "epoch": 1.28334345274236, + "grad_norm": 0.3064621090888977, + "learning_rate": 5.7056339054346194e-05, + "loss": 0.1958, + "step": 6341 + }, + { + "epoch": 1.2835458409228901, + "grad_norm": 0.2849607765674591, + "learning_rate": 5.702761628116029e-05, + "loss": 0.1998, + "step": 6342 + }, + { + "epoch": 1.2837482291034203, + "grad_norm": 0.25129014253616333, + "learning_rate": 5.6998897855469245e-05, + "loss": 0.2023, + "step": 6343 + }, + { + "epoch": 1.2839506172839505, + "grad_norm": 0.27141299843788147, + "learning_rate": 5.697018378017851e-05, + "loss": 0.2023, + "step": 6344 + }, + { + "epoch": 1.2841530054644807, + "grad_norm": 0.30553534626960754, + "learning_rate": 5.694147405819309e-05, + "loss": 0.2055, + "step": 6345 + }, + { + "epoch": 1.2843553936450112, + "grad_norm": 0.2521169185638428, + "learning_rate": 5.6912768692417505e-05, + "loss": 0.1964, + "step": 6346 + }, + { + "epoch": 1.2845577818255414, + "grad_norm": 0.2850729823112488, + "learning_rate": 5.688406768575587e-05, + "loss": 0.1935, + "step": 6347 + }, + { + "epoch": 1.2847601700060716, + "grad_norm": 0.27700698375701904, + "learning_rate": 5.6855371041111874e-05, + "loss": 0.2078, + "step": 6348 + }, + { + "epoch": 1.284962558186602, + "grad_norm": 0.24651356041431427, + "learning_rate": 5.682667876138871e-05, + "loss": 0.169, + "step": 6349 + }, + { + "epoch": 1.2851649463671322, + "grad_norm": 0.325048565864563, + "learning_rate": 5.679799084948918e-05, + "loss": 0.1984, + "step": 6350 + }, + { + "epoch": 1.2851649463671322, + "eval_loss": 0.2654205858707428, + "eval_runtime": 0.7368, + "eval_samples_per_second": 6.786, + "eval_steps_per_second": 1.357, + "step": 6350 + }, + { + "epoch": 1.2853673345476624, + "grad_norm": 0.28931573033332825, + "learning_rate": 5.676930730831562e-05, + "loss": 0.205, + "step": 6351 + }, + { + "epoch": 1.2855697227281926, + "grad_norm": 0.37909775972366333, + "learning_rate": 5.674062814076994e-05, + "loss": 0.2056, + "step": 6352 + }, + { + "epoch": 1.285772110908723, + "grad_norm": 0.25672927498817444, + "learning_rate": 5.671195334975358e-05, + "loss": 0.1567, + "step": 6353 + }, + { + "epoch": 1.2859744990892532, + "grad_norm": 0.25225627422332764, + "learning_rate": 5.668328293816756e-05, + "loss": 0.158, + "step": 6354 + }, + { + "epoch": 1.2861768872697834, + "grad_norm": 0.35539114475250244, + "learning_rate": 5.6654616908912473e-05, + "loss": 0.2258, + "step": 6355 + }, + { + "epoch": 1.2863792754503138, + "grad_norm": 0.2866462767124176, + "learning_rate": 5.6625955264888405e-05, + "loss": 0.2068, + "step": 6356 + }, + { + "epoch": 1.286581663630844, + "grad_norm": 0.26378700137138367, + "learning_rate": 5.659729800899509e-05, + "loss": 0.1868, + "step": 6357 + }, + { + "epoch": 1.2867840518113742, + "grad_norm": 0.3105945289134979, + "learning_rate": 5.656864514413174e-05, + "loss": 0.1813, + "step": 6358 + }, + { + "epoch": 1.2869864399919044, + "grad_norm": 0.24603483080863953, + "learning_rate": 5.6539996673197134e-05, + "loss": 0.1574, + "step": 6359 + }, + { + "epoch": 1.2871888281724346, + "grad_norm": 0.28006672859191895, + "learning_rate": 5.6511352599089664e-05, + "loss": 0.2039, + "step": 6360 + }, + { + "epoch": 1.287391216352965, + "grad_norm": 0.2715175449848175, + "learning_rate": 5.6482712924707203e-05, + "loss": 0.2076, + "step": 6361 + }, + { + "epoch": 1.2875936045334953, + "grad_norm": 0.2849876284599304, + "learning_rate": 5.6454077652947236e-05, + "loss": 0.2028, + "step": 6362 + }, + { + "epoch": 1.2877959927140255, + "grad_norm": 0.2738122045993805, + "learning_rate": 5.642544678670676e-05, + "loss": 0.2017, + "step": 6363 + }, + { + "epoch": 1.2879983808945559, + "grad_norm": 0.25495240092277527, + "learning_rate": 5.639682032888236e-05, + "loss": 0.1781, + "step": 6364 + }, + { + "epoch": 1.288200769075086, + "grad_norm": 0.2999967038631439, + "learning_rate": 5.6368198282370164e-05, + "loss": 0.2034, + "step": 6365 + }, + { + "epoch": 1.2884031572556163, + "grad_norm": 0.25779151916503906, + "learning_rate": 5.633958065006584e-05, + "loss": 0.193, + "step": 6366 + }, + { + "epoch": 1.2886055454361465, + "grad_norm": 0.30179300904273987, + "learning_rate": 5.6310967434864614e-05, + "loss": 0.1953, + "step": 6367 + }, + { + "epoch": 1.2888079336166767, + "grad_norm": 0.2599449157714844, + "learning_rate": 5.6282358639661284e-05, + "loss": 0.1783, + "step": 6368 + }, + { + "epoch": 1.289010321797207, + "grad_norm": 0.2871106266975403, + "learning_rate": 5.6253754267350176e-05, + "loss": 0.1946, + "step": 6369 + }, + { + "epoch": 1.2892127099777373, + "grad_norm": 0.2777661383152008, + "learning_rate": 5.62251543208252e-05, + "loss": 0.1887, + "step": 6370 + }, + { + "epoch": 1.2894150981582675, + "grad_norm": 0.2921324074268341, + "learning_rate": 5.619655880297978e-05, + "loss": 0.215, + "step": 6371 + }, + { + "epoch": 1.289617486338798, + "grad_norm": 0.29286885261535645, + "learning_rate": 5.616796771670692e-05, + "loss": 0.1865, + "step": 6372 + }, + { + "epoch": 1.2898198745193281, + "grad_norm": 0.3215929865837097, + "learning_rate": 5.613938106489916e-05, + "loss": 0.2336, + "step": 6373 + }, + { + "epoch": 1.2900222626998583, + "grad_norm": 0.25224849581718445, + "learning_rate": 5.611079885044859e-05, + "loss": 0.1729, + "step": 6374 + }, + { + "epoch": 1.2902246508803885, + "grad_norm": 0.2967127561569214, + "learning_rate": 5.608222107624688e-05, + "loss": 0.2173, + "step": 6375 + }, + { + "epoch": 1.2904270390609187, + "grad_norm": 0.2792723774909973, + "learning_rate": 5.60536477451852e-05, + "loss": 0.1813, + "step": 6376 + }, + { + "epoch": 1.2906294272414491, + "grad_norm": 0.3118976950645447, + "learning_rate": 5.6025078860154334e-05, + "loss": 0.2256, + "step": 6377 + }, + { + "epoch": 1.2908318154219793, + "grad_norm": 0.3105722665786743, + "learning_rate": 5.5996514424044565e-05, + "loss": 0.206, + "step": 6378 + }, + { + "epoch": 1.2910342036025095, + "grad_norm": 0.28627604246139526, + "learning_rate": 5.596795443974574e-05, + "loss": 0.2053, + "step": 6379 + }, + { + "epoch": 1.29123659178304, + "grad_norm": 0.32842501997947693, + "learning_rate": 5.593939891014726e-05, + "loss": 0.2459, + "step": 6380 + }, + { + "epoch": 1.2914389799635702, + "grad_norm": 0.3171040117740631, + "learning_rate": 5.5910847838138114e-05, + "loss": 0.1923, + "step": 6381 + }, + { + "epoch": 1.2916413681441004, + "grad_norm": 0.29570427536964417, + "learning_rate": 5.588230122660671e-05, + "loss": 0.2182, + "step": 6382 + }, + { + "epoch": 1.2918437563246306, + "grad_norm": 0.28231585025787354, + "learning_rate": 5.585375907844117e-05, + "loss": 0.1847, + "step": 6383 + }, + { + "epoch": 1.292046144505161, + "grad_norm": 0.30900073051452637, + "learning_rate": 5.582522139652906e-05, + "loss": 0.195, + "step": 6384 + }, + { + "epoch": 1.2922485326856912, + "grad_norm": 0.37727421522140503, + "learning_rate": 5.579668818375752e-05, + "loss": 0.2336, + "step": 6385 + }, + { + "epoch": 1.2924509208662214, + "grad_norm": 0.29819825291633606, + "learning_rate": 5.5768159443013255e-05, + "loss": 0.1505, + "step": 6386 + }, + { + "epoch": 1.2926533090467518, + "grad_norm": 0.2777939736843109, + "learning_rate": 5.57396351771825e-05, + "loss": 0.1769, + "step": 6387 + }, + { + "epoch": 1.292855697227282, + "grad_norm": 0.2909295856952667, + "learning_rate": 5.5711115389151036e-05, + "loss": 0.1709, + "step": 6388 + }, + { + "epoch": 1.2930580854078122, + "grad_norm": 0.30930832028388977, + "learning_rate": 5.5682600081804193e-05, + "loss": 0.2104, + "step": 6389 + }, + { + "epoch": 1.2932604735883424, + "grad_norm": 0.2792358994483948, + "learning_rate": 5.5654089258026866e-05, + "loss": 0.2252, + "step": 6390 + }, + { + "epoch": 1.2934628617688726, + "grad_norm": 0.2880384624004364, + "learning_rate": 5.5625582920703464e-05, + "loss": 0.1987, + "step": 6391 + }, + { + "epoch": 1.293665249949403, + "grad_norm": 0.2849469482898712, + "learning_rate": 5.559708107271797e-05, + "loss": 0.2174, + "step": 6392 + }, + { + "epoch": 1.2938676381299332, + "grad_norm": 0.28191232681274414, + "learning_rate": 5.55685837169539e-05, + "loss": 0.1831, + "step": 6393 + }, + { + "epoch": 1.2940700263104634, + "grad_norm": 0.26724427938461304, + "learning_rate": 5.554009085629431e-05, + "loss": 0.2101, + "step": 6394 + }, + { + "epoch": 1.2942724144909938, + "grad_norm": 0.28229594230651855, + "learning_rate": 5.551160249362183e-05, + "loss": 0.1904, + "step": 6395 + }, + { + "epoch": 1.294474802671524, + "grad_norm": 0.2788131833076477, + "learning_rate": 5.5483118631818586e-05, + "loss": 0.1992, + "step": 6396 + }, + { + "epoch": 1.2946771908520542, + "grad_norm": 0.271913081407547, + "learning_rate": 5.545463927376628e-05, + "loss": 0.1844, + "step": 6397 + }, + { + "epoch": 1.2948795790325844, + "grad_norm": 0.2459937036037445, + "learning_rate": 5.542616442234618e-05, + "loss": 0.1487, + "step": 6398 + }, + { + "epoch": 1.2950819672131146, + "grad_norm": 0.2703413665294647, + "learning_rate": 5.539769408043904e-05, + "loss": 0.1766, + "step": 6399 + }, + { + "epoch": 1.295284355393645, + "grad_norm": 0.3046536147594452, + "learning_rate": 5.536922825092523e-05, + "loss": 0.2165, + "step": 6400 + }, + { + "epoch": 1.295284355393645, + "eval_loss": 0.26894286274909973, + "eval_runtime": 0.7379, + "eval_samples_per_second": 6.776, + "eval_steps_per_second": 1.355, + "step": 6400 + }, + { + "epoch": 1.2954867435741753, + "grad_norm": 0.2641198933124542, + "learning_rate": 5.534076693668457e-05, + "loss": 0.1458, + "step": 6401 + }, + { + "epoch": 1.2956891317547055, + "grad_norm": 0.30567851662635803, + "learning_rate": 5.5312310140596535e-05, + "loss": 0.203, + "step": 6402 + }, + { + "epoch": 1.2958915199352359, + "grad_norm": 0.2631327509880066, + "learning_rate": 5.528385786554009e-05, + "loss": 0.1742, + "step": 6403 + }, + { + "epoch": 1.296093908115766, + "grad_norm": 0.2758510708808899, + "learning_rate": 5.5255410114393656e-05, + "loss": 0.1757, + "step": 6404 + }, + { + "epoch": 1.2962962962962963, + "grad_norm": 0.2766825258731842, + "learning_rate": 5.5226966890035325e-05, + "loss": 0.1884, + "step": 6405 + }, + { + "epoch": 1.2964986844768265, + "grad_norm": 0.2983852028846741, + "learning_rate": 5.5198528195342704e-05, + "loss": 0.192, + "step": 6406 + }, + { + "epoch": 1.2967010726573567, + "grad_norm": 0.2765571177005768, + "learning_rate": 5.517009403319289e-05, + "loss": 0.1783, + "step": 6407 + }, + { + "epoch": 1.296903460837887, + "grad_norm": 0.31526094675064087, + "learning_rate": 5.514166440646256e-05, + "loss": 0.1871, + "step": 6408 + }, + { + "epoch": 1.2971058490184173, + "grad_norm": 0.28560030460357666, + "learning_rate": 5.5113239318027945e-05, + "loss": 0.2182, + "step": 6409 + }, + { + "epoch": 1.2973082371989475, + "grad_norm": 0.2786945700645447, + "learning_rate": 5.5084818770764746e-05, + "loss": 0.1985, + "step": 6410 + }, + { + "epoch": 1.297510625379478, + "grad_norm": 0.3191640377044678, + "learning_rate": 5.505640276754832e-05, + "loss": 0.2244, + "step": 6411 + }, + { + "epoch": 1.2977130135600081, + "grad_norm": 0.26280149817466736, + "learning_rate": 5.502799131125349e-05, + "loss": 0.2237, + "step": 6412 + }, + { + "epoch": 1.2979154017405383, + "grad_norm": 0.2876483201980591, + "learning_rate": 5.49995844047546e-05, + "loss": 0.2006, + "step": 6413 + }, + { + "epoch": 1.2981177899210685, + "grad_norm": 0.26349586248397827, + "learning_rate": 5.497118205092558e-05, + "loss": 0.1689, + "step": 6414 + }, + { + "epoch": 1.298320178101599, + "grad_norm": 0.30534908175468445, + "learning_rate": 5.494278425263988e-05, + "loss": 0.1732, + "step": 6415 + }, + { + "epoch": 1.2985225662821291, + "grad_norm": 0.5254201292991638, + "learning_rate": 5.491439101277049e-05, + "loss": 0.2025, + "step": 6416 + }, + { + "epoch": 1.2987249544626593, + "grad_norm": 0.31549495458602905, + "learning_rate": 5.4886002334189946e-05, + "loss": 0.184, + "step": 6417 + }, + { + "epoch": 1.2989273426431898, + "grad_norm": 0.33482885360717773, + "learning_rate": 5.485761821977029e-05, + "loss": 0.2091, + "step": 6418 + }, + { + "epoch": 1.29912973082372, + "grad_norm": 0.3019998371601105, + "learning_rate": 5.482923867238317e-05, + "loss": 0.1964, + "step": 6419 + }, + { + "epoch": 1.2993321190042502, + "grad_norm": 0.2757321000099182, + "learning_rate": 5.4800863694899695e-05, + "loss": 0.1911, + "step": 6420 + }, + { + "epoch": 1.2995345071847804, + "grad_norm": 0.299513578414917, + "learning_rate": 5.477249329019057e-05, + "loss": 0.1983, + "step": 6421 + }, + { + "epoch": 1.2997368953653106, + "grad_norm": 0.2800210118293762, + "learning_rate": 5.474412746112601e-05, + "loss": 0.2082, + "step": 6422 + }, + { + "epoch": 1.299939283545841, + "grad_norm": 0.27341270446777344, + "learning_rate": 5.471576621057577e-05, + "loss": 0.1975, + "step": 6423 + }, + { + "epoch": 1.3001416717263712, + "grad_norm": 0.29459577798843384, + "learning_rate": 5.468740954140913e-05, + "loss": 0.2162, + "step": 6424 + }, + { + "epoch": 1.3003440599069014, + "grad_norm": 0.31252217292785645, + "learning_rate": 5.465905745649498e-05, + "loss": 0.2091, + "step": 6425 + }, + { + "epoch": 1.3005464480874318, + "grad_norm": 0.3006681501865387, + "learning_rate": 5.46307099587016e-05, + "loss": 0.2129, + "step": 6426 + }, + { + "epoch": 1.300748836267962, + "grad_norm": 0.26092275977134705, + "learning_rate": 5.460236705089693e-05, + "loss": 0.1709, + "step": 6427 + }, + { + "epoch": 1.3009512244484922, + "grad_norm": 0.2787107229232788, + "learning_rate": 5.457402873594841e-05, + "loss": 0.2133, + "step": 6428 + }, + { + "epoch": 1.3011536126290224, + "grad_norm": 0.3300659954547882, + "learning_rate": 5.454569501672302e-05, + "loss": 0.2338, + "step": 6429 + }, + { + "epoch": 1.3013560008095526, + "grad_norm": 0.29084473848342896, + "learning_rate": 5.4517365896087246e-05, + "loss": 0.2119, + "step": 6430 + }, + { + "epoch": 1.301558388990083, + "grad_norm": 0.3082306385040283, + "learning_rate": 5.4489041376907156e-05, + "loss": 0.1762, + "step": 6431 + }, + { + "epoch": 1.3017607771706132, + "grad_norm": 0.2574545443058014, + "learning_rate": 5.4460721462048324e-05, + "loss": 0.2003, + "step": 6432 + }, + { + "epoch": 1.3019631653511434, + "grad_norm": 0.31898951530456543, + "learning_rate": 5.443240615437586e-05, + "loss": 0.2014, + "step": 6433 + }, + { + "epoch": 1.3021655535316738, + "grad_norm": 0.25916787981987, + "learning_rate": 5.44040954567544e-05, + "loss": 0.2065, + "step": 6434 + }, + { + "epoch": 1.302367941712204, + "grad_norm": 0.25367629528045654, + "learning_rate": 5.437578937204813e-05, + "loss": 0.1668, + "step": 6435 + }, + { + "epoch": 1.3025703298927342, + "grad_norm": 0.27210503816604614, + "learning_rate": 5.4347487903120744e-05, + "loss": 0.1903, + "step": 6436 + }, + { + "epoch": 1.3027727180732644, + "grad_norm": 0.2796219289302826, + "learning_rate": 5.4319191052835525e-05, + "loss": 0.1919, + "step": 6437 + }, + { + "epoch": 1.3029751062537946, + "grad_norm": 0.35584887862205505, + "learning_rate": 5.429089882405523e-05, + "loss": 0.1834, + "step": 6438 + }, + { + "epoch": 1.303177494434325, + "grad_norm": 0.30212247371673584, + "learning_rate": 5.426261121964217e-05, + "loss": 0.2138, + "step": 6439 + }, + { + "epoch": 1.3033798826148553, + "grad_norm": 0.2893647849559784, + "learning_rate": 5.423432824245819e-05, + "loss": 0.21, + "step": 6440 + }, + { + "epoch": 1.3035822707953855, + "grad_norm": 0.2603617310523987, + "learning_rate": 5.4206049895364664e-05, + "loss": 0.2013, + "step": 6441 + }, + { + "epoch": 1.3037846589759159, + "grad_norm": 0.25256669521331787, + "learning_rate": 5.417777618122249e-05, + "loss": 0.1481, + "step": 6442 + }, + { + "epoch": 1.303987047156446, + "grad_norm": 0.2812972068786621, + "learning_rate": 5.414950710289213e-05, + "loss": 0.2016, + "step": 6443 + }, + { + "epoch": 1.3041894353369763, + "grad_norm": 0.2581676244735718, + "learning_rate": 5.412124266323353e-05, + "loss": 0.1927, + "step": 6444 + }, + { + "epoch": 1.3043918235175065, + "grad_norm": 0.3383719325065613, + "learning_rate": 5.40929828651062e-05, + "loss": 0.2128, + "step": 6445 + }, + { + "epoch": 1.304594211698037, + "grad_norm": 0.26960957050323486, + "learning_rate": 5.406472771136917e-05, + "loss": 0.1919, + "step": 6446 + }, + { + "epoch": 1.304796599878567, + "grad_norm": 0.4218273460865021, + "learning_rate": 5.403647720488099e-05, + "loss": 0.1771, + "step": 6447 + }, + { + "epoch": 1.3049989880590973, + "grad_norm": 0.33814752101898193, + "learning_rate": 5.4008231348499794e-05, + "loss": 0.1842, + "step": 6448 + }, + { + "epoch": 1.3052013762396277, + "grad_norm": 0.27987149357795715, + "learning_rate": 5.3979990145083124e-05, + "loss": 0.2052, + "step": 6449 + }, + { + "epoch": 1.305403764420158, + "grad_norm": 0.27401459217071533, + "learning_rate": 5.3951753597488176e-05, + "loss": 0.1909, + "step": 6450 + }, + { + "epoch": 1.305403764420158, + "eval_loss": 0.2693765461444855, + "eval_runtime": 0.7411, + "eval_samples_per_second": 6.747, + "eval_steps_per_second": 1.349, + "step": 6450 + }, + { + "epoch": 1.3056061526006881, + "grad_norm": 0.3010677695274353, + "learning_rate": 5.392352170857162e-05, + "loss": 0.1577, + "step": 6451 + }, + { + "epoch": 1.3058085407812183, + "grad_norm": 0.32166436314582825, + "learning_rate": 5.389529448118966e-05, + "loss": 0.2066, + "step": 6452 + }, + { + "epoch": 1.3060109289617485, + "grad_norm": 0.283600389957428, + "learning_rate": 5.386707191819803e-05, + "loss": 0.2034, + "step": 6453 + }, + { + "epoch": 1.306213317142279, + "grad_norm": 0.2762928605079651, + "learning_rate": 5.3838854022452e-05, + "loss": 0.1886, + "step": 6454 + }, + { + "epoch": 1.3064157053228092, + "grad_norm": 0.2601023018360138, + "learning_rate": 5.381064079680635e-05, + "loss": 0.1645, + "step": 6455 + }, + { + "epoch": 1.3066180935033394, + "grad_norm": 0.3133906424045563, + "learning_rate": 5.378243224411541e-05, + "loss": 0.2255, + "step": 6456 + }, + { + "epoch": 1.3068204816838698, + "grad_norm": 0.269016295671463, + "learning_rate": 5.375422836723303e-05, + "loss": 0.1651, + "step": 6457 + }, + { + "epoch": 1.3070228698644, + "grad_norm": 0.2578776478767395, + "learning_rate": 5.3726029169012556e-05, + "loss": 0.1606, + "step": 6458 + }, + { + "epoch": 1.3072252580449302, + "grad_norm": 0.3729574382305145, + "learning_rate": 5.369783465230691e-05, + "loss": 0.2231, + "step": 6459 + }, + { + "epoch": 1.3074276462254604, + "grad_norm": 0.27884671092033386, + "learning_rate": 5.366964481996852e-05, + "loss": 0.1803, + "step": 6460 + }, + { + "epoch": 1.3076300344059906, + "grad_norm": 0.3325755000114441, + "learning_rate": 5.3641459674849315e-05, + "loss": 0.2229, + "step": 6461 + }, + { + "epoch": 1.307832422586521, + "grad_norm": 0.27439218759536743, + "learning_rate": 5.3613279219800794e-05, + "loss": 0.1734, + "step": 6462 + }, + { + "epoch": 1.3080348107670512, + "grad_norm": 0.31271788477897644, + "learning_rate": 5.358510345767395e-05, + "loss": 0.2196, + "step": 6463 + }, + { + "epoch": 1.3082371989475814, + "grad_norm": 0.2967276871204376, + "learning_rate": 5.3556932391319304e-05, + "loss": 0.2027, + "step": 6464 + }, + { + "epoch": 1.3084395871281118, + "grad_norm": 0.24664251506328583, + "learning_rate": 5.3528766023586915e-05, + "loss": 0.1916, + "step": 6465 + }, + { + "epoch": 1.308641975308642, + "grad_norm": 0.269447386264801, + "learning_rate": 5.3500604357326376e-05, + "loss": 0.1685, + "step": 6466 + }, + { + "epoch": 1.3088443634891722, + "grad_norm": 0.23026353120803833, + "learning_rate": 5.347244739538677e-05, + "loss": 0.1641, + "step": 6467 + }, + { + "epoch": 1.3090467516697024, + "grad_norm": 0.27245429158210754, + "learning_rate": 5.3444295140616684e-05, + "loss": 0.1865, + "step": 6468 + }, + { + "epoch": 1.3092491398502326, + "grad_norm": 0.2808758616447449, + "learning_rate": 5.341614759586436e-05, + "loss": 0.1984, + "step": 6469 + }, + { + "epoch": 1.309451528030763, + "grad_norm": 0.2517721652984619, + "learning_rate": 5.338800476397746e-05, + "loss": 0.1535, + "step": 6470 + }, + { + "epoch": 1.3096539162112932, + "grad_norm": 0.28509992361068726, + "learning_rate": 5.3359866647803104e-05, + "loss": 0.1886, + "step": 6471 + }, + { + "epoch": 1.3098563043918237, + "grad_norm": 0.26791247725486755, + "learning_rate": 5.3331733250188054e-05, + "loss": 0.1953, + "step": 6472 + }, + { + "epoch": 1.3100586925723539, + "grad_norm": 0.26313990354537964, + "learning_rate": 5.330360457397854e-05, + "loss": 0.1874, + "step": 6473 + }, + { + "epoch": 1.310261080752884, + "grad_norm": 0.36424994468688965, + "learning_rate": 5.3275480622020346e-05, + "loss": 0.2314, + "step": 6474 + }, + { + "epoch": 1.3104634689334143, + "grad_norm": 0.30342644453048706, + "learning_rate": 5.324736139715875e-05, + "loss": 0.2049, + "step": 6475 + }, + { + "epoch": 1.3106658571139445, + "grad_norm": 0.281800240278244, + "learning_rate": 5.321924690223854e-05, + "loss": 0.1761, + "step": 6476 + }, + { + "epoch": 1.3108682452944749, + "grad_norm": 0.2727698087692261, + "learning_rate": 5.319113714010409e-05, + "loss": 0.1895, + "step": 6477 + }, + { + "epoch": 1.311070633475005, + "grad_norm": 0.3317430019378662, + "learning_rate": 5.31630321135992e-05, + "loss": 0.1835, + "step": 6478 + }, + { + "epoch": 1.3112730216555353, + "grad_norm": 0.2980647385120392, + "learning_rate": 5.313493182556728e-05, + "loss": 0.2068, + "step": 6479 + }, + { + "epoch": 1.3114754098360657, + "grad_norm": 0.3750055432319641, + "learning_rate": 5.3106836278851205e-05, + "loss": 0.1813, + "step": 6480 + }, + { + "epoch": 1.311677798016596, + "grad_norm": 0.2763236165046692, + "learning_rate": 5.307874547629339e-05, + "loss": 0.1592, + "step": 6481 + }, + { + "epoch": 1.311880186197126, + "grad_norm": 0.27184009552001953, + "learning_rate": 5.305065942073576e-05, + "loss": 0.1825, + "step": 6482 + }, + { + "epoch": 1.3120825743776563, + "grad_norm": 0.324699342250824, + "learning_rate": 5.3022578115019786e-05, + "loss": 0.1988, + "step": 6483 + }, + { + "epoch": 1.3122849625581865, + "grad_norm": 0.30634692311286926, + "learning_rate": 5.299450156198642e-05, + "loss": 0.1663, + "step": 6484 + }, + { + "epoch": 1.312487350738717, + "grad_norm": 0.290509968996048, + "learning_rate": 5.296642976447618e-05, + "loss": 0.1816, + "step": 6485 + }, + { + "epoch": 1.3126897389192471, + "grad_norm": 0.3158789277076721, + "learning_rate": 5.293836272532905e-05, + "loss": 0.2141, + "step": 6486 + }, + { + "epoch": 1.3128921270997773, + "grad_norm": 0.29174625873565674, + "learning_rate": 5.291030044738456e-05, + "loss": 0.187, + "step": 6487 + }, + { + "epoch": 1.3130945152803077, + "grad_norm": 0.2723025977611542, + "learning_rate": 5.2882242933481775e-05, + "loss": 0.2015, + "step": 6488 + }, + { + "epoch": 1.313296903460838, + "grad_norm": 0.5538578033447266, + "learning_rate": 5.285419018645925e-05, + "loss": 0.2135, + "step": 6489 + }, + { + "epoch": 1.3134992916413681, + "grad_norm": 0.4139079749584198, + "learning_rate": 5.282614220915505e-05, + "loss": 0.2084, + "step": 6490 + }, + { + "epoch": 1.3137016798218983, + "grad_norm": 0.3343629539012909, + "learning_rate": 5.279809900440681e-05, + "loss": 0.194, + "step": 6491 + }, + { + "epoch": 1.3139040680024285, + "grad_norm": 0.2938506007194519, + "learning_rate": 5.277006057505166e-05, + "loss": 0.246, + "step": 6492 + }, + { + "epoch": 1.314106456182959, + "grad_norm": 0.3289811313152313, + "learning_rate": 5.274202692392616e-05, + "loss": 0.2207, + "step": 6493 + }, + { + "epoch": 1.3143088443634892, + "grad_norm": 0.26164838671684265, + "learning_rate": 5.271399805386652e-05, + "loss": 0.1842, + "step": 6494 + }, + { + "epoch": 1.3145112325440194, + "grad_norm": 0.29249104857444763, + "learning_rate": 5.268597396770838e-05, + "loss": 0.1949, + "step": 6495 + }, + { + "epoch": 1.3147136207245498, + "grad_norm": 0.2775794267654419, + "learning_rate": 5.265795466828692e-05, + "loss": 0.1739, + "step": 6496 + }, + { + "epoch": 1.31491600890508, + "grad_norm": 0.382659375667572, + "learning_rate": 5.2629940158436874e-05, + "loss": 0.2117, + "step": 6497 + }, + { + "epoch": 1.3151183970856102, + "grad_norm": 0.28004685044288635, + "learning_rate": 5.260193044099242e-05, + "loss": 0.2075, + "step": 6498 + }, + { + "epoch": 1.3153207852661404, + "grad_norm": 0.319001168012619, + "learning_rate": 5.25739255187873e-05, + "loss": 0.1853, + "step": 6499 + }, + { + "epoch": 1.3155231734466706, + "grad_norm": 0.2945215404033661, + "learning_rate": 5.254592539465477e-05, + "loss": 0.1866, + "step": 6500 + }, + { + "epoch": 1.3155231734466706, + "eval_loss": 0.27097755670547485, + "eval_runtime": 0.7381, + "eval_samples_per_second": 6.774, + "eval_steps_per_second": 1.355, + "step": 6500 + }, + { + "epoch": 1.315725561627201, + "grad_norm": 0.23622101545333862, + "learning_rate": 5.251793007142758e-05, + "loss": 0.1413, + "step": 6501 + }, + { + "epoch": 1.3159279498077312, + "grad_norm": 0.29905086755752563, + "learning_rate": 5.248993955193799e-05, + "loss": 0.2117, + "step": 6502 + }, + { + "epoch": 1.3161303379882616, + "grad_norm": 0.2794634699821472, + "learning_rate": 5.246195383901782e-05, + "loss": 0.1778, + "step": 6503 + }, + { + "epoch": 1.3163327261687918, + "grad_norm": 0.2916606366634369, + "learning_rate": 5.243397293549832e-05, + "loss": 0.194, + "step": 6504 + }, + { + "epoch": 1.316535114349322, + "grad_norm": 0.3154829442501068, + "learning_rate": 5.240599684421036e-05, + "loss": 0.2022, + "step": 6505 + }, + { + "epoch": 1.3167375025298522, + "grad_norm": 0.2974871098995209, + "learning_rate": 5.2378025567984225e-05, + "loss": 0.2238, + "step": 6506 + }, + { + "epoch": 1.3169398907103824, + "grad_norm": 0.2797040045261383, + "learning_rate": 5.2350059109649784e-05, + "loss": 0.2043, + "step": 6507 + }, + { + "epoch": 1.3171422788909128, + "grad_norm": 0.26968511939048767, + "learning_rate": 5.232209747203636e-05, + "loss": 0.1652, + "step": 6508 + }, + { + "epoch": 1.317344667071443, + "grad_norm": 0.30984237790107727, + "learning_rate": 5.229414065797284e-05, + "loss": 0.2272, + "step": 6509 + }, + { + "epoch": 1.3175470552519732, + "grad_norm": 0.26939573884010315, + "learning_rate": 5.226618867028761e-05, + "loss": 0.1759, + "step": 6510 + }, + { + "epoch": 1.3177494434325037, + "grad_norm": 0.3126527667045593, + "learning_rate": 5.223824151180854e-05, + "loss": 0.2236, + "step": 6511 + }, + { + "epoch": 1.3179518316130339, + "grad_norm": 0.28654423356056213, + "learning_rate": 5.221029918536302e-05, + "loss": 0.1836, + "step": 6512 + }, + { + "epoch": 1.318154219793564, + "grad_norm": 0.2838856875896454, + "learning_rate": 5.218236169377799e-05, + "loss": 0.1835, + "step": 6513 + }, + { + "epoch": 1.3183566079740943, + "grad_norm": 0.28244468569755554, + "learning_rate": 5.215442903987986e-05, + "loss": 0.2124, + "step": 6514 + }, + { + "epoch": 1.3185589961546245, + "grad_norm": 0.2554950416088104, + "learning_rate": 5.212650122649456e-05, + "loss": 0.188, + "step": 6515 + }, + { + "epoch": 1.3187613843351549, + "grad_norm": 0.2603509724140167, + "learning_rate": 5.209857825644753e-05, + "loss": 0.197, + "step": 6516 + }, + { + "epoch": 1.318963772515685, + "grad_norm": 0.24836356937885284, + "learning_rate": 5.207066013256374e-05, + "loss": 0.1781, + "step": 6517 + }, + { + "epoch": 1.3191661606962153, + "grad_norm": 0.2643834948539734, + "learning_rate": 5.204274685766764e-05, + "loss": 0.181, + "step": 6518 + }, + { + "epoch": 1.3193685488767457, + "grad_norm": 0.2769608199596405, + "learning_rate": 5.201483843458319e-05, + "loss": 0.1884, + "step": 6519 + }, + { + "epoch": 1.319570937057276, + "grad_norm": 0.2706235349178314, + "learning_rate": 5.198693486613389e-05, + "loss": 0.1965, + "step": 6520 + }, + { + "epoch": 1.319773325237806, + "grad_norm": 0.29300135374069214, + "learning_rate": 5.1959036155142724e-05, + "loss": 0.1768, + "step": 6521 + }, + { + "epoch": 1.3199757134183363, + "grad_norm": 0.2877142131328583, + "learning_rate": 5.193114230443219e-05, + "loss": 0.1943, + "step": 6522 + }, + { + "epoch": 1.3201781015988665, + "grad_norm": 0.26191914081573486, + "learning_rate": 5.1903253316824305e-05, + "loss": 0.2015, + "step": 6523 + }, + { + "epoch": 1.320380489779397, + "grad_norm": 0.2909492254257202, + "learning_rate": 5.187536919514058e-05, + "loss": 0.1956, + "step": 6524 + }, + { + "epoch": 1.3205828779599271, + "grad_norm": 0.2586766183376312, + "learning_rate": 5.184748994220201e-05, + "loss": 0.1553, + "step": 6525 + }, + { + "epoch": 1.3207852661404573, + "grad_norm": 0.29264846444129944, + "learning_rate": 5.181961556082917e-05, + "loss": 0.1902, + "step": 6526 + }, + { + "epoch": 1.3209876543209877, + "grad_norm": 0.3760776221752167, + "learning_rate": 5.179174605384207e-05, + "loss": 0.2315, + "step": 6527 + }, + { + "epoch": 1.321190042501518, + "grad_norm": 0.2586089074611664, + "learning_rate": 5.176388142406026e-05, + "loss": 0.1886, + "step": 6528 + }, + { + "epoch": 1.3213924306820481, + "grad_norm": 0.2983001470565796, + "learning_rate": 5.17360216743028e-05, + "loss": 0.2337, + "step": 6529 + }, + { + "epoch": 1.3215948188625783, + "grad_norm": 0.30242887139320374, + "learning_rate": 5.1708166807388235e-05, + "loss": 0.2252, + "step": 6530 + }, + { + "epoch": 1.3217972070431085, + "grad_norm": 0.2580106556415558, + "learning_rate": 5.168031682613462e-05, + "loss": 0.1768, + "step": 6531 + }, + { + "epoch": 1.321999595223639, + "grad_norm": 0.31099486351013184, + "learning_rate": 5.165247173335954e-05, + "loss": 0.2352, + "step": 6532 + }, + { + "epoch": 1.3222019834041692, + "grad_norm": 0.25483667850494385, + "learning_rate": 5.162463153188009e-05, + "loss": 0.1635, + "step": 6533 + }, + { + "epoch": 1.3224043715846996, + "grad_norm": 0.27290982007980347, + "learning_rate": 5.159679622451279e-05, + "loss": 0.1768, + "step": 6534 + }, + { + "epoch": 1.3226067597652298, + "grad_norm": 0.3144781291484833, + "learning_rate": 5.1568965814073775e-05, + "loss": 0.213, + "step": 6535 + }, + { + "epoch": 1.32280914794576, + "grad_norm": 0.314395546913147, + "learning_rate": 5.154114030337862e-05, + "loss": 0.1714, + "step": 6536 + }, + { + "epoch": 1.3230115361262902, + "grad_norm": 0.27756208181381226, + "learning_rate": 5.1513319695242446e-05, + "loss": 0.1874, + "step": 6537 + }, + { + "epoch": 1.3232139243068204, + "grad_norm": 0.3120867609977722, + "learning_rate": 5.14855039924798e-05, + "loss": 0.1853, + "step": 6538 + }, + { + "epoch": 1.3234163124873508, + "grad_norm": 0.28202709555625916, + "learning_rate": 5.145769319790479e-05, + "loss": 0.2047, + "step": 6539 + }, + { + "epoch": 1.323618700667881, + "grad_norm": 0.2737361490726471, + "learning_rate": 5.1429887314331025e-05, + "loss": 0.2033, + "step": 6540 + }, + { + "epoch": 1.3238210888484112, + "grad_norm": 0.31434905529022217, + "learning_rate": 5.140208634457163e-05, + "loss": 0.1968, + "step": 6541 + }, + { + "epoch": 1.3240234770289416, + "grad_norm": 0.2981642484664917, + "learning_rate": 5.137429029143921e-05, + "loss": 0.1957, + "step": 6542 + }, + { + "epoch": 1.3242258652094718, + "grad_norm": 0.2877632677555084, + "learning_rate": 5.134649915774588e-05, + "loss": 0.1701, + "step": 6543 + }, + { + "epoch": 1.324428253390002, + "grad_norm": 0.3690420389175415, + "learning_rate": 5.131871294630324e-05, + "loss": 0.2493, + "step": 6544 + }, + { + "epoch": 1.3246306415705322, + "grad_norm": 0.2771603763103485, + "learning_rate": 5.1290931659922406e-05, + "loss": 0.1945, + "step": 6545 + }, + { + "epoch": 1.3248330297510624, + "grad_norm": 0.31182003021240234, + "learning_rate": 5.126315530141402e-05, + "loss": 0.1746, + "step": 6546 + }, + { + "epoch": 1.3250354179315929, + "grad_norm": 0.2257387489080429, + "learning_rate": 5.12353838735882e-05, + "loss": 0.1489, + "step": 6547 + }, + { + "epoch": 1.325237806112123, + "grad_norm": 0.3137997090816498, + "learning_rate": 5.1207617379254544e-05, + "loss": 0.2043, + "step": 6548 + }, + { + "epoch": 1.3254401942926533, + "grad_norm": 0.29156142473220825, + "learning_rate": 5.11798558212222e-05, + "loss": 0.1906, + "step": 6549 + }, + { + "epoch": 1.3256425824731837, + "grad_norm": 0.23661822080612183, + "learning_rate": 5.115209920229978e-05, + "loss": 0.1642, + "step": 6550 + }, + { + "epoch": 1.3256425824731837, + "eval_loss": 0.2720639705657959, + "eval_runtime": 0.7408, + "eval_samples_per_second": 6.749, + "eval_steps_per_second": 1.35, + "step": 6550 + }, + { + "epoch": 1.3258449706537139, + "grad_norm": 0.2968902587890625, + "learning_rate": 5.112434752529539e-05, + "loss": 0.2282, + "step": 6551 + }, + { + "epoch": 1.326047358834244, + "grad_norm": 0.26929181814193726, + "learning_rate": 5.109660079301668e-05, + "loss": 0.1882, + "step": 6552 + }, + { + "epoch": 1.3262497470147743, + "grad_norm": 0.27666157484054565, + "learning_rate": 5.1068859008270765e-05, + "loss": 0.1905, + "step": 6553 + }, + { + "epoch": 1.3264521351953045, + "grad_norm": 0.35504576563835144, + "learning_rate": 5.1041122173864275e-05, + "loss": 0.2033, + "step": 6554 + }, + { + "epoch": 1.326654523375835, + "grad_norm": 0.3042171895503998, + "learning_rate": 5.1013390292603325e-05, + "loss": 0.2009, + "step": 6555 + }, + { + "epoch": 1.326856911556365, + "grad_norm": 0.2512917220592499, + "learning_rate": 5.098566336729351e-05, + "loss": 0.2048, + "step": 6556 + }, + { + "epoch": 1.3270592997368953, + "grad_norm": 0.2652257978916168, + "learning_rate": 5.0957941400739996e-05, + "loss": 0.1826, + "step": 6557 + }, + { + "epoch": 1.3272616879174257, + "grad_norm": 0.2854307293891907, + "learning_rate": 5.0930224395747374e-05, + "loss": 0.1861, + "step": 6558 + }, + { + "epoch": 1.327464076097956, + "grad_norm": 0.32963502407073975, + "learning_rate": 5.0902512355119805e-05, + "loss": 0.2098, + "step": 6559 + }, + { + "epoch": 1.3276664642784861, + "grad_norm": 0.2660824656486511, + "learning_rate": 5.087480528166082e-05, + "loss": 0.1842, + "step": 6560 + }, + { + "epoch": 1.3278688524590163, + "grad_norm": 0.4016515016555786, + "learning_rate": 5.084710317817358e-05, + "loss": 0.2121, + "step": 6561 + }, + { + "epoch": 1.3280712406395465, + "grad_norm": 0.3001742959022522, + "learning_rate": 5.081940604746067e-05, + "loss": 0.1998, + "step": 6562 + }, + { + "epoch": 1.328273628820077, + "grad_norm": 0.2764403223991394, + "learning_rate": 5.079171389232418e-05, + "loss": 0.1657, + "step": 6563 + }, + { + "epoch": 1.3284760170006071, + "grad_norm": 0.31415703892707825, + "learning_rate": 5.0764026715565785e-05, + "loss": 0.2377, + "step": 6564 + }, + { + "epoch": 1.3286784051811376, + "grad_norm": 0.2876707911491394, + "learning_rate": 5.073634451998653e-05, + "loss": 0.184, + "step": 6565 + }, + { + "epoch": 1.3288807933616678, + "grad_norm": 0.2535141110420227, + "learning_rate": 5.0708667308387025e-05, + "loss": 0.1669, + "step": 6566 + }, + { + "epoch": 1.329083181542198, + "grad_norm": 0.2769143879413605, + "learning_rate": 5.0680995083567354e-05, + "loss": 0.1816, + "step": 6567 + }, + { + "epoch": 1.3292855697227282, + "grad_norm": 0.288330614566803, + "learning_rate": 5.0653327848327104e-05, + "loss": 0.1798, + "step": 6568 + }, + { + "epoch": 1.3294879579032584, + "grad_norm": 0.2596645653247833, + "learning_rate": 5.062566560546535e-05, + "loss": 0.1654, + "step": 6569 + }, + { + "epoch": 1.3296903460837888, + "grad_norm": 0.3217107951641083, + "learning_rate": 5.059800835778066e-05, + "loss": 0.1701, + "step": 6570 + }, + { + "epoch": 1.329892734264319, + "grad_norm": 0.2558998465538025, + "learning_rate": 5.0570356108071124e-05, + "loss": 0.1725, + "step": 6571 + }, + { + "epoch": 1.3300951224448492, + "grad_norm": 0.2911076843738556, + "learning_rate": 5.0542708859134305e-05, + "loss": 0.1956, + "step": 6572 + }, + { + "epoch": 1.3302975106253796, + "grad_norm": 0.3187974989414215, + "learning_rate": 5.051506661376725e-05, + "loss": 0.2149, + "step": 6573 + }, + { + "epoch": 1.3304998988059098, + "grad_norm": 0.3285270035266876, + "learning_rate": 5.0487429374766515e-05, + "loss": 0.1888, + "step": 6574 + }, + { + "epoch": 1.33070228698644, + "grad_norm": 0.2761330306529999, + "learning_rate": 5.045979714492814e-05, + "loss": 0.2034, + "step": 6575 + }, + { + "epoch": 1.3309046751669702, + "grad_norm": 0.2921268939971924, + "learning_rate": 5.043216992704767e-05, + "loss": 0.2066, + "step": 6576 + }, + { + "epoch": 1.3311070633475004, + "grad_norm": 0.3227090537548065, + "learning_rate": 5.040454772392015e-05, + "loss": 0.2061, + "step": 6577 + }, + { + "epoch": 1.3313094515280308, + "grad_norm": 0.2876438796520233, + "learning_rate": 5.037693053834008e-05, + "loss": 0.2014, + "step": 6578 + }, + { + "epoch": 1.331511839708561, + "grad_norm": 0.3059580624103546, + "learning_rate": 5.03493183731015e-05, + "loss": 0.2583, + "step": 6579 + }, + { + "epoch": 1.3317142278890912, + "grad_norm": 0.3144664466381073, + "learning_rate": 5.032171123099789e-05, + "loss": 0.1967, + "step": 6580 + }, + { + "epoch": 1.3319166160696216, + "grad_norm": 0.29009315371513367, + "learning_rate": 5.029410911482233e-05, + "loss": 0.1995, + "step": 6581 + }, + { + "epoch": 1.3321190042501518, + "grad_norm": 0.3386211395263672, + "learning_rate": 5.0266512027367204e-05, + "loss": 0.1961, + "step": 6582 + }, + { + "epoch": 1.332321392430682, + "grad_norm": 0.27624696493148804, + "learning_rate": 5.0238919971424536e-05, + "loss": 0.1891, + "step": 6583 + }, + { + "epoch": 1.3325237806112122, + "grad_norm": 0.2946856617927551, + "learning_rate": 5.0211332949785815e-05, + "loss": 0.2155, + "step": 6584 + }, + { + "epoch": 1.3327261687917424, + "grad_norm": 0.2675507664680481, + "learning_rate": 5.018375096524201e-05, + "loss": 0.178, + "step": 6585 + }, + { + "epoch": 1.3329285569722729, + "grad_norm": 0.30136606097221375, + "learning_rate": 5.0156174020583546e-05, + "loss": 0.1926, + "step": 6586 + }, + { + "epoch": 1.333130945152803, + "grad_norm": 0.23659691214561462, + "learning_rate": 5.01286021186004e-05, + "loss": 0.1557, + "step": 6587 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.2846830189228058, + "learning_rate": 5.0101035262082005e-05, + "loss": 0.173, + "step": 6588 + }, + { + "epoch": 1.3335357215138637, + "grad_norm": 0.25915321707725525, + "learning_rate": 5.007347345381727e-05, + "loss": 0.1886, + "step": 6589 + }, + { + "epoch": 1.3337381096943939, + "grad_norm": 0.30053645372390747, + "learning_rate": 5.004591669659462e-05, + "loss": 0.2086, + "step": 6590 + }, + { + "epoch": 1.333940497874924, + "grad_norm": 0.30931761860847473, + "learning_rate": 5.001836499320195e-05, + "loss": 0.206, + "step": 6591 + }, + { + "epoch": 1.3341428860554543, + "grad_norm": 0.2771763801574707, + "learning_rate": 4.999081834642666e-05, + "loss": 0.2223, + "step": 6592 + }, + { + "epoch": 1.3343452742359845, + "grad_norm": 0.27251917123794556, + "learning_rate": 4.996327675905563e-05, + "loss": 0.1661, + "step": 6593 + }, + { + "epoch": 1.334547662416515, + "grad_norm": 0.28505948185920715, + "learning_rate": 4.9935740233875236e-05, + "loss": 0.2201, + "step": 6594 + }, + { + "epoch": 1.334750050597045, + "grad_norm": 0.34662339091300964, + "learning_rate": 4.9908208773671315e-05, + "loss": 0.2281, + "step": 6595 + }, + { + "epoch": 1.3349524387775755, + "grad_norm": 0.32622262835502625, + "learning_rate": 4.988068238122924e-05, + "loss": 0.2139, + "step": 6596 + }, + { + "epoch": 1.3351548269581057, + "grad_norm": 0.2837918996810913, + "learning_rate": 4.985316105933381e-05, + "loss": 0.1957, + "step": 6597 + }, + { + "epoch": 1.335357215138636, + "grad_norm": 0.33972227573394775, + "learning_rate": 4.9825644810769366e-05, + "loss": 0.2278, + "step": 6598 + }, + { + "epoch": 1.3355596033191661, + "grad_norm": 0.32084619998931885, + "learning_rate": 4.97981336383197e-05, + "loss": 0.2126, + "step": 6599 + }, + { + "epoch": 1.3357619914996963, + "grad_norm": 0.27521273493766785, + "learning_rate": 4.977062754476811e-05, + "loss": 0.1871, + "step": 6600 + }, + { + "epoch": 1.3357619914996963, + "eval_loss": 0.2676220238208771, + "eval_runtime": 0.7409, + "eval_samples_per_second": 6.749, + "eval_steps_per_second": 1.35, + "step": 6600 + }, + { + "epoch": 1.3359643796802267, + "grad_norm": 0.2413671761751175, + "learning_rate": 4.974312653289739e-05, + "loss": 0.172, + "step": 6601 + }, + { + "epoch": 1.336166767860757, + "grad_norm": 0.3191070258617401, + "learning_rate": 4.971563060548977e-05, + "loss": 0.2377, + "step": 6602 + }, + { + "epoch": 1.3363691560412871, + "grad_norm": 0.32372811436653137, + "learning_rate": 4.968813976532707e-05, + "loss": 0.1926, + "step": 6603 + }, + { + "epoch": 1.3365715442218176, + "grad_norm": 0.3132017254829407, + "learning_rate": 4.966065401519042e-05, + "loss": 0.1901, + "step": 6604 + }, + { + "epoch": 1.3367739324023478, + "grad_norm": 0.28450313210487366, + "learning_rate": 4.9633173357860596e-05, + "loss": 0.1886, + "step": 6605 + }, + { + "epoch": 1.336976320582878, + "grad_norm": 0.27775466442108154, + "learning_rate": 4.96056977961178e-05, + "loss": 0.1568, + "step": 6606 + }, + { + "epoch": 1.3371787087634082, + "grad_norm": 0.25570055842399597, + "learning_rate": 4.957822733274172e-05, + "loss": 0.1501, + "step": 6607 + }, + { + "epoch": 1.3373810969439384, + "grad_norm": 0.26684972643852234, + "learning_rate": 4.955076197051154e-05, + "loss": 0.2062, + "step": 6608 + }, + { + "epoch": 1.3375834851244688, + "grad_norm": 0.3012605309486389, + "learning_rate": 4.952330171220589e-05, + "loss": 0.2076, + "step": 6609 + }, + { + "epoch": 1.337785873304999, + "grad_norm": 0.36997511982917786, + "learning_rate": 4.949584656060293e-05, + "loss": 0.1616, + "step": 6610 + }, + { + "epoch": 1.3379882614855292, + "grad_norm": 0.27101650834083557, + "learning_rate": 4.946839651848029e-05, + "loss": 0.1829, + "step": 6611 + }, + { + "epoch": 1.3381906496660596, + "grad_norm": 0.2873406708240509, + "learning_rate": 4.9440951588615056e-05, + "loss": 0.2096, + "step": 6612 + }, + { + "epoch": 1.3383930378465898, + "grad_norm": 0.2543468177318573, + "learning_rate": 4.9413511773783836e-05, + "loss": 0.1665, + "step": 6613 + }, + { + "epoch": 1.33859542602712, + "grad_norm": 0.2655819058418274, + "learning_rate": 4.9386077076762695e-05, + "loss": 0.1827, + "step": 6614 + }, + { + "epoch": 1.3387978142076502, + "grad_norm": 0.2574715316295624, + "learning_rate": 4.935864750032719e-05, + "loss": 0.165, + "step": 6615 + }, + { + "epoch": 1.3390002023881804, + "grad_norm": 0.3012125492095947, + "learning_rate": 4.933122304725236e-05, + "loss": 0.2141, + "step": 6616 + }, + { + "epoch": 1.3392025905687108, + "grad_norm": 0.31119707226753235, + "learning_rate": 4.930380372031273e-05, + "loss": 0.2039, + "step": 6617 + }, + { + "epoch": 1.339404978749241, + "grad_norm": 0.2763644754886627, + "learning_rate": 4.9276389522282275e-05, + "loss": 0.1877, + "step": 6618 + }, + { + "epoch": 1.3396073669297712, + "grad_norm": 0.3010362982749939, + "learning_rate": 4.924898045593449e-05, + "loss": 0.1879, + "step": 6619 + }, + { + "epoch": 1.3398097551103016, + "grad_norm": 0.26402631402015686, + "learning_rate": 4.922157652404235e-05, + "loss": 0.1837, + "step": 6620 + }, + { + "epoch": 1.3400121432908318, + "grad_norm": 0.24701833724975586, + "learning_rate": 4.9194177729378236e-05, + "loss": 0.1591, + "step": 6621 + }, + { + "epoch": 1.340214531471362, + "grad_norm": 0.356270432472229, + "learning_rate": 4.916678407471417e-05, + "loss": 0.2013, + "step": 6622 + }, + { + "epoch": 1.3404169196518922, + "grad_norm": 0.36151617765426636, + "learning_rate": 4.913939556282149e-05, + "loss": 0.217, + "step": 6623 + }, + { + "epoch": 1.3406193078324227, + "grad_norm": 0.31240034103393555, + "learning_rate": 4.91120121964711e-05, + "loss": 0.1944, + "step": 6624 + }, + { + "epoch": 1.3408216960129529, + "grad_norm": 0.2937697172164917, + "learning_rate": 4.9084633978433356e-05, + "loss": 0.1846, + "step": 6625 + }, + { + "epoch": 1.341024084193483, + "grad_norm": 0.26070502400398254, + "learning_rate": 4.9057260911478134e-05, + "loss": 0.1798, + "step": 6626 + }, + { + "epoch": 1.3412264723740135, + "grad_norm": 0.28170114755630493, + "learning_rate": 4.902989299837467e-05, + "loss": 0.1827, + "step": 6627 + }, + { + "epoch": 1.3414288605545437, + "grad_norm": 0.24363872408866882, + "learning_rate": 4.900253024189182e-05, + "loss": 0.1703, + "step": 6628 + }, + { + "epoch": 1.341631248735074, + "grad_norm": 0.2929212749004364, + "learning_rate": 4.897517264479785e-05, + "loss": 0.2417, + "step": 6629 + }, + { + "epoch": 1.341833636915604, + "grad_norm": 0.2667793035507202, + "learning_rate": 4.894782020986052e-05, + "loss": 0.1864, + "step": 6630 + }, + { + "epoch": 1.3420360250961343, + "grad_norm": 0.2632579207420349, + "learning_rate": 4.892047293984704e-05, + "loss": 0.1878, + "step": 6631 + }, + { + "epoch": 1.3422384132766647, + "grad_norm": 0.28002333641052246, + "learning_rate": 4.8893130837524145e-05, + "loss": 0.182, + "step": 6632 + }, + { + "epoch": 1.342440801457195, + "grad_norm": 0.28146490454673767, + "learning_rate": 4.886579390565802e-05, + "loss": 0.1922, + "step": 6633 + }, + { + "epoch": 1.342643189637725, + "grad_norm": 0.2770502269268036, + "learning_rate": 4.883846214701431e-05, + "loss": 0.1968, + "step": 6634 + }, + { + "epoch": 1.3428455778182555, + "grad_norm": 0.3004387319087982, + "learning_rate": 4.881113556435818e-05, + "loss": 0.1821, + "step": 6635 + }, + { + "epoch": 1.3430479659987857, + "grad_norm": 0.30087587237358093, + "learning_rate": 4.878381416045422e-05, + "loss": 0.2104, + "step": 6636 + }, + { + "epoch": 1.343250354179316, + "grad_norm": 0.2681400775909424, + "learning_rate": 4.8756497938066544e-05, + "loss": 0.1663, + "step": 6637 + }, + { + "epoch": 1.3434527423598461, + "grad_norm": 0.2617656886577606, + "learning_rate": 4.8729186899958726e-05, + "loss": 0.2128, + "step": 6638 + }, + { + "epoch": 1.3436551305403763, + "grad_norm": 0.27163711190223694, + "learning_rate": 4.8701881048893794e-05, + "loss": 0.2162, + "step": 6639 + }, + { + "epoch": 1.3438575187209068, + "grad_norm": 0.29744192957878113, + "learning_rate": 4.867458038763426e-05, + "loss": 0.223, + "step": 6640 + }, + { + "epoch": 1.344059906901437, + "grad_norm": 0.22820989787578583, + "learning_rate": 4.864728491894215e-05, + "loss": 0.1541, + "step": 6641 + }, + { + "epoch": 1.3442622950819672, + "grad_norm": 0.26483702659606934, + "learning_rate": 4.86199946455789e-05, + "loss": 0.1691, + "step": 6642 + }, + { + "epoch": 1.3444646832624976, + "grad_norm": 0.24761466681957245, + "learning_rate": 4.859270957030547e-05, + "loss": 0.1563, + "step": 6643 + }, + { + "epoch": 1.3446670714430278, + "grad_norm": 0.286159873008728, + "learning_rate": 4.856542969588228e-05, + "loss": 0.1792, + "step": 6644 + }, + { + "epoch": 1.344869459623558, + "grad_norm": 0.29326891899108887, + "learning_rate": 4.8538155025069206e-05, + "loss": 0.2011, + "step": 6645 + }, + { + "epoch": 1.3450718478040882, + "grad_norm": 0.278120219707489, + "learning_rate": 4.851088556062563e-05, + "loss": 0.1963, + "step": 6646 + }, + { + "epoch": 1.3452742359846184, + "grad_norm": 0.2773338854312897, + "learning_rate": 4.848362130531039e-05, + "loss": 0.1859, + "step": 6647 + }, + { + "epoch": 1.3454766241651488, + "grad_norm": 0.2770738899707794, + "learning_rate": 4.845636226188183e-05, + "loss": 0.2035, + "step": 6648 + }, + { + "epoch": 1.345679012345679, + "grad_norm": 0.27181872725486755, + "learning_rate": 4.8429108433097645e-05, + "loss": 0.2015, + "step": 6649 + }, + { + "epoch": 1.3458814005262092, + "grad_norm": 0.3191307783126831, + "learning_rate": 4.840185982171514e-05, + "loss": 0.1757, + "step": 6650 + }, + { + "epoch": 1.3458814005262092, + "eval_loss": 0.26896873116493225, + "eval_runtime": 0.7368, + "eval_samples_per_second": 6.786, + "eval_steps_per_second": 1.357, + "step": 6650 + }, + { + "epoch": 1.3460837887067396, + "grad_norm": 0.2746305465698242, + "learning_rate": 4.837461643049106e-05, + "loss": 0.1954, + "step": 6651 + }, + { + "epoch": 1.3462861768872698, + "grad_norm": 0.25542014837265015, + "learning_rate": 4.8347378262181583e-05, + "loss": 0.1673, + "step": 6652 + }, + { + "epoch": 1.3464885650678, + "grad_norm": 0.303438663482666, + "learning_rate": 4.832014531954239e-05, + "loss": 0.1968, + "step": 6653 + }, + { + "epoch": 1.3466909532483302, + "grad_norm": 0.29511943459510803, + "learning_rate": 4.829291760532861e-05, + "loss": 0.2027, + "step": 6654 + }, + { + "epoch": 1.3468933414288606, + "grad_norm": 0.29404184222221375, + "learning_rate": 4.826569512229488e-05, + "loss": 0.1458, + "step": 6655 + }, + { + "epoch": 1.3470957296093908, + "grad_norm": 0.2823881506919861, + "learning_rate": 4.823847787319529e-05, + "loss": 0.207, + "step": 6656 + }, + { + "epoch": 1.347298117789921, + "grad_norm": 0.3050212860107422, + "learning_rate": 4.821126586078336e-05, + "loss": 0.2397, + "step": 6657 + }, + { + "epoch": 1.3475005059704515, + "grad_norm": 0.3161782920360565, + "learning_rate": 4.818405908781215e-05, + "loss": 0.2163, + "step": 6658 + }, + { + "epoch": 1.3477028941509817, + "grad_norm": 0.2794608175754547, + "learning_rate": 4.8156857557034144e-05, + "loss": 0.1952, + "step": 6659 + }, + { + "epoch": 1.3479052823315119, + "grad_norm": 0.2686176300048828, + "learning_rate": 4.8129661271201296e-05, + "loss": 0.1546, + "step": 6660 + }, + { + "epoch": 1.348107670512042, + "grad_norm": 0.280245304107666, + "learning_rate": 4.810247023306505e-05, + "loss": 0.1811, + "step": 6661 + }, + { + "epoch": 1.3483100586925723, + "grad_norm": 0.27028578519821167, + "learning_rate": 4.807528444537632e-05, + "loss": 0.1729, + "step": 6662 + }, + { + "epoch": 1.3485124468731027, + "grad_norm": 0.3122173845767975, + "learning_rate": 4.8048103910885475e-05, + "loss": 0.2259, + "step": 6663 + }, + { + "epoch": 1.3487148350536329, + "grad_norm": 0.3586917519569397, + "learning_rate": 4.8020928632342346e-05, + "loss": 0.1921, + "step": 6664 + }, + { + "epoch": 1.348917223234163, + "grad_norm": 0.29752397537231445, + "learning_rate": 4.799375861249624e-05, + "loss": 0.1728, + "step": 6665 + }, + { + "epoch": 1.3491196114146935, + "grad_norm": 0.2642748951911926, + "learning_rate": 4.796659385409595e-05, + "loss": 0.1949, + "step": 6666 + }, + { + "epoch": 1.3493219995952237, + "grad_norm": 0.2839334309101105, + "learning_rate": 4.7939434359889714e-05, + "loss": 0.1828, + "step": 6667 + }, + { + "epoch": 1.349524387775754, + "grad_norm": 0.32244181632995605, + "learning_rate": 4.7912280132625245e-05, + "loss": 0.2258, + "step": 6668 + }, + { + "epoch": 1.349726775956284, + "grad_norm": 0.2739431858062744, + "learning_rate": 4.788513117504971e-05, + "loss": 0.1394, + "step": 6669 + }, + { + "epoch": 1.3499291641368143, + "grad_norm": 0.31246238946914673, + "learning_rate": 4.785798748990978e-05, + "loss": 0.2063, + "step": 6670 + }, + { + "epoch": 1.3501315523173447, + "grad_norm": 0.32760030031204224, + "learning_rate": 4.783084907995156e-05, + "loss": 0.1873, + "step": 6671 + }, + { + "epoch": 1.350333940497875, + "grad_norm": 0.2789546549320221, + "learning_rate": 4.7803715947920614e-05, + "loss": 0.1671, + "step": 6672 + }, + { + "epoch": 1.3505363286784051, + "grad_norm": 0.306374192237854, + "learning_rate": 4.7776588096562e-05, + "loss": 0.2078, + "step": 6673 + }, + { + "epoch": 1.3507387168589355, + "grad_norm": 0.3102760910987854, + "learning_rate": 4.774946552862023e-05, + "loss": 0.1982, + "step": 6674 + }, + { + "epoch": 1.3509411050394657, + "grad_norm": 0.2927134037017822, + "learning_rate": 4.7722348246839285e-05, + "loss": 0.2124, + "step": 6675 + }, + { + "epoch": 1.351143493219996, + "grad_norm": 0.3680955767631531, + "learning_rate": 4.769523625396259e-05, + "loss": 0.1934, + "step": 6676 + }, + { + "epoch": 1.3513458814005261, + "grad_norm": 0.3348569869995117, + "learning_rate": 4.7668129552733076e-05, + "loss": 0.2212, + "step": 6677 + }, + { + "epoch": 1.3515482695810563, + "grad_norm": 0.2908865213394165, + "learning_rate": 4.7641028145893094e-05, + "loss": 0.2018, + "step": 6678 + }, + { + "epoch": 1.3517506577615868, + "grad_norm": 0.27470603585243225, + "learning_rate": 4.761393203618449e-05, + "loss": 0.2029, + "step": 6679 + }, + { + "epoch": 1.351953045942117, + "grad_norm": 0.33673080801963806, + "learning_rate": 4.7586841226348564e-05, + "loss": 0.2223, + "step": 6680 + }, + { + "epoch": 1.3521554341226472, + "grad_norm": 0.23110777139663696, + "learning_rate": 4.7559755719126075e-05, + "loss": 0.1587, + "step": 6681 + }, + { + "epoch": 1.3523578223031776, + "grad_norm": 0.3230266571044922, + "learning_rate": 4.7532675517257256e-05, + "loss": 0.2, + "step": 6682 + }, + { + "epoch": 1.3525602104837078, + "grad_norm": 0.3509168326854706, + "learning_rate": 4.7505600623481784e-05, + "loss": 0.2157, + "step": 6683 + }, + { + "epoch": 1.352762598664238, + "grad_norm": 0.3701566755771637, + "learning_rate": 4.747853104053883e-05, + "loss": 0.1997, + "step": 6684 + }, + { + "epoch": 1.3529649868447682, + "grad_norm": 0.2779744267463684, + "learning_rate": 4.745146677116701e-05, + "loss": 0.1959, + "step": 6685 + }, + { + "epoch": 1.3531673750252986, + "grad_norm": 0.29480859637260437, + "learning_rate": 4.7424407818104385e-05, + "loss": 0.2142, + "step": 6686 + }, + { + "epoch": 1.3533697632058288, + "grad_norm": 0.31336894631385803, + "learning_rate": 4.739735418408852e-05, + "loss": 0.2362, + "step": 6687 + }, + { + "epoch": 1.353572151386359, + "grad_norm": 0.27973970770835876, + "learning_rate": 4.73703058718564e-05, + "loss": 0.2053, + "step": 6688 + }, + { + "epoch": 1.3537745395668894, + "grad_norm": 0.25769269466400146, + "learning_rate": 4.734326288414449e-05, + "loss": 0.1937, + "step": 6689 + }, + { + "epoch": 1.3539769277474196, + "grad_norm": 0.252174973487854, + "learning_rate": 4.7316225223688724e-05, + "loss": 0.1599, + "step": 6690 + }, + { + "epoch": 1.3541793159279498, + "grad_norm": 0.26635318994522095, + "learning_rate": 4.7289192893224486e-05, + "loss": 0.1944, + "step": 6691 + }, + { + "epoch": 1.35438170410848, + "grad_norm": 0.43550267815589905, + "learning_rate": 4.726216589548667e-05, + "loss": 0.2181, + "step": 6692 + }, + { + "epoch": 1.3545840922890102, + "grad_norm": 0.25361430644989014, + "learning_rate": 4.723514423320948e-05, + "loss": 0.1711, + "step": 6693 + }, + { + "epoch": 1.3547864804695406, + "grad_norm": 0.31329795718193054, + "learning_rate": 4.720812790912675e-05, + "loss": 0.203, + "step": 6694 + }, + { + "epoch": 1.3549888686500708, + "grad_norm": 0.27394962310791016, + "learning_rate": 4.71811169259717e-05, + "loss": 0.1971, + "step": 6695 + }, + { + "epoch": 1.355191256830601, + "grad_norm": 0.3030480742454529, + "learning_rate": 4.715411128647702e-05, + "loss": 0.2048, + "step": 6696 + }, + { + "epoch": 1.3553936450111315, + "grad_norm": 0.30011892318725586, + "learning_rate": 4.7127110993374855e-05, + "loss": 0.1889, + "step": 6697 + }, + { + "epoch": 1.3555960331916617, + "grad_norm": 0.292767733335495, + "learning_rate": 4.7100116049396804e-05, + "loss": 0.1699, + "step": 6698 + }, + { + "epoch": 1.3557984213721919, + "grad_norm": 0.27436232566833496, + "learning_rate": 4.707312645727394e-05, + "loss": 0.1829, + "step": 6699 + }, + { + "epoch": 1.356000809552722, + "grad_norm": 0.2540614902973175, + "learning_rate": 4.70461422197368e-05, + "loss": 0.2047, + "step": 6700 + }, + { + "epoch": 1.356000809552722, + "eval_loss": 0.26921871304512024, + "eval_runtime": 0.7381, + "eval_samples_per_second": 6.774, + "eval_steps_per_second": 1.355, + "step": 6700 + }, + { + "epoch": 1.3562031977332523, + "grad_norm": 0.28804129362106323, + "learning_rate": 4.701916333951534e-05, + "loss": 0.1905, + "step": 6701 + }, + { + "epoch": 1.3564055859137827, + "grad_norm": 0.2757605314254761, + "learning_rate": 4.6992189819339006e-05, + "loss": 0.1719, + "step": 6702 + }, + { + "epoch": 1.3566079740943129, + "grad_norm": 0.25101038813591003, + "learning_rate": 4.696522166193671e-05, + "loss": 0.1795, + "step": 6703 + }, + { + "epoch": 1.356810362274843, + "grad_norm": 0.2832792103290558, + "learning_rate": 4.6938258870036786e-05, + "loss": 0.1698, + "step": 6704 + }, + { + "epoch": 1.3570127504553735, + "grad_norm": 0.2435804158449173, + "learning_rate": 4.691130144636707e-05, + "loss": 0.181, + "step": 6705 + }, + { + "epoch": 1.3572151386359037, + "grad_norm": 0.28305795788764954, + "learning_rate": 4.6884349393654823e-05, + "loss": 0.2256, + "step": 6706 + }, + { + "epoch": 1.357417526816434, + "grad_norm": 0.28656500577926636, + "learning_rate": 4.6857402714626765e-05, + "loss": 0.1737, + "step": 6707 + }, + { + "epoch": 1.357619914996964, + "grad_norm": 0.2786801755428314, + "learning_rate": 4.683046141200909e-05, + "loss": 0.2059, + "step": 6708 + }, + { + "epoch": 1.3578223031774943, + "grad_norm": 0.2742007374763489, + "learning_rate": 4.680352548852741e-05, + "loss": 0.2013, + "step": 6709 + }, + { + "epoch": 1.3580246913580247, + "grad_norm": 0.2749464511871338, + "learning_rate": 4.677659494690685e-05, + "loss": 0.1908, + "step": 6710 + }, + { + "epoch": 1.358227079538555, + "grad_norm": 0.2884829342365265, + "learning_rate": 4.6749669789871944e-05, + "loss": 0.1536, + "step": 6711 + }, + { + "epoch": 1.3584294677190851, + "grad_norm": 0.303640216588974, + "learning_rate": 4.672275002014669e-05, + "loss": 0.2158, + "step": 6712 + }, + { + "epoch": 1.3586318558996155, + "grad_norm": 0.29193374514579773, + "learning_rate": 4.6695835640454564e-05, + "loss": 0.1926, + "step": 6713 + }, + { + "epoch": 1.3588342440801457, + "grad_norm": 0.27257031202316284, + "learning_rate": 4.666892665351847e-05, + "loss": 0.1714, + "step": 6714 + }, + { + "epoch": 1.359036632260676, + "grad_norm": 0.3507302701473236, + "learning_rate": 4.6642023062060825e-05, + "loss": 0.2356, + "step": 6715 + }, + { + "epoch": 1.3592390204412061, + "grad_norm": 0.2812184989452362, + "learning_rate": 4.6615124868803326e-05, + "loss": 0.2025, + "step": 6716 + }, + { + "epoch": 1.3594414086217366, + "grad_norm": 0.36313724517822266, + "learning_rate": 4.658823207646737e-05, + "loss": 0.2103, + "step": 6717 + }, + { + "epoch": 1.3596437968022668, + "grad_norm": 0.2924157977104187, + "learning_rate": 4.6561344687773655e-05, + "loss": 0.2042, + "step": 6718 + }, + { + "epoch": 1.359846184982797, + "grad_norm": 0.293690949678421, + "learning_rate": 4.653446270544236e-05, + "loss": 0.1875, + "step": 6719 + }, + { + "epoch": 1.3600485731633274, + "grad_norm": 0.3482380509376526, + "learning_rate": 4.6507586132193115e-05, + "loss": 0.2127, + "step": 6720 + }, + { + "epoch": 1.3602509613438576, + "grad_norm": 0.3009283244609833, + "learning_rate": 4.648071497074502e-05, + "loss": 0.2143, + "step": 6721 + }, + { + "epoch": 1.3604533495243878, + "grad_norm": 0.31680983304977417, + "learning_rate": 4.6453849223816604e-05, + "loss": 0.2419, + "step": 6722 + }, + { + "epoch": 1.360655737704918, + "grad_norm": 0.3034413456916809, + "learning_rate": 4.642698889412588e-05, + "loss": 0.2122, + "step": 6723 + }, + { + "epoch": 1.3608581258854482, + "grad_norm": 0.297990083694458, + "learning_rate": 4.6400133984390283e-05, + "loss": 0.1998, + "step": 6724 + }, + { + "epoch": 1.3610605140659786, + "grad_norm": 0.2984466552734375, + "learning_rate": 4.637328449732671e-05, + "loss": 0.2176, + "step": 6725 + }, + { + "epoch": 1.3612629022465088, + "grad_norm": 0.31007546186447144, + "learning_rate": 4.6346440435651505e-05, + "loss": 0.2072, + "step": 6726 + }, + { + "epoch": 1.361465290427039, + "grad_norm": 0.313833087682724, + "learning_rate": 4.6319601802080494e-05, + "loss": 0.1845, + "step": 6727 + }, + { + "epoch": 1.3616676786075694, + "grad_norm": 0.293378084897995, + "learning_rate": 4.629276859932889e-05, + "loss": 0.2082, + "step": 6728 + }, + { + "epoch": 1.3618700667880996, + "grad_norm": 0.302562415599823, + "learning_rate": 4.6265940830111434e-05, + "loss": 0.1976, + "step": 6729 + }, + { + "epoch": 1.3620724549686298, + "grad_norm": 0.31816911697387695, + "learning_rate": 4.6239118497142256e-05, + "loss": 0.2028, + "step": 6730 + }, + { + "epoch": 1.36227484314916, + "grad_norm": 0.305603951215744, + "learning_rate": 4.6212301603134954e-05, + "loss": 0.193, + "step": 6731 + }, + { + "epoch": 1.3624772313296902, + "grad_norm": 0.3000797927379608, + "learning_rate": 4.618549015080259e-05, + "loss": 0.194, + "step": 6732 + }, + { + "epoch": 1.3626796195102207, + "grad_norm": 0.32458996772766113, + "learning_rate": 4.615868414285767e-05, + "loss": 0.2204, + "step": 6733 + }, + { + "epoch": 1.3628820076907509, + "grad_norm": 0.27205538749694824, + "learning_rate": 4.6131883582012125e-05, + "loss": 0.176, + "step": 6734 + }, + { + "epoch": 1.363084395871281, + "grad_norm": 0.2538856863975525, + "learning_rate": 4.6105088470977384e-05, + "loss": 0.1954, + "step": 6735 + }, + { + "epoch": 1.3632867840518115, + "grad_norm": 0.3089398741722107, + "learning_rate": 4.607829881246427e-05, + "loss": 0.1953, + "step": 6736 + }, + { + "epoch": 1.3634891722323417, + "grad_norm": 0.31733807921409607, + "learning_rate": 4.6051514609183124e-05, + "loss": 0.2068, + "step": 6737 + }, + { + "epoch": 1.3636915604128719, + "grad_norm": 0.3041749894618988, + "learning_rate": 4.602473586384361e-05, + "loss": 0.2036, + "step": 6738 + }, + { + "epoch": 1.363893948593402, + "grad_norm": 0.34960976243019104, + "learning_rate": 4.599796257915499e-05, + "loss": 0.2228, + "step": 6739 + }, + { + "epoch": 1.3640963367739323, + "grad_norm": 0.2810072600841522, + "learning_rate": 4.597119475782585e-05, + "loss": 0.226, + "step": 6740 + }, + { + "epoch": 1.3642987249544627, + "grad_norm": 0.2816447615623474, + "learning_rate": 4.594443240256433e-05, + "loss": 0.2007, + "step": 6741 + }, + { + "epoch": 1.364501113134993, + "grad_norm": 0.31261932849884033, + "learning_rate": 4.591767551607793e-05, + "loss": 0.168, + "step": 6742 + }, + { + "epoch": 1.364703501315523, + "grad_norm": 0.24643424153327942, + "learning_rate": 4.589092410107364e-05, + "loss": 0.1709, + "step": 6743 + }, + { + "epoch": 1.3649058894960535, + "grad_norm": 0.25190091133117676, + "learning_rate": 4.5864178160257895e-05, + "loss": 0.1536, + "step": 6744 + }, + { + "epoch": 1.3651082776765837, + "grad_norm": 0.2703429162502289, + "learning_rate": 4.583743769633656e-05, + "loss": 0.1802, + "step": 6745 + }, + { + "epoch": 1.365310665857114, + "grad_norm": 0.30754923820495605, + "learning_rate": 4.5810702712014964e-05, + "loss": 0.1942, + "step": 6746 + }, + { + "epoch": 1.3655130540376441, + "grad_norm": 0.2669358253479004, + "learning_rate": 4.578397320999785e-05, + "loss": 0.1915, + "step": 6747 + }, + { + "epoch": 1.3657154422181745, + "grad_norm": 0.27001726627349854, + "learning_rate": 4.575724919298946e-05, + "loss": 0.1949, + "step": 6748 + }, + { + "epoch": 1.3659178303987047, + "grad_norm": 0.27014675736427307, + "learning_rate": 4.5730530663693425e-05, + "loss": 0.2015, + "step": 6749 + }, + { + "epoch": 1.366120218579235, + "grad_norm": 0.2672886550426483, + "learning_rate": 4.570381762481286e-05, + "loss": 0.1998, + "step": 6750 + }, + { + "epoch": 1.366120218579235, + "eval_loss": 0.2655577063560486, + "eval_runtime": 0.7398, + "eval_samples_per_second": 6.759, + "eval_steps_per_second": 1.352, + "step": 6750 + }, + { + "epoch": 1.3663226067597654, + "grad_norm": 0.2848818004131317, + "learning_rate": 4.56771100790503e-05, + "loss": 0.2016, + "step": 6751 + }, + { + "epoch": 1.3665249949402956, + "grad_norm": 0.2925681471824646, + "learning_rate": 4.5650408029107746e-05, + "loss": 0.1869, + "step": 6752 + }, + { + "epoch": 1.3667273831208258, + "grad_norm": 0.27403298020362854, + "learning_rate": 4.5623711477686614e-05, + "loss": 0.19, + "step": 6753 + }, + { + "epoch": 1.366929771301356, + "grad_norm": 0.2719515860080719, + "learning_rate": 4.55970204274878e-05, + "loss": 0.2171, + "step": 6754 + }, + { + "epoch": 1.3671321594818862, + "grad_norm": 0.3114962875843048, + "learning_rate": 4.557033488121161e-05, + "loss": 0.1991, + "step": 6755 + }, + { + "epoch": 1.3673345476624166, + "grad_norm": 0.24869102239608765, + "learning_rate": 4.554365484155782e-05, + "loss": 0.1907, + "step": 6756 + }, + { + "epoch": 1.3675369358429468, + "grad_norm": 0.2927948534488678, + "learning_rate": 4.551698031122561e-05, + "loss": 0.2102, + "step": 6757 + }, + { + "epoch": 1.367739324023477, + "grad_norm": 0.2724990248680115, + "learning_rate": 4.549031129291367e-05, + "loss": 0.2288, + "step": 6758 + }, + { + "epoch": 1.3679417122040074, + "grad_norm": 0.26832640171051025, + "learning_rate": 4.54636477893201e-05, + "loss": 0.1698, + "step": 6759 + }, + { + "epoch": 1.3681441003845376, + "grad_norm": 0.2984505891799927, + "learning_rate": 4.543698980314236e-05, + "loss": 0.1881, + "step": 6760 + }, + { + "epoch": 1.3683464885650678, + "grad_norm": 0.2443763017654419, + "learning_rate": 4.541033733707747e-05, + "loss": 0.181, + "step": 6761 + }, + { + "epoch": 1.368548876745598, + "grad_norm": 0.32323718070983887, + "learning_rate": 4.538369039382184e-05, + "loss": 0.2107, + "step": 6762 + }, + { + "epoch": 1.3687512649261282, + "grad_norm": 0.24224111437797546, + "learning_rate": 4.535704897607135e-05, + "loss": 0.172, + "step": 6763 + }, + { + "epoch": 1.3689536531066586, + "grad_norm": 0.2844039499759674, + "learning_rate": 4.5330413086521276e-05, + "loss": 0.1602, + "step": 6764 + }, + { + "epoch": 1.3691560412871888, + "grad_norm": 0.31434163451194763, + "learning_rate": 4.530378272786635e-05, + "loss": 0.1955, + "step": 6765 + }, + { + "epoch": 1.369358429467719, + "grad_norm": 0.28526541590690613, + "learning_rate": 4.527715790280078e-05, + "loss": 0.1656, + "step": 6766 + }, + { + "epoch": 1.3695608176482494, + "grad_norm": 0.27655383944511414, + "learning_rate": 4.525053861401818e-05, + "loss": 0.1804, + "step": 6767 + }, + { + "epoch": 1.3697632058287796, + "grad_norm": 0.48966366052627563, + "learning_rate": 4.52239248642116e-05, + "loss": 0.2221, + "step": 6768 + }, + { + "epoch": 1.3699655940093098, + "grad_norm": 0.3002183139324188, + "learning_rate": 4.519731665607353e-05, + "loss": 0.2067, + "step": 6769 + }, + { + "epoch": 1.37016798218984, + "grad_norm": 0.2847645878791809, + "learning_rate": 4.517071399229593e-05, + "loss": 0.2084, + "step": 6770 + }, + { + "epoch": 1.3703703703703702, + "grad_norm": 0.2950843274593353, + "learning_rate": 4.5144116875570176e-05, + "loss": 0.1991, + "step": 6771 + }, + { + "epoch": 1.3705727585509007, + "grad_norm": 0.29930636286735535, + "learning_rate": 4.511752530858707e-05, + "loss": 0.1767, + "step": 6772 + }, + { + "epoch": 1.3707751467314309, + "grad_norm": 0.28075480461120605, + "learning_rate": 4.5090939294036895e-05, + "loss": 0.1898, + "step": 6773 + }, + { + "epoch": 1.370977534911961, + "grad_norm": 0.2594175636768341, + "learning_rate": 4.506435883460928e-05, + "loss": 0.2015, + "step": 6774 + }, + { + "epoch": 1.3711799230924915, + "grad_norm": 0.3175654113292694, + "learning_rate": 4.503778393299344e-05, + "loss": 0.2128, + "step": 6775 + }, + { + "epoch": 1.3713823112730217, + "grad_norm": 0.3042403757572174, + "learning_rate": 4.501121459187792e-05, + "loss": 0.2206, + "step": 6776 + }, + { + "epoch": 1.3715846994535519, + "grad_norm": 0.31100788712501526, + "learning_rate": 4.49846508139507e-05, + "loss": 0.1891, + "step": 6777 + }, + { + "epoch": 1.371787087634082, + "grad_norm": 0.30918845534324646, + "learning_rate": 4.495809260189925e-05, + "loss": 0.173, + "step": 6778 + }, + { + "epoch": 1.3719894758146125, + "grad_norm": 0.29384666681289673, + "learning_rate": 4.493153995841045e-05, + "loss": 0.1626, + "step": 6779 + }, + { + "epoch": 1.3721918639951427, + "grad_norm": 0.27222007513046265, + "learning_rate": 4.4904992886170595e-05, + "loss": 0.1754, + "step": 6780 + }, + { + "epoch": 1.372394252175673, + "grad_norm": 0.30638864636421204, + "learning_rate": 4.487845138786551e-05, + "loss": 0.2065, + "step": 6781 + }, + { + "epoch": 1.3725966403562033, + "grad_norm": 0.30102983117103577, + "learning_rate": 4.4851915466180274e-05, + "loss": 0.1956, + "step": 6782 + }, + { + "epoch": 1.3727990285367335, + "grad_norm": 0.30725544691085815, + "learning_rate": 4.4825385123799576e-05, + "loss": 0.2123, + "step": 6783 + }, + { + "epoch": 1.3730014167172637, + "grad_norm": 0.25662949681282043, + "learning_rate": 4.479886036340748e-05, + "loss": 0.1818, + "step": 6784 + }, + { + "epoch": 1.373203804897794, + "grad_norm": 0.2699354887008667, + "learning_rate": 4.477234118768746e-05, + "loss": 0.17, + "step": 6785 + }, + { + "epoch": 1.3734061930783241, + "grad_norm": 0.26643893122673035, + "learning_rate": 4.4745827599322466e-05, + "loss": 0.1903, + "step": 6786 + }, + { + "epoch": 1.3736085812588545, + "grad_norm": 0.3266531527042389, + "learning_rate": 4.4719319600994856e-05, + "loss": 0.2274, + "step": 6787 + }, + { + "epoch": 1.3738109694393847, + "grad_norm": 0.2596859633922577, + "learning_rate": 4.469281719538645e-05, + "loss": 0.1752, + "step": 6788 + }, + { + "epoch": 1.374013357619915, + "grad_norm": 0.25986337661743164, + "learning_rate": 4.466632038517845e-05, + "loss": 0.16, + "step": 6789 + }, + { + "epoch": 1.3742157458004454, + "grad_norm": 0.2841913402080536, + "learning_rate": 4.4639829173051554e-05, + "loss": 0.2121, + "step": 6790 + }, + { + "epoch": 1.3744181339809756, + "grad_norm": 0.3008646070957184, + "learning_rate": 4.461334356168585e-05, + "loss": 0.1924, + "step": 6791 + }, + { + "epoch": 1.3746205221615058, + "grad_norm": 0.29514750838279724, + "learning_rate": 4.4586863553760885e-05, + "loss": 0.199, + "step": 6792 + }, + { + "epoch": 1.374822910342036, + "grad_norm": 0.2603028416633606, + "learning_rate": 4.4560389151955615e-05, + "loss": 0.1645, + "step": 6793 + }, + { + "epoch": 1.3750252985225662, + "grad_norm": 0.3041701018810272, + "learning_rate": 4.453392035894846e-05, + "loss": 0.1889, + "step": 6794 + }, + { + "epoch": 1.3752276867030966, + "grad_norm": 0.26599910855293274, + "learning_rate": 4.4507457177417234e-05, + "loss": 0.1522, + "step": 6795 + }, + { + "epoch": 1.3754300748836268, + "grad_norm": 0.3250159025192261, + "learning_rate": 4.448099961003922e-05, + "loss": 0.1977, + "step": 6796 + }, + { + "epoch": 1.375632463064157, + "grad_norm": 0.2732198238372803, + "learning_rate": 4.4454547659491106e-05, + "loss": 0.1951, + "step": 6797 + }, + { + "epoch": 1.3758348512446874, + "grad_norm": 0.30923816561698914, + "learning_rate": 4.442810132844903e-05, + "loss": 0.231, + "step": 6798 + }, + { + "epoch": 1.3760372394252176, + "grad_norm": 0.28462839126586914, + "learning_rate": 4.440166061958856e-05, + "loss": 0.1964, + "step": 6799 + }, + { + "epoch": 1.3762396276057478, + "grad_norm": 0.210264652967453, + "learning_rate": 4.437522553558466e-05, + "loss": 0.1613, + "step": 6800 + }, + { + "epoch": 1.3762396276057478, + "eval_loss": 0.2648058235645294, + "eval_runtime": 0.7403, + "eval_samples_per_second": 6.754, + "eval_steps_per_second": 1.351, + "step": 6800 + }, + { + "epoch": 1.376442015786278, + "grad_norm": 0.2972264885902405, + "learning_rate": 4.43487960791118e-05, + "loss": 0.2084, + "step": 6801 + }, + { + "epoch": 1.3766444039668082, + "grad_norm": 0.23235231637954712, + "learning_rate": 4.4322372252843805e-05, + "loss": 0.1628, + "step": 6802 + }, + { + "epoch": 1.3768467921473386, + "grad_norm": 0.28614068031311035, + "learning_rate": 4.429595405945399e-05, + "loss": 0.215, + "step": 6803 + }, + { + "epoch": 1.3770491803278688, + "grad_norm": 0.35497477650642395, + "learning_rate": 4.426954150161503e-05, + "loss": 0.1987, + "step": 6804 + }, + { + "epoch": 1.377251568508399, + "grad_norm": 0.28614556789398193, + "learning_rate": 4.424313458199908e-05, + "loss": 0.1982, + "step": 6805 + }, + { + "epoch": 1.3774539566889294, + "grad_norm": 0.25958994030952454, + "learning_rate": 4.421673330327771e-05, + "loss": 0.1862, + "step": 6806 + }, + { + "epoch": 1.3776563448694596, + "grad_norm": 0.2792111039161682, + "learning_rate": 4.419033766812196e-05, + "loss": 0.2023, + "step": 6807 + }, + { + "epoch": 1.3778587330499898, + "grad_norm": 0.29506009817123413, + "learning_rate": 4.416394767920222e-05, + "loss": 0.1735, + "step": 6808 + }, + { + "epoch": 1.37806112123052, + "grad_norm": 0.28253570199012756, + "learning_rate": 4.4137563339188395e-05, + "loss": 0.204, + "step": 6809 + }, + { + "epoch": 1.3782635094110505, + "grad_norm": 0.2398274689912796, + "learning_rate": 4.411118465074974e-05, + "loss": 0.1415, + "step": 6810 + }, + { + "epoch": 1.3784658975915807, + "grad_norm": 0.3018699884414673, + "learning_rate": 4.408481161655499e-05, + "loss": 0.2001, + "step": 6811 + }, + { + "epoch": 1.3786682857721109, + "grad_norm": 0.26964306831359863, + "learning_rate": 4.405844423927228e-05, + "loss": 0.1626, + "step": 6812 + }, + { + "epoch": 1.3788706739526413, + "grad_norm": 0.25589293241500854, + "learning_rate": 4.403208252156921e-05, + "loss": 0.1736, + "step": 6813 + }, + { + "epoch": 1.3790730621331715, + "grad_norm": 0.28545406460762024, + "learning_rate": 4.400572646611275e-05, + "loss": 0.1692, + "step": 6814 + }, + { + "epoch": 1.3792754503137017, + "grad_norm": 0.29041236639022827, + "learning_rate": 4.3979376075569354e-05, + "loss": 0.2007, + "step": 6815 + }, + { + "epoch": 1.379477838494232, + "grad_norm": 0.26712194085121155, + "learning_rate": 4.395303135260487e-05, + "loss": 0.1918, + "step": 6816 + }, + { + "epoch": 1.379680226674762, + "grad_norm": 0.3227420449256897, + "learning_rate": 4.3926692299884573e-05, + "loss": 0.1999, + "step": 6817 + }, + { + "epoch": 1.3798826148552925, + "grad_norm": 0.310242235660553, + "learning_rate": 4.3900358920073184e-05, + "loss": 0.1989, + "step": 6818 + }, + { + "epoch": 1.3800850030358227, + "grad_norm": 0.3100515305995941, + "learning_rate": 4.387403121583482e-05, + "loss": 0.2101, + "step": 6819 + }, + { + "epoch": 1.380287391216353, + "grad_norm": 0.2882237434387207, + "learning_rate": 4.3847709189833075e-05, + "loss": 0.2105, + "step": 6820 + }, + { + "epoch": 1.3804897793968833, + "grad_norm": 0.4425124526023865, + "learning_rate": 4.38213928447309e-05, + "loss": 0.1808, + "step": 6821 + }, + { + "epoch": 1.3806921675774135, + "grad_norm": 0.2675884962081909, + "learning_rate": 4.379508218319073e-05, + "loss": 0.2059, + "step": 6822 + }, + { + "epoch": 1.3808945557579437, + "grad_norm": 0.30350470542907715, + "learning_rate": 4.376877720787439e-05, + "loss": 0.2115, + "step": 6823 + }, + { + "epoch": 1.381096943938474, + "grad_norm": 0.27757778763771057, + "learning_rate": 4.374247792144314e-05, + "loss": 0.1884, + "step": 6824 + }, + { + "epoch": 1.3812993321190041, + "grad_norm": 0.3709230422973633, + "learning_rate": 4.371618432655767e-05, + "loss": 0.1827, + "step": 6825 + }, + { + "epoch": 1.3815017202995346, + "grad_norm": 0.5349687933921814, + "learning_rate": 4.3689896425878095e-05, + "loss": 0.2039, + "step": 6826 + }, + { + "epoch": 1.3817041084800648, + "grad_norm": 0.2831704914569855, + "learning_rate": 4.3663614222063956e-05, + "loss": 0.1719, + "step": 6827 + }, + { + "epoch": 1.381906496660595, + "grad_norm": 0.2922341227531433, + "learning_rate": 4.3637337717774186e-05, + "loss": 0.2089, + "step": 6828 + }, + { + "epoch": 1.3821088848411254, + "grad_norm": 0.3860246241092682, + "learning_rate": 4.3611066915667173e-05, + "loss": 0.2074, + "step": 6829 + }, + { + "epoch": 1.3823112730216556, + "grad_norm": 0.3138883113861084, + "learning_rate": 4.3584801818400746e-05, + "loss": 0.2413, + "step": 6830 + }, + { + "epoch": 1.3825136612021858, + "grad_norm": 0.27164164185523987, + "learning_rate": 4.35585424286321e-05, + "loss": 0.1726, + "step": 6831 + }, + { + "epoch": 1.382716049382716, + "grad_norm": 0.32853418588638306, + "learning_rate": 4.353228874901789e-05, + "loss": 0.1984, + "step": 6832 + }, + { + "epoch": 1.3829184375632462, + "grad_norm": 0.24393995106220245, + "learning_rate": 4.350604078221421e-05, + "loss": 0.1721, + "step": 6833 + }, + { + "epoch": 1.3831208257437766, + "grad_norm": 0.26866650581359863, + "learning_rate": 4.3479798530876535e-05, + "loss": 0.135, + "step": 6834 + }, + { + "epoch": 1.3833232139243068, + "grad_norm": 0.25781649351119995, + "learning_rate": 4.3453561997659786e-05, + "loss": 0.2081, + "step": 6835 + }, + { + "epoch": 1.3835256021048372, + "grad_norm": 0.2890598177909851, + "learning_rate": 4.342733118521829e-05, + "loss": 0.1822, + "step": 6836 + }, + { + "epoch": 1.3837279902853674, + "grad_norm": 0.25596973299980164, + "learning_rate": 4.340110609620582e-05, + "loss": 0.1756, + "step": 6837 + }, + { + "epoch": 1.3839303784658976, + "grad_norm": 0.3108643889427185, + "learning_rate": 4.3374886733275554e-05, + "loss": 0.2116, + "step": 6838 + }, + { + "epoch": 1.3841327666464278, + "grad_norm": 0.2812763452529907, + "learning_rate": 4.3348673099080087e-05, + "loss": 0.1977, + "step": 6839 + }, + { + "epoch": 1.384335154826958, + "grad_norm": 0.27367645502090454, + "learning_rate": 4.3322465196271434e-05, + "loss": 0.2071, + "step": 6840 + }, + { + "epoch": 1.3845375430074884, + "grad_norm": 0.3385069668292999, + "learning_rate": 4.329626302750105e-05, + "loss": 0.1943, + "step": 6841 + }, + { + "epoch": 1.3847399311880186, + "grad_norm": 0.25230950117111206, + "learning_rate": 4.327006659541979e-05, + "loss": 0.1838, + "step": 6842 + }, + { + "epoch": 1.3849423193685488, + "grad_norm": 0.28919148445129395, + "learning_rate": 4.324387590267792e-05, + "loss": 0.1669, + "step": 6843 + }, + { + "epoch": 1.3851447075490793, + "grad_norm": 0.2459256798028946, + "learning_rate": 4.321769095192516e-05, + "loss": 0.1675, + "step": 6844 + }, + { + "epoch": 1.3853470957296095, + "grad_norm": 0.2925647795200348, + "learning_rate": 4.319151174581061e-05, + "loss": 0.195, + "step": 6845 + }, + { + "epoch": 1.3855494839101397, + "grad_norm": 0.27055978775024414, + "learning_rate": 4.316533828698283e-05, + "loss": 0.2061, + "step": 6846 + }, + { + "epoch": 1.3857518720906699, + "grad_norm": 0.27059707045555115, + "learning_rate": 4.313917057808975e-05, + "loss": 0.1772, + "step": 6847 + }, + { + "epoch": 1.3859542602712, + "grad_norm": 0.3185412883758545, + "learning_rate": 4.311300862177879e-05, + "loss": 0.1912, + "step": 6848 + }, + { + "epoch": 1.3861566484517305, + "grad_norm": 0.3338720202445984, + "learning_rate": 4.3086852420696685e-05, + "loss": 0.2449, + "step": 6849 + }, + { + "epoch": 1.3863590366322607, + "grad_norm": 0.3972804844379425, + "learning_rate": 4.306070197748967e-05, + "loss": 0.2011, + "step": 6850 + }, + { + "epoch": 1.3863590366322607, + "eval_loss": 0.2659958302974701, + "eval_runtime": 0.7397, + "eval_samples_per_second": 6.759, + "eval_steps_per_second": 1.352, + "step": 6850 + }, + { + "epoch": 1.3865614248127909, + "grad_norm": 0.2555653750896454, + "learning_rate": 4.3034557294803365e-05, + "loss": 0.1863, + "step": 6851 + }, + { + "epoch": 1.3867638129933213, + "grad_norm": 0.27162060141563416, + "learning_rate": 4.300841837528282e-05, + "loss": 0.2004, + "step": 6852 + }, + { + "epoch": 1.3869662011738515, + "grad_norm": 0.3083297908306122, + "learning_rate": 4.2982285221572505e-05, + "loss": 0.2093, + "step": 6853 + }, + { + "epoch": 1.3871685893543817, + "grad_norm": 0.3471545875072479, + "learning_rate": 4.295615783631629e-05, + "loss": 0.1907, + "step": 6854 + }, + { + "epoch": 1.387370977534912, + "grad_norm": 0.2562052309513092, + "learning_rate": 4.2930036222157466e-05, + "loss": 0.1747, + "step": 6855 + }, + { + "epoch": 1.387573365715442, + "grad_norm": 0.27602705359458923, + "learning_rate": 4.290392038173875e-05, + "loss": 0.2041, + "step": 6856 + }, + { + "epoch": 1.3877757538959725, + "grad_norm": 0.32171010971069336, + "learning_rate": 4.287781031770227e-05, + "loss": 0.2447, + "step": 6857 + }, + { + "epoch": 1.3879781420765027, + "grad_norm": 0.27243730425834656, + "learning_rate": 4.285170603268957e-05, + "loss": 0.177, + "step": 6858 + }, + { + "epoch": 1.388180530257033, + "grad_norm": 0.3044010400772095, + "learning_rate": 4.28256075293416e-05, + "loss": 0.2262, + "step": 6859 + }, + { + "epoch": 1.3883829184375633, + "grad_norm": 0.25574493408203125, + "learning_rate": 4.279951481029872e-05, + "loss": 0.1701, + "step": 6860 + }, + { + "epoch": 1.3885853066180935, + "grad_norm": 0.3542279899120331, + "learning_rate": 4.277342787820076e-05, + "loss": 0.2242, + "step": 6861 + }, + { + "epoch": 1.3887876947986237, + "grad_norm": 0.3481219708919525, + "learning_rate": 4.274734673568688e-05, + "loss": 0.1787, + "step": 6862 + }, + { + "epoch": 1.388990082979154, + "grad_norm": 0.3042903542518616, + "learning_rate": 4.272127138539571e-05, + "loss": 0.1845, + "step": 6863 + }, + { + "epoch": 1.3891924711596841, + "grad_norm": 0.3019198775291443, + "learning_rate": 4.269520182996528e-05, + "loss": 0.206, + "step": 6864 + }, + { + "epoch": 1.3893948593402146, + "grad_norm": 0.2840752601623535, + "learning_rate": 4.2669138072033056e-05, + "loss": 0.1861, + "step": 6865 + }, + { + "epoch": 1.3895972475207448, + "grad_norm": 0.25696811079978943, + "learning_rate": 4.2643080114235854e-05, + "loss": 0.143, + "step": 6866 + }, + { + "epoch": 1.3897996357012752, + "grad_norm": 0.26223090291023254, + "learning_rate": 4.2617027959209975e-05, + "loss": 0.1767, + "step": 6867 + }, + { + "epoch": 1.3900020238818054, + "grad_norm": 0.277608722448349, + "learning_rate": 4.259098160959109e-05, + "loss": 0.1934, + "step": 6868 + }, + { + "epoch": 1.3902044120623356, + "grad_norm": 0.3046850562095642, + "learning_rate": 4.256494106801432e-05, + "loss": 0.2011, + "step": 6869 + }, + { + "epoch": 1.3904068002428658, + "grad_norm": 0.2791365087032318, + "learning_rate": 4.2538906337114136e-05, + "loss": 0.2009, + "step": 6870 + }, + { + "epoch": 1.390609188423396, + "grad_norm": 0.245823934674263, + "learning_rate": 4.2512877419524476e-05, + "loss": 0.1618, + "step": 6871 + }, + { + "epoch": 1.3908115766039264, + "grad_norm": 0.33263981342315674, + "learning_rate": 4.2486854317878674e-05, + "loss": 0.1809, + "step": 6872 + }, + { + "epoch": 1.3910139647844566, + "grad_norm": 0.3047584295272827, + "learning_rate": 4.246083703480949e-05, + "loss": 0.1679, + "step": 6873 + }, + { + "epoch": 1.3912163529649868, + "grad_norm": 0.2599356174468994, + "learning_rate": 4.243482557294904e-05, + "loss": 0.1797, + "step": 6874 + }, + { + "epoch": 1.3914187411455172, + "grad_norm": 0.28328827023506165, + "learning_rate": 4.2408819934928924e-05, + "loss": 0.217, + "step": 6875 + }, + { + "epoch": 1.3916211293260474, + "grad_norm": 0.275232195854187, + "learning_rate": 4.2382820123380105e-05, + "loss": 0.2133, + "step": 6876 + }, + { + "epoch": 1.3918235175065776, + "grad_norm": 0.3404455780982971, + "learning_rate": 4.235682614093298e-05, + "loss": 0.1836, + "step": 6877 + }, + { + "epoch": 1.3920259056871078, + "grad_norm": 0.24205805361270905, + "learning_rate": 4.233083799021734e-05, + "loss": 0.1525, + "step": 6878 + }, + { + "epoch": 1.392228293867638, + "grad_norm": 0.30682337284088135, + "learning_rate": 4.230485567386241e-05, + "loss": 0.2042, + "step": 6879 + }, + { + "epoch": 1.3924306820481684, + "grad_norm": 0.2682032585144043, + "learning_rate": 4.227887919449678e-05, + "loss": 0.1837, + "step": 6880 + }, + { + "epoch": 1.3926330702286986, + "grad_norm": 0.27204856276512146, + "learning_rate": 4.225290855474849e-05, + "loss": 0.1906, + "step": 6881 + }, + { + "epoch": 1.3928354584092288, + "grad_norm": 0.31881260871887207, + "learning_rate": 4.2226943757245e-05, + "loss": 0.2215, + "step": 6882 + }, + { + "epoch": 1.3930378465897593, + "grad_norm": 0.3069551885128021, + "learning_rate": 4.220098480461311e-05, + "loss": 0.2167, + "step": 6883 + }, + { + "epoch": 1.3932402347702895, + "grad_norm": 0.3175070285797119, + "learning_rate": 4.217503169947912e-05, + "loss": 0.1998, + "step": 6884 + }, + { + "epoch": 1.3934426229508197, + "grad_norm": 0.3131294250488281, + "learning_rate": 4.2149084444468656e-05, + "loss": 0.1858, + "step": 6885 + }, + { + "epoch": 1.3936450111313499, + "grad_norm": 0.3579147458076477, + "learning_rate": 4.212314304220681e-05, + "loss": 0.2275, + "step": 6886 + }, + { + "epoch": 1.39384739931188, + "grad_norm": 0.3155602812767029, + "learning_rate": 4.209720749531806e-05, + "loss": 0.1723, + "step": 6887 + }, + { + "epoch": 1.3940497874924105, + "grad_norm": 0.30170321464538574, + "learning_rate": 4.207127780642628e-05, + "loss": 0.1445, + "step": 6888 + }, + { + "epoch": 1.3942521756729407, + "grad_norm": 0.28568869829177856, + "learning_rate": 4.204535397815478e-05, + "loss": 0.1945, + "step": 6889 + }, + { + "epoch": 1.3944545638534709, + "grad_norm": 0.2988249957561493, + "learning_rate": 4.2019436013126244e-05, + "loss": 0.2014, + "step": 6890 + }, + { + "epoch": 1.3946569520340013, + "grad_norm": 0.3752059042453766, + "learning_rate": 4.199352391396281e-05, + "loss": 0.2106, + "step": 6891 + }, + { + "epoch": 1.3948593402145315, + "grad_norm": 0.28706473112106323, + "learning_rate": 4.196761768328599e-05, + "loss": 0.1889, + "step": 6892 + }, + { + "epoch": 1.3950617283950617, + "grad_norm": 0.2898298501968384, + "learning_rate": 4.1941717323716645e-05, + "loss": 0.2069, + "step": 6893 + }, + { + "epoch": 1.395264116575592, + "grad_norm": 0.3084106743335724, + "learning_rate": 4.191582283787515e-05, + "loss": 0.2127, + "step": 6894 + }, + { + "epoch": 1.395466504756122, + "grad_norm": 0.29156970977783203, + "learning_rate": 4.188993422838123e-05, + "loss": 0.2036, + "step": 6895 + }, + { + "epoch": 1.3956688929366525, + "grad_norm": 0.2639651596546173, + "learning_rate": 4.186405149785403e-05, + "loss": 0.2066, + "step": 6896 + }, + { + "epoch": 1.3958712811171827, + "grad_norm": 0.3042910397052765, + "learning_rate": 4.1838174648912074e-05, + "loss": 0.2027, + "step": 6897 + }, + { + "epoch": 1.3960736692977131, + "grad_norm": 0.28110945224761963, + "learning_rate": 4.1812303684173334e-05, + "loss": 0.1957, + "step": 6898 + }, + { + "epoch": 1.3962760574782433, + "grad_norm": 0.3179463744163513, + "learning_rate": 4.178643860625514e-05, + "loss": 0.2002, + "step": 6899 + }, + { + "epoch": 1.3964784456587735, + "grad_norm": 0.24411508440971375, + "learning_rate": 4.176057941777427e-05, + "loss": 0.1795, + "step": 6900 + }, + { + "epoch": 1.3964784456587735, + "eval_loss": 0.2689473032951355, + "eval_runtime": 0.7378, + "eval_samples_per_second": 6.777, + "eval_steps_per_second": 1.355, + "step": 6900 + }, + { + "epoch": 1.3966808338393037, + "grad_norm": 0.27842822670936584, + "learning_rate": 4.1734726121346865e-05, + "loss": 0.2144, + "step": 6901 + }, + { + "epoch": 1.396883222019834, + "grad_norm": 0.3558849096298218, + "learning_rate": 4.170887871958851e-05, + "loss": 0.1779, + "step": 6902 + }, + { + "epoch": 1.3970856102003644, + "grad_norm": 0.33816561102867126, + "learning_rate": 4.168303721511415e-05, + "loss": 0.1732, + "step": 6903 + }, + { + "epoch": 1.3972879983808946, + "grad_norm": 0.3181740641593933, + "learning_rate": 4.1657201610538185e-05, + "loss": 0.2123, + "step": 6904 + }, + { + "epoch": 1.3974903865614248, + "grad_norm": 0.2679811120033264, + "learning_rate": 4.163137190847437e-05, + "loss": 0.2069, + "step": 6905 + }, + { + "epoch": 1.3976927747419552, + "grad_norm": 0.2734729051589966, + "learning_rate": 4.1605548111535894e-05, + "loss": 0.1787, + "step": 6906 + }, + { + "epoch": 1.3978951629224854, + "grad_norm": 0.28582727909088135, + "learning_rate": 4.1579730222335333e-05, + "loss": 0.2116, + "step": 6907 + }, + { + "epoch": 1.3980975511030156, + "grad_norm": 0.3257206976413727, + "learning_rate": 4.155391824348467e-05, + "loss": 0.2168, + "step": 6908 + }, + { + "epoch": 1.3982999392835458, + "grad_norm": 0.4075670540332794, + "learning_rate": 4.152811217759529e-05, + "loss": 0.2054, + "step": 6909 + }, + { + "epoch": 1.398502327464076, + "grad_norm": 0.2531040906906128, + "learning_rate": 4.150231202727799e-05, + "loss": 0.1747, + "step": 6910 + }, + { + "epoch": 1.3987047156446064, + "grad_norm": 0.2485847920179367, + "learning_rate": 4.1476517795142945e-05, + "loss": 0.1639, + "step": 6911 + }, + { + "epoch": 1.3989071038251366, + "grad_norm": 0.33335885405540466, + "learning_rate": 4.1450729483799746e-05, + "loss": 0.1984, + "step": 6912 + }, + { + "epoch": 1.3991094920056668, + "grad_norm": 0.29418519139289856, + "learning_rate": 4.142494709585739e-05, + "loss": 0.2102, + "step": 6913 + }, + { + "epoch": 1.3993118801861972, + "grad_norm": 0.31125608086586, + "learning_rate": 4.139917063392427e-05, + "loss": 0.2247, + "step": 6914 + }, + { + "epoch": 1.3995142683667274, + "grad_norm": 0.29453811049461365, + "learning_rate": 4.1373400100608194e-05, + "loss": 0.201, + "step": 6915 + }, + { + "epoch": 1.3997166565472576, + "grad_norm": 0.2884596586227417, + "learning_rate": 4.1347635498516314e-05, + "loss": 0.2132, + "step": 6916 + }, + { + "epoch": 1.3999190447277878, + "grad_norm": 0.2621874511241913, + "learning_rate": 4.132187683025523e-05, + "loss": 0.1808, + "step": 6917 + }, + { + "epoch": 1.400121432908318, + "grad_norm": 0.26058146357536316, + "learning_rate": 4.129612409843095e-05, + "loss": 0.173, + "step": 6918 + }, + { + "epoch": 1.4003238210888485, + "grad_norm": 0.2764558494091034, + "learning_rate": 4.127037730564888e-05, + "loss": 0.1854, + "step": 6919 + }, + { + "epoch": 1.4005262092693787, + "grad_norm": 0.3047637641429901, + "learning_rate": 4.1244636454513766e-05, + "loss": 0.1963, + "step": 6920 + }, + { + "epoch": 1.4007285974499089, + "grad_norm": 0.2553144693374634, + "learning_rate": 4.121890154762983e-05, + "loss": 0.156, + "step": 6921 + }, + { + "epoch": 1.4009309856304393, + "grad_norm": 0.3238953948020935, + "learning_rate": 4.119317258760066e-05, + "loss": 0.1743, + "step": 6922 + }, + { + "epoch": 1.4011333738109695, + "grad_norm": 0.2943544089794159, + "learning_rate": 4.1167449577029224e-05, + "loss": 0.2146, + "step": 6923 + }, + { + "epoch": 1.4013357619914997, + "grad_norm": 0.33927851915359497, + "learning_rate": 4.114173251851793e-05, + "loss": 0.1974, + "step": 6924 + }, + { + "epoch": 1.4015381501720299, + "grad_norm": 0.2715267241001129, + "learning_rate": 4.1116021414668525e-05, + "loss": 0.2103, + "step": 6925 + }, + { + "epoch": 1.40174053835256, + "grad_norm": 0.29221105575561523, + "learning_rate": 4.109031626808223e-05, + "loss": 0.1773, + "step": 6926 + }, + { + "epoch": 1.4019429265330905, + "grad_norm": 0.33900484442710876, + "learning_rate": 4.106461708135956e-05, + "loss": 0.1881, + "step": 6927 + }, + { + "epoch": 1.4021453147136207, + "grad_norm": 0.27142077684402466, + "learning_rate": 4.1038923857100565e-05, + "loss": 0.18, + "step": 6928 + }, + { + "epoch": 1.4023477028941511, + "grad_norm": 0.2642443776130676, + "learning_rate": 4.101323659790459e-05, + "loss": 0.1691, + "step": 6929 + }, + { + "epoch": 1.4025500910746813, + "grad_norm": 0.2583783268928528, + "learning_rate": 4.09875553063704e-05, + "loss": 0.1571, + "step": 6930 + }, + { + "epoch": 1.4027524792552115, + "grad_norm": 0.3036493957042694, + "learning_rate": 4.096187998509614e-05, + "loss": 0.1887, + "step": 6931 + }, + { + "epoch": 1.4029548674357417, + "grad_norm": 0.2827909588813782, + "learning_rate": 4.0936210636679386e-05, + "loss": 0.1945, + "step": 6932 + }, + { + "epoch": 1.403157255616272, + "grad_norm": 0.34129467606544495, + "learning_rate": 4.091054726371709e-05, + "loss": 0.2067, + "step": 6933 + }, + { + "epoch": 1.4033596437968023, + "grad_norm": 0.2848436236381531, + "learning_rate": 4.0884889868805606e-05, + "loss": 0.2043, + "step": 6934 + }, + { + "epoch": 1.4035620319773325, + "grad_norm": 0.28599414229393005, + "learning_rate": 4.085923845454067e-05, + "loss": 0.198, + "step": 6935 + }, + { + "epoch": 1.4037644201578627, + "grad_norm": 0.3087374269962311, + "learning_rate": 4.0833593023517445e-05, + "loss": 0.1792, + "step": 6936 + }, + { + "epoch": 1.4039668083383932, + "grad_norm": 0.30320149660110474, + "learning_rate": 4.080795357833047e-05, + "loss": 0.1712, + "step": 6937 + }, + { + "epoch": 1.4041691965189234, + "grad_norm": 0.3309793770313263, + "learning_rate": 4.0782320121573635e-05, + "loss": 0.2074, + "step": 6938 + }, + { + "epoch": 1.4043715846994536, + "grad_norm": 0.30699872970581055, + "learning_rate": 4.075669265584028e-05, + "loss": 0.2281, + "step": 6939 + }, + { + "epoch": 1.4045739728799838, + "grad_norm": 0.36575666069984436, + "learning_rate": 4.0731071183723135e-05, + "loss": 0.1978, + "step": 6940 + }, + { + "epoch": 1.404776361060514, + "grad_norm": 0.2952874004840851, + "learning_rate": 4.07054557078143e-05, + "loss": 0.2132, + "step": 6941 + }, + { + "epoch": 1.4049787492410444, + "grad_norm": 0.29006901383399963, + "learning_rate": 4.067984623070529e-05, + "loss": 0.1625, + "step": 6942 + }, + { + "epoch": 1.4051811374215746, + "grad_norm": 0.286811888217926, + "learning_rate": 4.065424275498699e-05, + "loss": 0.1725, + "step": 6943 + }, + { + "epoch": 1.4053835256021048, + "grad_norm": 0.25828805565834045, + "learning_rate": 4.062864528324971e-05, + "loss": 0.1738, + "step": 6944 + }, + { + "epoch": 1.4055859137826352, + "grad_norm": 0.267084538936615, + "learning_rate": 4.0603053818083125e-05, + "loss": 0.1879, + "step": 6945 + }, + { + "epoch": 1.4057883019631654, + "grad_norm": 0.39066869020462036, + "learning_rate": 4.0577468362076297e-05, + "loss": 0.1714, + "step": 6946 + }, + { + "epoch": 1.4059906901436956, + "grad_norm": 0.2583625614643097, + "learning_rate": 4.0551888917817716e-05, + "loss": 0.1542, + "step": 6947 + }, + { + "epoch": 1.4061930783242258, + "grad_norm": 0.28583186864852905, + "learning_rate": 4.052631548789524e-05, + "loss": 0.1832, + "step": 6948 + }, + { + "epoch": 1.406395466504756, + "grad_norm": 0.3042111098766327, + "learning_rate": 4.05007480748961e-05, + "loss": 0.2508, + "step": 6949 + }, + { + "epoch": 1.4065978546852864, + "grad_norm": 0.2741240859031677, + "learning_rate": 4.0475186681406954e-05, + "loss": 0.1845, + "step": 6950 + }, + { + "epoch": 1.4065978546852864, + "eval_loss": 0.26788368821144104, + "eval_runtime": 0.7398, + "eval_samples_per_second": 6.758, + "eval_steps_per_second": 1.352, + "step": 6950 + }, + { + "epoch": 1.4068002428658166, + "grad_norm": 0.2780517339706421, + "learning_rate": 4.044963131001383e-05, + "loss": 0.194, + "step": 6951 + }, + { + "epoch": 1.4070026310463468, + "grad_norm": 0.3128349781036377, + "learning_rate": 4.0424081963302164e-05, + "loss": 0.2026, + "step": 6952 + }, + { + "epoch": 1.4072050192268772, + "grad_norm": 0.2936166524887085, + "learning_rate": 4.0398538643856754e-05, + "loss": 0.2069, + "step": 6953 + }, + { + "epoch": 1.4074074074074074, + "grad_norm": 0.2642104923725128, + "learning_rate": 4.037300135426182e-05, + "loss": 0.1818, + "step": 6954 + }, + { + "epoch": 1.4076097955879376, + "grad_norm": 0.3181673586368561, + "learning_rate": 4.0347470097100934e-05, + "loss": 0.2027, + "step": 6955 + }, + { + "epoch": 1.4078121837684678, + "grad_norm": 0.26954516768455505, + "learning_rate": 4.032194487495712e-05, + "loss": 0.1967, + "step": 6956 + }, + { + "epoch": 1.4080145719489983, + "grad_norm": 0.2782799005508423, + "learning_rate": 4.029642569041271e-05, + "loss": 0.2049, + "step": 6957 + }, + { + "epoch": 1.4082169601295285, + "grad_norm": 0.2677285969257355, + "learning_rate": 4.02709125460495e-05, + "loss": 0.1798, + "step": 6958 + }, + { + "epoch": 1.4084193483100587, + "grad_norm": 0.29425325989723206, + "learning_rate": 4.024540544444865e-05, + "loss": 0.1793, + "step": 6959 + }, + { + "epoch": 1.408621736490589, + "grad_norm": 0.2720067799091339, + "learning_rate": 4.0219904388190655e-05, + "loss": 0.1665, + "step": 6960 + }, + { + "epoch": 1.4088241246711193, + "grad_norm": 0.28150567412376404, + "learning_rate": 4.0194409379855456e-05, + "loss": 0.1955, + "step": 6961 + }, + { + "epoch": 1.4090265128516495, + "grad_norm": 0.29516106843948364, + "learning_rate": 4.016892042202239e-05, + "loss": 0.1957, + "step": 6962 + }, + { + "epoch": 1.4092289010321797, + "grad_norm": 0.299935519695282, + "learning_rate": 4.014343751727017e-05, + "loss": 0.2035, + "step": 6963 + }, + { + "epoch": 1.4094312892127099, + "grad_norm": 0.3131525218486786, + "learning_rate": 4.011796066817686e-05, + "loss": 0.1969, + "step": 6964 + }, + { + "epoch": 1.4096336773932403, + "grad_norm": 0.25779253244400024, + "learning_rate": 4.009248987731995e-05, + "loss": 0.1746, + "step": 6965 + }, + { + "epoch": 1.4098360655737705, + "grad_norm": 0.24830158054828644, + "learning_rate": 4.006702514727632e-05, + "loss": 0.2183, + "step": 6966 + }, + { + "epoch": 1.4100384537543007, + "grad_norm": 0.29047366976737976, + "learning_rate": 4.0041566480622215e-05, + "loss": 0.1979, + "step": 6967 + }, + { + "epoch": 1.4102408419348311, + "grad_norm": 0.27898839116096497, + "learning_rate": 4.001611387993327e-05, + "loss": 0.2051, + "step": 6968 + }, + { + "epoch": 1.4104432301153613, + "grad_norm": 0.30387625098228455, + "learning_rate": 3.9990667347784525e-05, + "loss": 0.2038, + "step": 6969 + }, + { + "epoch": 1.4106456182958915, + "grad_norm": 0.29681432247161865, + "learning_rate": 3.996522688675038e-05, + "loss": 0.2124, + "step": 6970 + }, + { + "epoch": 1.4108480064764217, + "grad_norm": 0.3277590274810791, + "learning_rate": 3.993979249940465e-05, + "loss": 0.2206, + "step": 6971 + }, + { + "epoch": 1.411050394656952, + "grad_norm": 0.3136327266693115, + "learning_rate": 3.991436418832051e-05, + "loss": 0.2164, + "step": 6972 + }, + { + "epoch": 1.4112527828374823, + "grad_norm": 0.25054970383644104, + "learning_rate": 3.9888941956070525e-05, + "loss": 0.1874, + "step": 6973 + }, + { + "epoch": 1.4114551710180125, + "grad_norm": 0.3261873424053192, + "learning_rate": 3.9863525805226664e-05, + "loss": 0.1866, + "step": 6974 + }, + { + "epoch": 1.4116575591985427, + "grad_norm": 0.26633456349372864, + "learning_rate": 3.983811573836025e-05, + "loss": 0.1779, + "step": 6975 + }, + { + "epoch": 1.4118599473790732, + "grad_norm": 0.3039325773715973, + "learning_rate": 3.981271175804201e-05, + "loss": 0.2131, + "step": 6976 + }, + { + "epoch": 1.4120623355596034, + "grad_norm": 0.2937398850917816, + "learning_rate": 3.978731386684206e-05, + "loss": 0.1881, + "step": 6977 + }, + { + "epoch": 1.4122647237401336, + "grad_norm": 0.2729499936103821, + "learning_rate": 3.976192206732989e-05, + "loss": 0.1745, + "step": 6978 + }, + { + "epoch": 1.4124671119206638, + "grad_norm": 0.2721683979034424, + "learning_rate": 3.973653636207437e-05, + "loss": 0.2057, + "step": 6979 + }, + { + "epoch": 1.412669500101194, + "grad_norm": 0.267595112323761, + "learning_rate": 3.971115675364378e-05, + "loss": 0.1943, + "step": 6980 + }, + { + "epoch": 1.4128718882817244, + "grad_norm": 0.30413955450057983, + "learning_rate": 3.9685783244605726e-05, + "loss": 0.2195, + "step": 6981 + }, + { + "epoch": 1.4130742764622546, + "grad_norm": 0.23452375829219818, + "learning_rate": 3.966041583752726e-05, + "loss": 0.1611, + "step": 6982 + }, + { + "epoch": 1.4132766646427848, + "grad_norm": 0.2777538597583771, + "learning_rate": 3.963505453497478e-05, + "loss": 0.1663, + "step": 6983 + }, + { + "epoch": 1.4134790528233152, + "grad_norm": 0.2605833411216736, + "learning_rate": 3.960969933951409e-05, + "loss": 0.1797, + "step": 6984 + }, + { + "epoch": 1.4136814410038454, + "grad_norm": 0.2730015814304352, + "learning_rate": 3.9584350253710345e-05, + "loss": 0.205, + "step": 6985 + }, + { + "epoch": 1.4138838291843756, + "grad_norm": 0.3188944160938263, + "learning_rate": 3.9559007280128105e-05, + "loss": 0.2173, + "step": 6986 + }, + { + "epoch": 1.4140862173649058, + "grad_norm": 0.27802667021751404, + "learning_rate": 3.9533670421331314e-05, + "loss": 0.1775, + "step": 6987 + }, + { + "epoch": 1.4142886055454362, + "grad_norm": 0.27725279331207275, + "learning_rate": 3.9508339679883276e-05, + "loss": 0.1848, + "step": 6988 + }, + { + "epoch": 1.4144909937259664, + "grad_norm": 0.25943297147750854, + "learning_rate": 3.948301505834671e-05, + "loss": 0.1987, + "step": 6989 + }, + { + "epoch": 1.4146933819064966, + "grad_norm": 0.3306175768375397, + "learning_rate": 3.9457696559283674e-05, + "loss": 0.186, + "step": 6990 + }, + { + "epoch": 1.414895770087027, + "grad_norm": 0.2895580530166626, + "learning_rate": 3.9432384185255635e-05, + "loss": 0.1855, + "step": 6991 + }, + { + "epoch": 1.4150981582675572, + "grad_norm": 0.26445573568344116, + "learning_rate": 3.940707793882344e-05, + "loss": 0.1952, + "step": 6992 + }, + { + "epoch": 1.4153005464480874, + "grad_norm": 0.28116846084594727, + "learning_rate": 3.9381777822547305e-05, + "loss": 0.1798, + "step": 6993 + }, + { + "epoch": 1.4155029346286176, + "grad_norm": 0.2855150103569031, + "learning_rate": 3.935648383898683e-05, + "loss": 0.1667, + "step": 6994 + }, + { + "epoch": 1.4157053228091478, + "grad_norm": 0.25917181372642517, + "learning_rate": 3.9331195990701e-05, + "loss": 0.1764, + "step": 6995 + }, + { + "epoch": 1.4159077109896783, + "grad_norm": 0.3372366726398468, + "learning_rate": 3.930591428024816e-05, + "loss": 0.203, + "step": 6996 + }, + { + "epoch": 1.4161100991702085, + "grad_norm": 0.32218167185783386, + "learning_rate": 3.9280638710186056e-05, + "loss": 0.2263, + "step": 6997 + }, + { + "epoch": 1.4163124873507387, + "grad_norm": 0.2682129442691803, + "learning_rate": 3.925536928307181e-05, + "loss": 0.1852, + "step": 6998 + }, + { + "epoch": 1.416514875531269, + "grad_norm": 0.25663620233535767, + "learning_rate": 3.923010600146192e-05, + "loss": 0.1841, + "step": 6999 + }, + { + "epoch": 1.4167172637117993, + "grad_norm": 0.2717934548854828, + "learning_rate": 3.920484886791225e-05, + "loss": 0.1916, + "step": 7000 + }, + { + "epoch": 1.4167172637117993, + "eval_loss": 0.2678060531616211, + "eval_runtime": 0.7392, + "eval_samples_per_second": 6.764, + "eval_steps_per_second": 1.353, + "step": 7000 + }, + { + "epoch": 1.4169196518923295, + "grad_norm": 0.25787022709846497, + "learning_rate": 3.917959788497805e-05, + "loss": 0.1891, + "step": 7001 + }, + { + "epoch": 1.4171220400728597, + "grad_norm": 0.2321719080209732, + "learning_rate": 3.9154353055213955e-05, + "loss": 0.1634, + "step": 7002 + }, + { + "epoch": 1.41732442825339, + "grad_norm": 0.2759595513343811, + "learning_rate": 3.912911438117397e-05, + "loss": 0.1692, + "step": 7003 + }, + { + "epoch": 1.4175268164339203, + "grad_norm": 0.26630786061286926, + "learning_rate": 3.910388186541153e-05, + "loss": 0.1929, + "step": 7004 + }, + { + "epoch": 1.4177292046144505, + "grad_norm": 0.28287503123283386, + "learning_rate": 3.90786555104793e-05, + "loss": 0.2087, + "step": 7005 + }, + { + "epoch": 1.4179315927949807, + "grad_norm": 0.2573223114013672, + "learning_rate": 3.9053435318929464e-05, + "loss": 0.1451, + "step": 7006 + }, + { + "epoch": 1.4181339809755111, + "grad_norm": 0.23673537373542786, + "learning_rate": 3.902822129331355e-05, + "loss": 0.1607, + "step": 7007 + }, + { + "epoch": 1.4183363691560413, + "grad_norm": 0.26111093163490295, + "learning_rate": 3.900301343618242e-05, + "loss": 0.2106, + "step": 7008 + }, + { + "epoch": 1.4185387573365715, + "grad_norm": 0.27173036336898804, + "learning_rate": 3.897781175008637e-05, + "loss": 0.1738, + "step": 7009 + }, + { + "epoch": 1.4187411455171017, + "grad_norm": 0.300983190536499, + "learning_rate": 3.895261623757502e-05, + "loss": 0.1811, + "step": 7010 + }, + { + "epoch": 1.418943533697632, + "grad_norm": 0.2922830581665039, + "learning_rate": 3.89274269011974e-05, + "loss": 0.2008, + "step": 7011 + }, + { + "epoch": 1.4191459218781624, + "grad_norm": 0.2900276482105255, + "learning_rate": 3.89022437435019e-05, + "loss": 0.187, + "step": 7012 + }, + { + "epoch": 1.4193483100586926, + "grad_norm": 0.30275261402130127, + "learning_rate": 3.887706676703628e-05, + "loss": 0.1927, + "step": 7013 + }, + { + "epoch": 1.4195506982392228, + "grad_norm": 0.24182263016700745, + "learning_rate": 3.88518959743477e-05, + "loss": 0.1538, + "step": 7014 + }, + { + "epoch": 1.4197530864197532, + "grad_norm": 0.2707059383392334, + "learning_rate": 3.882673136798265e-05, + "loss": 0.1697, + "step": 7015 + }, + { + "epoch": 1.4199554746002834, + "grad_norm": 0.25626611709594727, + "learning_rate": 3.880157295048704e-05, + "loss": 0.183, + "step": 7016 + }, + { + "epoch": 1.4201578627808136, + "grad_norm": 0.3871743083000183, + "learning_rate": 3.8776420724406136e-05, + "loss": 0.1843, + "step": 7017 + }, + { + "epoch": 1.4203602509613438, + "grad_norm": 0.29843243956565857, + "learning_rate": 3.875127469228458e-05, + "loss": 0.2119, + "step": 7018 + }, + { + "epoch": 1.4205626391418742, + "grad_norm": 0.2737175524234772, + "learning_rate": 3.872613485666636e-05, + "loss": 0.1581, + "step": 7019 + }, + { + "epoch": 1.4207650273224044, + "grad_norm": 0.2567245364189148, + "learning_rate": 3.870100122009488e-05, + "loss": 0.1756, + "step": 7020 + }, + { + "epoch": 1.4209674155029346, + "grad_norm": 0.2890596389770508, + "learning_rate": 3.867587378511291e-05, + "loss": 0.1906, + "step": 7021 + }, + { + "epoch": 1.421169803683465, + "grad_norm": 0.3265067934989929, + "learning_rate": 3.8650752554262536e-05, + "loss": 0.2011, + "step": 7022 + }, + { + "epoch": 1.4213721918639952, + "grad_norm": 0.30416449904441833, + "learning_rate": 3.86256375300853e-05, + "loss": 0.2116, + "step": 7023 + }, + { + "epoch": 1.4215745800445254, + "grad_norm": 0.2644321024417877, + "learning_rate": 3.8600528715122074e-05, + "loss": 0.2059, + "step": 7024 + }, + { + "epoch": 1.4217769682250556, + "grad_norm": 0.2900888919830322, + "learning_rate": 3.8575426111913084e-05, + "loss": 0.1599, + "step": 7025 + }, + { + "epoch": 1.4219793564055858, + "grad_norm": 0.2844480574131012, + "learning_rate": 3.855032972299797e-05, + "loss": 0.2083, + "step": 7026 + }, + { + "epoch": 1.4221817445861162, + "grad_norm": 0.3461550176143646, + "learning_rate": 3.852523955091569e-05, + "loss": 0.2172, + "step": 7027 + }, + { + "epoch": 1.4223841327666464, + "grad_norm": 0.28394609689712524, + "learning_rate": 3.8500155598204644e-05, + "loss": 0.2022, + "step": 7028 + }, + { + "epoch": 1.4225865209471766, + "grad_norm": 0.3086096942424774, + "learning_rate": 3.847507786740254e-05, + "loss": 0.2031, + "step": 7029 + }, + { + "epoch": 1.422788909127707, + "grad_norm": 0.31477683782577515, + "learning_rate": 3.845000636104649e-05, + "loss": 0.1966, + "step": 7030 + }, + { + "epoch": 1.4229912973082373, + "grad_norm": 0.27445054054260254, + "learning_rate": 3.842494108167294e-05, + "loss": 0.1893, + "step": 7031 + }, + { + "epoch": 1.4231936854887675, + "grad_norm": 0.29484322667121887, + "learning_rate": 3.839988203181777e-05, + "loss": 0.1978, + "step": 7032 + }, + { + "epoch": 1.4233960736692977, + "grad_norm": 0.33337923884391785, + "learning_rate": 3.837482921401616e-05, + "loss": 0.1961, + "step": 7033 + }, + { + "epoch": 1.4235984618498279, + "grad_norm": 0.2541923224925995, + "learning_rate": 3.834978263080271e-05, + "loss": 0.1719, + "step": 7034 + }, + { + "epoch": 1.4238008500303583, + "grad_norm": 0.28883957862854004, + "learning_rate": 3.8324742284711366e-05, + "loss": 0.1913, + "step": 7035 + }, + { + "epoch": 1.4240032382108885, + "grad_norm": 0.31055620312690735, + "learning_rate": 3.829970817827545e-05, + "loss": 0.2066, + "step": 7036 + }, + { + "epoch": 1.4242056263914187, + "grad_norm": 0.2655206620693207, + "learning_rate": 3.8274680314027646e-05, + "loss": 0.1796, + "step": 7037 + }, + { + "epoch": 1.424408014571949, + "grad_norm": 0.28076305985450745, + "learning_rate": 3.824965869450001e-05, + "loss": 0.1964, + "step": 7038 + }, + { + "epoch": 1.4246104027524793, + "grad_norm": 0.314505398273468, + "learning_rate": 3.822464332222396e-05, + "loss": 0.2218, + "step": 7039 + }, + { + "epoch": 1.4248127909330095, + "grad_norm": 0.30106157064437866, + "learning_rate": 3.819963419973031e-05, + "loss": 0.2023, + "step": 7040 + }, + { + "epoch": 1.4250151791135397, + "grad_norm": 0.2859259247779846, + "learning_rate": 3.8174631329549203e-05, + "loss": 0.1928, + "step": 7041 + }, + { + "epoch": 1.42521756729407, + "grad_norm": 0.2816571295261383, + "learning_rate": 3.814963471421017e-05, + "loss": 0.1849, + "step": 7042 + }, + { + "epoch": 1.4254199554746003, + "grad_norm": 0.27684688568115234, + "learning_rate": 3.812464435624211e-05, + "loss": 0.1767, + "step": 7043 + }, + { + "epoch": 1.4256223436551305, + "grad_norm": 0.2891367971897125, + "learning_rate": 3.8099660258173285e-05, + "loss": 0.1881, + "step": 7044 + }, + { + "epoch": 1.4258247318356607, + "grad_norm": 0.30000269412994385, + "learning_rate": 3.8074682422531314e-05, + "loss": 0.1921, + "step": 7045 + }, + { + "epoch": 1.4260271200161911, + "grad_norm": 0.2510431706905365, + "learning_rate": 3.804971085184321e-05, + "loss": 0.1945, + "step": 7046 + }, + { + "epoch": 1.4262295081967213, + "grad_norm": 0.3182806968688965, + "learning_rate": 3.802474554863532e-05, + "loss": 0.1756, + "step": 7047 + }, + { + "epoch": 1.4264318963772515, + "grad_norm": 0.29959988594055176, + "learning_rate": 3.799978651543341e-05, + "loss": 0.2023, + "step": 7048 + }, + { + "epoch": 1.4266342845577817, + "grad_norm": 0.3611461818218231, + "learning_rate": 3.797483375476251e-05, + "loss": 0.2177, + "step": 7049 + }, + { + "epoch": 1.4268366727383122, + "grad_norm": 0.28023532032966614, + "learning_rate": 3.79498872691471e-05, + "loss": 0.2006, + "step": 7050 + }, + { + "epoch": 1.4268366727383122, + "eval_loss": 0.26923832297325134, + "eval_runtime": 0.7415, + "eval_samples_per_second": 6.743, + "eval_steps_per_second": 1.349, + "step": 7050 + }, + { + "epoch": 1.4270390609188424, + "grad_norm": 0.3193739652633667, + "learning_rate": 3.792494706111102e-05, + "loss": 0.2008, + "step": 7051 + }, + { + "epoch": 1.4272414490993726, + "grad_norm": 0.29817917943000793, + "learning_rate": 3.790001313317745e-05, + "loss": 0.1809, + "step": 7052 + }, + { + "epoch": 1.427443837279903, + "grad_norm": 0.2573106288909912, + "learning_rate": 3.787508548786893e-05, + "loss": 0.189, + "step": 7053 + }, + { + "epoch": 1.4276462254604332, + "grad_norm": 0.2731283903121948, + "learning_rate": 3.785016412770741e-05, + "loss": 0.1973, + "step": 7054 + }, + { + "epoch": 1.4278486136409634, + "grad_norm": 0.3203198313713074, + "learning_rate": 3.782524905521414e-05, + "loss": 0.2265, + "step": 7055 + }, + { + "epoch": 1.4280510018214936, + "grad_norm": 0.2824562191963196, + "learning_rate": 3.780034027290978e-05, + "loss": 0.1963, + "step": 7056 + }, + { + "epoch": 1.4282533900020238, + "grad_norm": 0.2550790309906006, + "learning_rate": 3.777543778331435e-05, + "loss": 0.1694, + "step": 7057 + }, + { + "epoch": 1.4284557781825542, + "grad_norm": 0.30352532863616943, + "learning_rate": 3.7750541588947195e-05, + "loss": 0.1808, + "step": 7058 + }, + { + "epoch": 1.4286581663630844, + "grad_norm": 0.2527769207954407, + "learning_rate": 3.772565169232707e-05, + "loss": 0.1661, + "step": 7059 + }, + { + "epoch": 1.4288605545436146, + "grad_norm": 0.30140969157218933, + "learning_rate": 3.7700768095972074e-05, + "loss": 0.176, + "step": 7060 + }, + { + "epoch": 1.429062942724145, + "grad_norm": 0.318043977022171, + "learning_rate": 3.767589080239966e-05, + "loss": 0.189, + "step": 7061 + }, + { + "epoch": 1.4292653309046752, + "grad_norm": 0.32130885124206543, + "learning_rate": 3.7651019814126654e-05, + "loss": 0.2084, + "step": 7062 + }, + { + "epoch": 1.4294677190852054, + "grad_norm": 0.30957019329071045, + "learning_rate": 3.762615513366925e-05, + "loss": 0.2183, + "step": 7063 + }, + { + "epoch": 1.4296701072657356, + "grad_norm": 0.31902942061424255, + "learning_rate": 3.760129676354298e-05, + "loss": 0.1922, + "step": 7064 + }, + { + "epoch": 1.4298724954462658, + "grad_norm": 0.26623645424842834, + "learning_rate": 3.757644470626276e-05, + "loss": 0.1567, + "step": 7065 + }, + { + "epoch": 1.4300748836267962, + "grad_norm": 0.2927635610103607, + "learning_rate": 3.755159896434287e-05, + "loss": 0.1668, + "step": 7066 + }, + { + "epoch": 1.4302772718073264, + "grad_norm": 0.28568488359451294, + "learning_rate": 3.752675954029693e-05, + "loss": 0.1999, + "step": 7067 + }, + { + "epoch": 1.4304796599878566, + "grad_norm": 0.25858959555625916, + "learning_rate": 3.7501926436637934e-05, + "loss": 0.1602, + "step": 7068 + }, + { + "epoch": 1.430682048168387, + "grad_norm": 0.28040850162506104, + "learning_rate": 3.7477099655878236e-05, + "loss": 0.1841, + "step": 7069 + }, + { + "epoch": 1.4308844363489173, + "grad_norm": 0.27585768699645996, + "learning_rate": 3.7452279200529585e-05, + "loss": 0.1719, + "step": 7070 + }, + { + "epoch": 1.4310868245294475, + "grad_norm": 0.3029848337173462, + "learning_rate": 3.742746507310299e-05, + "loss": 0.1921, + "step": 7071 + }, + { + "epoch": 1.4312892127099777, + "grad_norm": 0.308578759431839, + "learning_rate": 3.74026572761089e-05, + "loss": 0.2057, + "step": 7072 + }, + { + "epoch": 1.4314916008905079, + "grad_norm": 0.3463033139705658, + "learning_rate": 3.737785581205713e-05, + "loss": 0.1898, + "step": 7073 + }, + { + "epoch": 1.4316939890710383, + "grad_norm": 0.3103969693183899, + "learning_rate": 3.735306068345681e-05, + "loss": 0.1944, + "step": 7074 + }, + { + "epoch": 1.4318963772515685, + "grad_norm": 0.2698296904563904, + "learning_rate": 3.732827189281647e-05, + "loss": 0.1753, + "step": 7075 + }, + { + "epoch": 1.4320987654320987, + "grad_norm": 0.2952782213687897, + "learning_rate": 3.730348944264398e-05, + "loss": 0.2036, + "step": 7076 + }, + { + "epoch": 1.432301153612629, + "grad_norm": 0.24207934737205505, + "learning_rate": 3.7278713335446557e-05, + "loss": 0.1865, + "step": 7077 + }, + { + "epoch": 1.4325035417931593, + "grad_norm": 0.26546210050582886, + "learning_rate": 3.7253943573730784e-05, + "loss": 0.1585, + "step": 7078 + }, + { + "epoch": 1.4327059299736895, + "grad_norm": 0.3293583393096924, + "learning_rate": 3.722918016000263e-05, + "loss": 0.1835, + "step": 7079 + }, + { + "epoch": 1.4329083181542197, + "grad_norm": 0.319829523563385, + "learning_rate": 3.720442309676733e-05, + "loss": 0.1847, + "step": 7080 + }, + { + "epoch": 1.4331107063347501, + "grad_norm": 0.2597779929637909, + "learning_rate": 3.717967238652964e-05, + "loss": 0.1772, + "step": 7081 + }, + { + "epoch": 1.4333130945152803, + "grad_norm": 0.23758916556835175, + "learning_rate": 3.7154928031793526e-05, + "loss": 0.1486, + "step": 7082 + }, + { + "epoch": 1.4335154826958105, + "grad_norm": 0.3300015926361084, + "learning_rate": 3.713019003506237e-05, + "loss": 0.1764, + "step": 7083 + }, + { + "epoch": 1.433717870876341, + "grad_norm": 0.3280850350856781, + "learning_rate": 3.71054583988389e-05, + "loss": 0.192, + "step": 7084 + }, + { + "epoch": 1.4339202590568711, + "grad_norm": 0.241069957613945, + "learning_rate": 3.70807331256252e-05, + "loss": 0.1598, + "step": 7085 + }, + { + "epoch": 1.4341226472374013, + "grad_norm": 0.29670462012290955, + "learning_rate": 3.705601421792273e-05, + "loss": 0.2229, + "step": 7086 + }, + { + "epoch": 1.4343250354179315, + "grad_norm": 0.2805311977863312, + "learning_rate": 3.7031301678232266e-05, + "loss": 0.1959, + "step": 7087 + }, + { + "epoch": 1.4345274235984617, + "grad_norm": 0.2845379114151001, + "learning_rate": 3.700659550905398e-05, + "loss": 0.1719, + "step": 7088 + }, + { + "epoch": 1.4347298117789922, + "grad_norm": 0.27402937412261963, + "learning_rate": 3.698189571288737e-05, + "loss": 0.1479, + "step": 7089 + }, + { + "epoch": 1.4349321999595224, + "grad_norm": 0.37877580523490906, + "learning_rate": 3.695720229223132e-05, + "loss": 0.2259, + "step": 7090 + }, + { + "epoch": 1.4351345881400526, + "grad_norm": 0.32928740978240967, + "learning_rate": 3.6932515249584045e-05, + "loss": 0.2107, + "step": 7091 + }, + { + "epoch": 1.435336976320583, + "grad_norm": 0.29471495747566223, + "learning_rate": 3.690783458744311e-05, + "loss": 0.2261, + "step": 7092 + }, + { + "epoch": 1.4355393645011132, + "grad_norm": 0.31165096163749695, + "learning_rate": 3.688316030830549e-05, + "loss": 0.1891, + "step": 7093 + }, + { + "epoch": 1.4357417526816434, + "grad_norm": 0.4806102514266968, + "learning_rate": 3.685849241466739e-05, + "loss": 0.2203, + "step": 7094 + }, + { + "epoch": 1.4359441408621736, + "grad_norm": 0.25433045625686646, + "learning_rate": 3.6833830909024505e-05, + "loss": 0.1395, + "step": 7095 + }, + { + "epoch": 1.4361465290427038, + "grad_norm": 0.2732140123844147, + "learning_rate": 3.680917579387181e-05, + "loss": 0.1718, + "step": 7096 + }, + { + "epoch": 1.4363489172232342, + "grad_norm": 0.3305196166038513, + "learning_rate": 3.678452707170364e-05, + "loss": 0.2393, + "step": 7097 + }, + { + "epoch": 1.4365513054037644, + "grad_norm": 0.2531237304210663, + "learning_rate": 3.675988474501373e-05, + "loss": 0.1803, + "step": 7098 + }, + { + "epoch": 1.4367536935842946, + "grad_norm": 0.2620668113231659, + "learning_rate": 3.6735248816295096e-05, + "loss": 0.1948, + "step": 7099 + }, + { + "epoch": 1.436956081764825, + "grad_norm": 0.2772290110588074, + "learning_rate": 3.671061928804016e-05, + "loss": 0.1965, + "step": 7100 + }, + { + "epoch": 1.436956081764825, + "eval_loss": 0.2654963731765747, + "eval_runtime": 0.7381, + "eval_samples_per_second": 6.775, + "eval_steps_per_second": 1.355, + "step": 7100 + }, + { + "epoch": 1.4371584699453552, + "grad_norm": 0.3738824427127838, + "learning_rate": 3.6685996162740674e-05, + "loss": 0.2033, + "step": 7101 + }, + { + "epoch": 1.4373608581258854, + "grad_norm": 0.29637742042541504, + "learning_rate": 3.6661379442887755e-05, + "loss": 0.197, + "step": 7102 + }, + { + "epoch": 1.4375632463064156, + "grad_norm": 0.284424364566803, + "learning_rate": 3.663676913097186e-05, + "loss": 0.1912, + "step": 7103 + }, + { + "epoch": 1.4377656344869458, + "grad_norm": 0.3330419957637787, + "learning_rate": 3.66121652294828e-05, + "loss": 0.1742, + "step": 7104 + }, + { + "epoch": 1.4379680226674763, + "grad_norm": 0.2758113741874695, + "learning_rate": 3.6587567740909746e-05, + "loss": 0.1907, + "step": 7105 + }, + { + "epoch": 1.4381704108480065, + "grad_norm": 0.2782094478607178, + "learning_rate": 3.65629766677412e-05, + "loss": 0.1927, + "step": 7106 + }, + { + "epoch": 1.4383727990285367, + "grad_norm": 0.3029071092605591, + "learning_rate": 3.653839201246504e-05, + "loss": 0.2061, + "step": 7107 + }, + { + "epoch": 1.438575187209067, + "grad_norm": 0.3084353506565094, + "learning_rate": 3.6513813777568485e-05, + "loss": 0.2026, + "step": 7108 + }, + { + "epoch": 1.4387775753895973, + "grad_norm": 0.31316351890563965, + "learning_rate": 3.648924196553809e-05, + "loss": 0.202, + "step": 7109 + }, + { + "epoch": 1.4389799635701275, + "grad_norm": 0.31355270743370056, + "learning_rate": 3.646467657885979e-05, + "loss": 0.1952, + "step": 7110 + }, + { + "epoch": 1.4391823517506577, + "grad_norm": 0.29359108209609985, + "learning_rate": 3.6440117620018844e-05, + "loss": 0.1893, + "step": 7111 + }, + { + "epoch": 1.439384739931188, + "grad_norm": 0.27208781242370605, + "learning_rate": 3.641556509149987e-05, + "loss": 0.1721, + "step": 7112 + }, + { + "epoch": 1.4395871281117183, + "grad_norm": 0.27961140871047974, + "learning_rate": 3.639101899578684e-05, + "loss": 0.1964, + "step": 7113 + }, + { + "epoch": 1.4397895162922485, + "grad_norm": 0.2883656322956085, + "learning_rate": 3.636647933536306e-05, + "loss": 0.1949, + "step": 7114 + }, + { + "epoch": 1.439991904472779, + "grad_norm": 0.29237887263298035, + "learning_rate": 3.634194611271124e-05, + "loss": 0.222, + "step": 7115 + }, + { + "epoch": 1.4401942926533091, + "grad_norm": 0.5512852072715759, + "learning_rate": 3.6317419330313316e-05, + "loss": 0.201, + "step": 7116 + }, + { + "epoch": 1.4403966808338393, + "grad_norm": 0.3003794550895691, + "learning_rate": 3.6292898990650704e-05, + "loss": 0.1926, + "step": 7117 + }, + { + "epoch": 1.4405990690143695, + "grad_norm": 0.30134889483451843, + "learning_rate": 3.62683850962041e-05, + "loss": 0.214, + "step": 7118 + }, + { + "epoch": 1.4408014571948997, + "grad_norm": 0.31887561082839966, + "learning_rate": 3.624387764945355e-05, + "loss": 0.1939, + "step": 7119 + }, + { + "epoch": 1.4410038453754301, + "grad_norm": 0.30428701639175415, + "learning_rate": 3.6219376652878476e-05, + "loss": 0.2034, + "step": 7120 + }, + { + "epoch": 1.4412062335559603, + "grad_norm": 0.2859341502189636, + "learning_rate": 3.619488210895763e-05, + "loss": 0.1735, + "step": 7121 + }, + { + "epoch": 1.4414086217364905, + "grad_norm": 0.3013683259487152, + "learning_rate": 3.617039402016912e-05, + "loss": 0.2124, + "step": 7122 + }, + { + "epoch": 1.441611009917021, + "grad_norm": 0.28692567348480225, + "learning_rate": 3.614591238899039e-05, + "loss": 0.2221, + "step": 7123 + }, + { + "epoch": 1.4418133980975512, + "grad_norm": 0.3107813894748688, + "learning_rate": 3.612143721789821e-05, + "loss": 0.1963, + "step": 7124 + }, + { + "epoch": 1.4420157862780814, + "grad_norm": 0.2541584074497223, + "learning_rate": 3.609696850936877e-05, + "loss": 0.1875, + "step": 7125 + }, + { + "epoch": 1.4422181744586116, + "grad_norm": 0.2540287673473358, + "learning_rate": 3.607250626587752e-05, + "loss": 0.1891, + "step": 7126 + }, + { + "epoch": 1.4424205626391418, + "grad_norm": 0.2570984661579132, + "learning_rate": 3.604805048989929e-05, + "loss": 0.1861, + "step": 7127 + }, + { + "epoch": 1.4426229508196722, + "grad_norm": 0.23033881187438965, + "learning_rate": 3.602360118390828e-05, + "loss": 0.1512, + "step": 7128 + }, + { + "epoch": 1.4428253390002024, + "grad_norm": 0.25252917408943176, + "learning_rate": 3.5999158350378e-05, + "loss": 0.1603, + "step": 7129 + }, + { + "epoch": 1.4430277271807326, + "grad_norm": 0.276273250579834, + "learning_rate": 3.5974721991781334e-05, + "loss": 0.2062, + "step": 7130 + }, + { + "epoch": 1.443230115361263, + "grad_norm": 0.3179681897163391, + "learning_rate": 3.595029211059049e-05, + "loss": 0.2001, + "step": 7131 + }, + { + "epoch": 1.4434325035417932, + "grad_norm": 0.25583258271217346, + "learning_rate": 3.592586870927701e-05, + "loss": 0.1672, + "step": 7132 + }, + { + "epoch": 1.4436348917223234, + "grad_norm": 0.2345746010541916, + "learning_rate": 3.590145179031183e-05, + "loss": 0.1596, + "step": 7133 + }, + { + "epoch": 1.4438372799028536, + "grad_norm": 0.26826149225234985, + "learning_rate": 3.5877041356165165e-05, + "loss": 0.156, + "step": 7134 + }, + { + "epoch": 1.4440396680833838, + "grad_norm": 0.2847994863986969, + "learning_rate": 3.585263740930662e-05, + "loss": 0.1912, + "step": 7135 + }, + { + "epoch": 1.4442420562639142, + "grad_norm": 0.2812890410423279, + "learning_rate": 3.5828239952205136e-05, + "loss": 0.1913, + "step": 7136 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.26837995648384094, + "learning_rate": 3.580384898732899e-05, + "loss": 0.1794, + "step": 7137 + }, + { + "epoch": 1.4446468326249746, + "grad_norm": 0.28359875082969666, + "learning_rate": 3.57794645171458e-05, + "loss": 0.2171, + "step": 7138 + }, + { + "epoch": 1.444849220805505, + "grad_norm": 0.2712598443031311, + "learning_rate": 3.575508654412253e-05, + "loss": 0.1673, + "step": 7139 + }, + { + "epoch": 1.4450516089860352, + "grad_norm": 0.31262731552124023, + "learning_rate": 3.5730715070725483e-05, + "loss": 0.195, + "step": 7140 + }, + { + "epoch": 1.4452539971665654, + "grad_norm": 0.2699423134326935, + "learning_rate": 3.570635009942033e-05, + "loss": 0.1895, + "step": 7141 + }, + { + "epoch": 1.4454563853470956, + "grad_norm": 0.35048073530197144, + "learning_rate": 3.568199163267203e-05, + "loss": 0.2072, + "step": 7142 + }, + { + "epoch": 1.445658773527626, + "grad_norm": 0.3334614038467407, + "learning_rate": 3.565763967294495e-05, + "loss": 0.2004, + "step": 7143 + }, + { + "epoch": 1.4458611617081563, + "grad_norm": 0.3023090362548828, + "learning_rate": 3.563329422270274e-05, + "loss": 0.2038, + "step": 7144 + }, + { + "epoch": 1.4460635498886865, + "grad_norm": 0.2510114908218384, + "learning_rate": 3.5608955284408443e-05, + "loss": 0.1772, + "step": 7145 + }, + { + "epoch": 1.4462659380692169, + "grad_norm": 0.2934434115886688, + "learning_rate": 3.5584622860524385e-05, + "loss": 0.2137, + "step": 7146 + }, + { + "epoch": 1.446468326249747, + "grad_norm": 0.2533113360404968, + "learning_rate": 3.5560296953512295e-05, + "loss": 0.1719, + "step": 7147 + }, + { + "epoch": 1.4466707144302773, + "grad_norm": 0.2543116807937622, + "learning_rate": 3.55359775658332e-05, + "loss": 0.1822, + "step": 7148 + }, + { + "epoch": 1.4468731026108075, + "grad_norm": 0.3006589710712433, + "learning_rate": 3.551166469994748e-05, + "loss": 0.1703, + "step": 7149 + }, + { + "epoch": 1.4470754907913377, + "grad_norm": 0.28558582067489624, + "learning_rate": 3.548735835831486e-05, + "loss": 0.1909, + "step": 7150 + }, + { + "epoch": 1.4470754907913377, + "eval_loss": 0.2670600414276123, + "eval_runtime": 0.7391, + "eval_samples_per_second": 6.765, + "eval_steps_per_second": 1.353, + "step": 7150 + }, + { + "epoch": 1.447277878971868, + "grad_norm": 0.3041171431541443, + "learning_rate": 3.546305854339439e-05, + "loss": 0.2061, + "step": 7151 + }, + { + "epoch": 1.4474802671523983, + "grad_norm": 0.2890252470970154, + "learning_rate": 3.543876525764449e-05, + "loss": 0.1843, + "step": 7152 + }, + { + "epoch": 1.4476826553329285, + "grad_norm": 0.40670862793922424, + "learning_rate": 3.5414478503522873e-05, + "loss": 0.2172, + "step": 7153 + }, + { + "epoch": 1.447885043513459, + "grad_norm": 0.25703248381614685, + "learning_rate": 3.5390198283486654e-05, + "loss": 0.187, + "step": 7154 + }, + { + "epoch": 1.4480874316939891, + "grad_norm": 0.31184110045433044, + "learning_rate": 3.536592459999221e-05, + "loss": 0.2156, + "step": 7155 + }, + { + "epoch": 1.4482898198745193, + "grad_norm": 0.26967254281044006, + "learning_rate": 3.5341657455495325e-05, + "loss": 0.1816, + "step": 7156 + }, + { + "epoch": 1.4484922080550495, + "grad_norm": 0.3085002601146698, + "learning_rate": 3.531739685245109e-05, + "loss": 0.2036, + "step": 7157 + }, + { + "epoch": 1.4486945962355797, + "grad_norm": 0.25523659586906433, + "learning_rate": 3.5293142793313925e-05, + "loss": 0.1647, + "step": 7158 + }, + { + "epoch": 1.4488969844161101, + "grad_norm": 0.281577467918396, + "learning_rate": 3.526889528053765e-05, + "loss": 0.2121, + "step": 7159 + }, + { + "epoch": 1.4490993725966403, + "grad_norm": 0.2778119444847107, + "learning_rate": 3.52446543165753e-05, + "loss": 0.1907, + "step": 7160 + }, + { + "epoch": 1.4493017607771705, + "grad_norm": 0.3081774413585663, + "learning_rate": 3.522041990387935e-05, + "loss": 0.1941, + "step": 7161 + }, + { + "epoch": 1.449504148957701, + "grad_norm": 0.44540053606033325, + "learning_rate": 3.519619204490161e-05, + "loss": 0.1423, + "step": 7162 + }, + { + "epoch": 1.4497065371382312, + "grad_norm": 0.31192874908447266, + "learning_rate": 3.517197074209316e-05, + "loss": 0.193, + "step": 7163 + }, + { + "epoch": 1.4499089253187614, + "grad_norm": 0.3415822982788086, + "learning_rate": 3.514775599790448e-05, + "loss": 0.2248, + "step": 7164 + }, + { + "epoch": 1.4501113134992916, + "grad_norm": 0.2829976975917816, + "learning_rate": 3.512354781478537e-05, + "loss": 0.197, + "step": 7165 + }, + { + "epoch": 1.4503137016798218, + "grad_norm": 0.2971900701522827, + "learning_rate": 3.509934619518494e-05, + "loss": 0.193, + "step": 7166 + }, + { + "epoch": 1.4505160898603522, + "grad_norm": 0.2710355818271637, + "learning_rate": 3.5075151141551686e-05, + "loss": 0.1712, + "step": 7167 + }, + { + "epoch": 1.4507184780408824, + "grad_norm": 0.27177128195762634, + "learning_rate": 3.5050962656333376e-05, + "loss": 0.1729, + "step": 7168 + }, + { + "epoch": 1.4509208662214128, + "grad_norm": 0.290659099817276, + "learning_rate": 3.502678074197716e-05, + "loss": 0.177, + "step": 7169 + }, + { + "epoch": 1.451123254401943, + "grad_norm": 0.29393497109413147, + "learning_rate": 3.500260540092952e-05, + "loss": 0.2, + "step": 7170 + }, + { + "epoch": 1.4513256425824732, + "grad_norm": 0.2999376952648163, + "learning_rate": 3.497843663563626e-05, + "loss": 0.1784, + "step": 7171 + }, + { + "epoch": 1.4515280307630034, + "grad_norm": 0.2778777778148651, + "learning_rate": 3.49542744485425e-05, + "loss": 0.1785, + "step": 7172 + }, + { + "epoch": 1.4517304189435336, + "grad_norm": 0.28878605365753174, + "learning_rate": 3.493011884209275e-05, + "loss": 0.1847, + "step": 7173 + }, + { + "epoch": 1.451932807124064, + "grad_norm": 0.3202642798423767, + "learning_rate": 3.49059698187308e-05, + "loss": 0.1723, + "step": 7174 + }, + { + "epoch": 1.4521351953045942, + "grad_norm": 0.26450544595718384, + "learning_rate": 3.48818273808998e-05, + "loss": 0.1764, + "step": 7175 + }, + { + "epoch": 1.4523375834851244, + "grad_norm": 0.3330189883708954, + "learning_rate": 3.485769153104222e-05, + "loss": 0.2002, + "step": 7176 + }, + { + "epoch": 1.4525399716656548, + "grad_norm": 0.2631731927394867, + "learning_rate": 3.4833562271599896e-05, + "loss": 0.176, + "step": 7177 + }, + { + "epoch": 1.452742359846185, + "grad_norm": 0.2750820219516754, + "learning_rate": 3.480943960501395e-05, + "loss": 0.1472, + "step": 7178 + }, + { + "epoch": 1.4529447480267152, + "grad_norm": 0.26182010769844055, + "learning_rate": 3.478532353372487e-05, + "loss": 0.1669, + "step": 7179 + }, + { + "epoch": 1.4531471362072454, + "grad_norm": 0.3083231747150421, + "learning_rate": 3.476121406017246e-05, + "loss": 0.2078, + "step": 7180 + }, + { + "epoch": 1.4533495243877756, + "grad_norm": 0.2545689344406128, + "learning_rate": 3.473711118679587e-05, + "loss": 0.156, + "step": 7181 + }, + { + "epoch": 1.453551912568306, + "grad_norm": 0.2764873206615448, + "learning_rate": 3.471301491603358e-05, + "loss": 0.1938, + "step": 7182 + }, + { + "epoch": 1.4537543007488363, + "grad_norm": 0.3199172616004944, + "learning_rate": 3.468892525032339e-05, + "loss": 0.2067, + "step": 7183 + }, + { + "epoch": 1.4539566889293665, + "grad_norm": 0.27605000138282776, + "learning_rate": 3.466484219210244e-05, + "loss": 0.1767, + "step": 7184 + }, + { + "epoch": 1.454159077109897, + "grad_norm": 0.2564321458339691, + "learning_rate": 3.46407657438072e-05, + "loss": 0.1897, + "step": 7185 + }, + { + "epoch": 1.454361465290427, + "grad_norm": 0.28144845366477966, + "learning_rate": 3.461669590787348e-05, + "loss": 0.2016, + "step": 7186 + }, + { + "epoch": 1.4545638534709573, + "grad_norm": 0.317074716091156, + "learning_rate": 3.4592632686736406e-05, + "loss": 0.2276, + "step": 7187 + }, + { + "epoch": 1.4547662416514875, + "grad_norm": 0.2932002544403076, + "learning_rate": 3.456857608283045e-05, + "loss": 0.2215, + "step": 7188 + }, + { + "epoch": 1.4549686298320177, + "grad_norm": 0.2682000994682312, + "learning_rate": 3.454452609858939e-05, + "loss": 0.201, + "step": 7189 + }, + { + "epoch": 1.4551710180125481, + "grad_norm": 0.27240079641342163, + "learning_rate": 3.452048273644638e-05, + "loss": 0.1955, + "step": 7190 + }, + { + "epoch": 1.4553734061930783, + "grad_norm": 0.2567487359046936, + "learning_rate": 3.449644599883385e-05, + "loss": 0.1824, + "step": 7191 + }, + { + "epoch": 1.4555757943736085, + "grad_norm": 0.30283913016319275, + "learning_rate": 3.447241588818358e-05, + "loss": 0.2202, + "step": 7192 + }, + { + "epoch": 1.455778182554139, + "grad_norm": 0.2953677773475647, + "learning_rate": 3.444839240692671e-05, + "loss": 0.1909, + "step": 7193 + }, + { + "epoch": 1.4559805707346691, + "grad_norm": 0.28766822814941406, + "learning_rate": 3.4424375557493674e-05, + "loss": 0.1699, + "step": 7194 + }, + { + "epoch": 1.4561829589151993, + "grad_norm": 0.267906129360199, + "learning_rate": 3.4400365342314245e-05, + "loss": 0.199, + "step": 7195 + }, + { + "epoch": 1.4563853470957295, + "grad_norm": 0.25349098443984985, + "learning_rate": 3.437636176381751e-05, + "loss": 0.1693, + "step": 7196 + }, + { + "epoch": 1.4565877352762597, + "grad_norm": 0.23480501770973206, + "learning_rate": 3.4352364824431914e-05, + "loss": 0.1416, + "step": 7197 + }, + { + "epoch": 1.4567901234567902, + "grad_norm": 0.3415687680244446, + "learning_rate": 3.4328374526585215e-05, + "loss": 0.2447, + "step": 7198 + }, + { + "epoch": 1.4569925116373204, + "grad_norm": 0.26703810691833496, + "learning_rate": 3.430439087270449e-05, + "loss": 0.1838, + "step": 7199 + }, + { + "epoch": 1.4571948998178508, + "grad_norm": 0.378103107213974, + "learning_rate": 3.428041386521618e-05, + "loss": 0.1944, + "step": 7200 + }, + { + "epoch": 1.4571948998178508, + "eval_loss": 0.265423983335495, + "eval_runtime": 0.7377, + "eval_samples_per_second": 6.778, + "eval_steps_per_second": 1.356, + "step": 7200 + }, + { + "epoch": 1.457397287998381, + "grad_norm": 0.31857749819755554, + "learning_rate": 3.425644350654599e-05, + "loss": 0.1793, + "step": 7201 + }, + { + "epoch": 1.4575996761789112, + "grad_norm": 0.3083222806453705, + "learning_rate": 3.4232479799119e-05, + "loss": 0.1843, + "step": 7202 + }, + { + "epoch": 1.4578020643594414, + "grad_norm": 0.29255810379981995, + "learning_rate": 3.420852274535963e-05, + "loss": 0.1863, + "step": 7203 + }, + { + "epoch": 1.4580044525399716, + "grad_norm": 0.2882266342639923, + "learning_rate": 3.418457234769161e-05, + "loss": 0.2145, + "step": 7204 + }, + { + "epoch": 1.458206840720502, + "grad_norm": 0.2893981337547302, + "learning_rate": 3.4160628608537935e-05, + "loss": 0.1946, + "step": 7205 + }, + { + "epoch": 1.4584092289010322, + "grad_norm": 0.3118157386779785, + "learning_rate": 3.4136691530321016e-05, + "loss": 0.2245, + "step": 7206 + }, + { + "epoch": 1.4586116170815624, + "grad_norm": 0.33280250430107117, + "learning_rate": 3.411276111546254e-05, + "loss": 0.1947, + "step": 7207 + }, + { + "epoch": 1.4588140052620928, + "grad_norm": 0.2869172990322113, + "learning_rate": 3.4088837366383565e-05, + "loss": 0.1946, + "step": 7208 + }, + { + "epoch": 1.459016393442623, + "grad_norm": 0.315771222114563, + "learning_rate": 3.406492028550442e-05, + "loss": 0.2258, + "step": 7209 + }, + { + "epoch": 1.4592187816231532, + "grad_norm": 0.3115472197532654, + "learning_rate": 3.404100987524479e-05, + "loss": 0.1907, + "step": 7210 + }, + { + "epoch": 1.4594211698036834, + "grad_norm": 0.345337450504303, + "learning_rate": 3.401710613802368e-05, + "loss": 0.2074, + "step": 7211 + }, + { + "epoch": 1.4596235579842136, + "grad_norm": 0.3084411919116974, + "learning_rate": 3.399320907625942e-05, + "loss": 0.1961, + "step": 7212 + }, + { + "epoch": 1.459825946164744, + "grad_norm": 0.31509360671043396, + "learning_rate": 3.396931869236967e-05, + "loss": 0.218, + "step": 7213 + }, + { + "epoch": 1.4600283343452742, + "grad_norm": 0.25666624307632446, + "learning_rate": 3.39454349887714e-05, + "loss": 0.1809, + "step": 7214 + }, + { + "epoch": 1.4602307225258044, + "grad_norm": 0.3312767446041107, + "learning_rate": 3.392155796788091e-05, + "loss": 0.2241, + "step": 7215 + }, + { + "epoch": 1.4604331107063349, + "grad_norm": 0.28062111139297485, + "learning_rate": 3.389768763211384e-05, + "loss": 0.1524, + "step": 7216 + }, + { + "epoch": 1.460635498886865, + "grad_norm": 0.24783366918563843, + "learning_rate": 3.387382398388513e-05, + "loss": 0.168, + "step": 7217 + }, + { + "epoch": 1.4608378870673953, + "grad_norm": 0.23416613042354584, + "learning_rate": 3.384996702560905e-05, + "loss": 0.1463, + "step": 7218 + }, + { + "epoch": 1.4610402752479255, + "grad_norm": 0.3014012277126312, + "learning_rate": 3.382611675969921e-05, + "loss": 0.1943, + "step": 7219 + }, + { + "epoch": 1.4612426634284557, + "grad_norm": 0.30629321932792664, + "learning_rate": 3.3802273188568514e-05, + "loss": 0.1925, + "step": 7220 + }, + { + "epoch": 1.461445051608986, + "grad_norm": 0.2842909097671509, + "learning_rate": 3.3778436314629216e-05, + "loss": 0.2063, + "step": 7221 + }, + { + "epoch": 1.4616474397895163, + "grad_norm": 0.28999006748199463, + "learning_rate": 3.3754606140292875e-05, + "loss": 0.1774, + "step": 7222 + }, + { + "epoch": 1.4618498279700465, + "grad_norm": 0.30419832468032837, + "learning_rate": 3.3730782667970375e-05, + "loss": 0.1816, + "step": 7223 + }, + { + "epoch": 1.462052216150577, + "grad_norm": 0.2875352203845978, + "learning_rate": 3.370696590007194e-05, + "loss": 0.2169, + "step": 7224 + }, + { + "epoch": 1.462254604331107, + "grad_norm": 0.40214404463768005, + "learning_rate": 3.3683155839007086e-05, + "loss": 0.206, + "step": 7225 + }, + { + "epoch": 1.4624569925116373, + "grad_norm": 0.2744888365268707, + "learning_rate": 3.36593524871847e-05, + "loss": 0.1899, + "step": 7226 + }, + { + "epoch": 1.4626593806921675, + "grad_norm": 0.28675928711891174, + "learning_rate": 3.363555584701289e-05, + "loss": 0.1886, + "step": 7227 + }, + { + "epoch": 1.4628617688726977, + "grad_norm": 0.25819188356399536, + "learning_rate": 3.361176592089919e-05, + "loss": 0.1784, + "step": 7228 + }, + { + "epoch": 1.4630641570532281, + "grad_norm": 0.2800363600254059, + "learning_rate": 3.358798271125041e-05, + "loss": 0.1907, + "step": 7229 + }, + { + "epoch": 1.4632665452337583, + "grad_norm": 0.3032877445220947, + "learning_rate": 3.3564206220472684e-05, + "loss": 0.1924, + "step": 7230 + }, + { + "epoch": 1.4634689334142887, + "grad_norm": 0.27191731333732605, + "learning_rate": 3.354043645097147e-05, + "loss": 0.1863, + "step": 7231 + }, + { + "epoch": 1.463671321594819, + "grad_norm": 0.2605868875980377, + "learning_rate": 3.351667340515154e-05, + "loss": 0.1899, + "step": 7232 + }, + { + "epoch": 1.4638737097753491, + "grad_norm": 0.2882217466831207, + "learning_rate": 3.349291708541696e-05, + "loss": 0.2119, + "step": 7233 + }, + { + "epoch": 1.4640760979558793, + "grad_norm": 0.26905587315559387, + "learning_rate": 3.346916749417123e-05, + "loss": 0.1699, + "step": 7234 + }, + { + "epoch": 1.4642784861364095, + "grad_norm": 0.30613499879837036, + "learning_rate": 3.344542463381701e-05, + "loss": 0.2023, + "step": 7235 + }, + { + "epoch": 1.46448087431694, + "grad_norm": 0.3181071877479553, + "learning_rate": 3.3421688506756386e-05, + "loss": 0.2077, + "step": 7236 + }, + { + "epoch": 1.4646832624974702, + "grad_norm": 0.34396564960479736, + "learning_rate": 3.339795911539072e-05, + "loss": 0.2227, + "step": 7237 + }, + { + "epoch": 1.4648856506780004, + "grad_norm": 0.26022300124168396, + "learning_rate": 3.33742364621207e-05, + "loss": 0.1496, + "step": 7238 + }, + { + "epoch": 1.4650880388585308, + "grad_norm": 0.3025625944137573, + "learning_rate": 3.335052054934634e-05, + "loss": 0.2067, + "step": 7239 + }, + { + "epoch": 1.465290427039061, + "grad_norm": 0.3104478120803833, + "learning_rate": 3.332681137946697e-05, + "loss": 0.2209, + "step": 7240 + }, + { + "epoch": 1.4654928152195912, + "grad_norm": 0.2558635175228119, + "learning_rate": 3.3303108954881226e-05, + "loss": 0.1905, + "step": 7241 + }, + { + "epoch": 1.4656952034001214, + "grad_norm": 0.25812438130378723, + "learning_rate": 3.327941327798708e-05, + "loss": 0.1568, + "step": 7242 + }, + { + "epoch": 1.4658975915806516, + "grad_norm": 0.2858171761035919, + "learning_rate": 3.3255724351181804e-05, + "loss": 0.1883, + "step": 7243 + }, + { + "epoch": 1.466099979761182, + "grad_norm": 0.27138751745224, + "learning_rate": 3.3232042176862e-05, + "loss": 0.183, + "step": 7244 + }, + { + "epoch": 1.4663023679417122, + "grad_norm": 0.2960588037967682, + "learning_rate": 3.320836675742358e-05, + "loss": 0.165, + "step": 7245 + }, + { + "epoch": 1.4665047561222424, + "grad_norm": 0.31151270866394043, + "learning_rate": 3.3184698095261766e-05, + "loss": 0.195, + "step": 7246 + }, + { + "epoch": 1.4667071443027728, + "grad_norm": 0.3337216079235077, + "learning_rate": 3.3161036192771134e-05, + "loss": 0.1813, + "step": 7247 + }, + { + "epoch": 1.466909532483303, + "grad_norm": 0.2706492245197296, + "learning_rate": 3.313738105234554e-05, + "loss": 0.1679, + "step": 7248 + }, + { + "epoch": 1.4671119206638332, + "grad_norm": 0.2831174433231354, + "learning_rate": 3.311373267637813e-05, + "loss": 0.2286, + "step": 7249 + }, + { + "epoch": 1.4673143088443634, + "grad_norm": 0.32331645488739014, + "learning_rate": 3.309009106726141e-05, + "loss": 0.2308, + "step": 7250 + }, + { + "epoch": 1.4673143088443634, + "eval_loss": 0.2653513550758362, + "eval_runtime": 0.7361, + "eval_samples_per_second": 6.793, + "eval_steps_per_second": 1.359, + "step": 7250 + }, + { + "epoch": 1.4675166970248936, + "grad_norm": 0.27429506182670593, + "learning_rate": 3.30664562273872e-05, + "loss": 0.1592, + "step": 7251 + }, + { + "epoch": 1.467719085205424, + "grad_norm": 0.2732314467430115, + "learning_rate": 3.304282815914662e-05, + "loss": 0.2064, + "step": 7252 + }, + { + "epoch": 1.4679214733859542, + "grad_norm": 0.3681529462337494, + "learning_rate": 3.301920686493012e-05, + "loss": 0.2259, + "step": 7253 + }, + { + "epoch": 1.4681238615664844, + "grad_norm": 0.30304232239723206, + "learning_rate": 3.299559234712745e-05, + "loss": 0.22, + "step": 7254 + }, + { + "epoch": 1.4683262497470149, + "grad_norm": 0.25034329295158386, + "learning_rate": 3.297198460812767e-05, + "loss": 0.1799, + "step": 7255 + }, + { + "epoch": 1.468528637927545, + "grad_norm": 0.26537463068962097, + "learning_rate": 3.294838365031917e-05, + "loss": 0.1747, + "step": 7256 + }, + { + "epoch": 1.4687310261080753, + "grad_norm": 0.2688654661178589, + "learning_rate": 3.2924789476089644e-05, + "loss": 0.1727, + "step": 7257 + }, + { + "epoch": 1.4689334142886055, + "grad_norm": 0.2707471251487732, + "learning_rate": 3.2901202087826124e-05, + "loss": 0.1915, + "step": 7258 + }, + { + "epoch": 1.4691358024691357, + "grad_norm": 0.30396607518196106, + "learning_rate": 3.28776214879149e-05, + "loss": 0.2073, + "step": 7259 + }, + { + "epoch": 1.469338190649666, + "grad_norm": 0.29711097478866577, + "learning_rate": 3.2854047678741625e-05, + "loss": 0.1772, + "step": 7260 + }, + { + "epoch": 1.4695405788301963, + "grad_norm": 0.3758523166179657, + "learning_rate": 3.2830480662691265e-05, + "loss": 0.2179, + "step": 7261 + }, + { + "epoch": 1.4697429670107267, + "grad_norm": 0.2720712125301361, + "learning_rate": 3.280692044214807e-05, + "loss": 0.184, + "step": 7262 + }, + { + "epoch": 1.469945355191257, + "grad_norm": 0.2819099724292755, + "learning_rate": 3.27833670194956e-05, + "loss": 0.1723, + "step": 7263 + }, + { + "epoch": 1.470147743371787, + "grad_norm": 0.3061216175556183, + "learning_rate": 3.2759820397116766e-05, + "loss": 0.1959, + "step": 7264 + }, + { + "epoch": 1.4703501315523173, + "grad_norm": 0.3212586045265198, + "learning_rate": 3.273628057739378e-05, + "loss": 0.1751, + "step": 7265 + }, + { + "epoch": 1.4705525197328475, + "grad_norm": 0.2917107343673706, + "learning_rate": 3.2712747562708115e-05, + "loss": 0.1878, + "step": 7266 + }, + { + "epoch": 1.470754907913378, + "grad_norm": 0.3021541237831116, + "learning_rate": 3.2689221355440615e-05, + "loss": 0.1802, + "step": 7267 + }, + { + "epoch": 1.4709572960939081, + "grad_norm": 0.3198321461677551, + "learning_rate": 3.266570195797142e-05, + "loss": 0.205, + "step": 7268 + }, + { + "epoch": 1.4711596842744383, + "grad_norm": 0.254639208316803, + "learning_rate": 3.264218937267996e-05, + "loss": 0.1688, + "step": 7269 + }, + { + "epoch": 1.4713620724549688, + "grad_norm": 0.26952847838401794, + "learning_rate": 3.261868360194501e-05, + "loss": 0.1957, + "step": 7270 + }, + { + "epoch": 1.471564460635499, + "grad_norm": 0.3483128845691681, + "learning_rate": 3.259518464814466e-05, + "loss": 0.2195, + "step": 7271 + }, + { + "epoch": 1.4717668488160291, + "grad_norm": 0.2697324752807617, + "learning_rate": 3.2571692513656226e-05, + "loss": 0.1582, + "step": 7272 + }, + { + "epoch": 1.4719692369965593, + "grad_norm": 0.24079424142837524, + "learning_rate": 3.254820720085643e-05, + "loss": 0.1647, + "step": 7273 + }, + { + "epoch": 1.4721716251770895, + "grad_norm": 0.2517320513725281, + "learning_rate": 3.252472871212125e-05, + "loss": 0.1839, + "step": 7274 + }, + { + "epoch": 1.47237401335762, + "grad_norm": 0.2581874430179596, + "learning_rate": 3.250125704982603e-05, + "loss": 0.1523, + "step": 7275 + }, + { + "epoch": 1.4725764015381502, + "grad_norm": 0.27980589866638184, + "learning_rate": 3.247779221634535e-05, + "loss": 0.2085, + "step": 7276 + }, + { + "epoch": 1.4727787897186804, + "grad_norm": 0.26071032881736755, + "learning_rate": 3.245433421405315e-05, + "loss": 0.1746, + "step": 7277 + }, + { + "epoch": 1.4729811778992108, + "grad_norm": 0.3794530928134918, + "learning_rate": 3.243088304532268e-05, + "loss": 0.2054, + "step": 7278 + }, + { + "epoch": 1.473183566079741, + "grad_norm": 0.28098902106285095, + "learning_rate": 3.240743871252646e-05, + "loss": 0.1807, + "step": 7279 + }, + { + "epoch": 1.4733859542602712, + "grad_norm": 0.25913292169570923, + "learning_rate": 3.238400121803635e-05, + "loss": 0.1559, + "step": 7280 + }, + { + "epoch": 1.4735883424408014, + "grad_norm": 0.2775214910507202, + "learning_rate": 3.2360570564223514e-05, + "loss": 0.2044, + "step": 7281 + }, + { + "epoch": 1.4737907306213316, + "grad_norm": 0.31280606985092163, + "learning_rate": 3.233714675345841e-05, + "loss": 0.2048, + "step": 7282 + }, + { + "epoch": 1.473993118801862, + "grad_norm": 0.337022602558136, + "learning_rate": 3.231372978811082e-05, + "loss": 0.1898, + "step": 7283 + }, + { + "epoch": 1.4741955069823922, + "grad_norm": 0.2549079954624176, + "learning_rate": 3.229031967054983e-05, + "loss": 0.1852, + "step": 7284 + }, + { + "epoch": 1.4743978951629224, + "grad_norm": 0.2797609865665436, + "learning_rate": 3.226691640314382e-05, + "loss": 0.1924, + "step": 7285 + }, + { + "epoch": 1.4746002833434528, + "grad_norm": 0.29348504543304443, + "learning_rate": 3.2243519988260495e-05, + "loss": 0.198, + "step": 7286 + }, + { + "epoch": 1.474802671523983, + "grad_norm": 0.3945770561695099, + "learning_rate": 3.2220130428266874e-05, + "loss": 0.2054, + "step": 7287 + }, + { + "epoch": 1.4750050597045132, + "grad_norm": 0.2882711887359619, + "learning_rate": 3.2196747725529234e-05, + "loss": 0.1845, + "step": 7288 + }, + { + "epoch": 1.4752074478850434, + "grad_norm": 0.25591593980789185, + "learning_rate": 3.217337188241321e-05, + "loss": 0.1597, + "step": 7289 + }, + { + "epoch": 1.4754098360655736, + "grad_norm": 0.31044450402259827, + "learning_rate": 3.2150002901283714e-05, + "loss": 0.1921, + "step": 7290 + }, + { + "epoch": 1.475612224246104, + "grad_norm": 0.2782125473022461, + "learning_rate": 3.2126640784504956e-05, + "loss": 0.1686, + "step": 7291 + }, + { + "epoch": 1.4758146124266343, + "grad_norm": 0.28518062829971313, + "learning_rate": 3.210328553444053e-05, + "loss": 0.2184, + "step": 7292 + }, + { + "epoch": 1.4760170006071647, + "grad_norm": 0.3196036219596863, + "learning_rate": 3.207993715345328e-05, + "loss": 0.2072, + "step": 7293 + }, + { + "epoch": 1.4762193887876949, + "grad_norm": 0.28817251324653625, + "learning_rate": 3.205659564390527e-05, + "loss": 0.1819, + "step": 7294 + }, + { + "epoch": 1.476421776968225, + "grad_norm": 0.27735039591789246, + "learning_rate": 3.203326100815799e-05, + "loss": 0.2075, + "step": 7295 + }, + { + "epoch": 1.4766241651487553, + "grad_norm": 0.2327238917350769, + "learning_rate": 3.2009933248572196e-05, + "loss": 0.1262, + "step": 7296 + }, + { + "epoch": 1.4768265533292855, + "grad_norm": 0.2707853615283966, + "learning_rate": 3.1986612367507954e-05, + "loss": 0.2087, + "step": 7297 + }, + { + "epoch": 1.477028941509816, + "grad_norm": 0.272924542427063, + "learning_rate": 3.1963298367324613e-05, + "loss": 0.1893, + "step": 7298 + }, + { + "epoch": 1.477231329690346, + "grad_norm": 0.2852308750152588, + "learning_rate": 3.193999125038083e-05, + "loss": 0.1875, + "step": 7299 + }, + { + "epoch": 1.4774337178708763, + "grad_norm": 0.28374990820884705, + "learning_rate": 3.191669101903459e-05, + "loss": 0.2186, + "step": 7300 + }, + { + "epoch": 1.4774337178708763, + "eval_loss": 0.26597627997398376, + "eval_runtime": 0.737, + "eval_samples_per_second": 6.784, + "eval_steps_per_second": 1.357, + "step": 7300 + }, + { + "epoch": 1.4776361060514067, + "grad_norm": 0.24514828622341156, + "learning_rate": 3.1893397675643176e-05, + "loss": 0.1843, + "step": 7301 + }, + { + "epoch": 1.477838494231937, + "grad_norm": 0.29631340503692627, + "learning_rate": 3.187011122256314e-05, + "loss": 0.1901, + "step": 7302 + }, + { + "epoch": 1.4780408824124671, + "grad_norm": 0.313414603471756, + "learning_rate": 3.184683166215038e-05, + "loss": 0.1935, + "step": 7303 + }, + { + "epoch": 1.4782432705929973, + "grad_norm": 0.289034366607666, + "learning_rate": 3.1823558996760064e-05, + "loss": 0.1774, + "step": 7304 + }, + { + "epoch": 1.4784456587735275, + "grad_norm": 0.25100448727607727, + "learning_rate": 3.180029322874668e-05, + "loss": 0.1883, + "step": 7305 + }, + { + "epoch": 1.478648046954058, + "grad_norm": 0.2747223675251007, + "learning_rate": 3.177703436046401e-05, + "loss": 0.2212, + "step": 7306 + }, + { + "epoch": 1.4788504351345881, + "grad_norm": 0.31175172328948975, + "learning_rate": 3.175378239426515e-05, + "loss": 0.2123, + "step": 7307 + }, + { + "epoch": 1.4790528233151183, + "grad_norm": 0.3189109265804291, + "learning_rate": 3.17305373325025e-05, + "loss": 0.1731, + "step": 7308 + }, + { + "epoch": 1.4792552114956488, + "grad_norm": 0.285255491733551, + "learning_rate": 3.170729917752773e-05, + "loss": 0.1806, + "step": 7309 + }, + { + "epoch": 1.479457599676179, + "grad_norm": 0.26634594798088074, + "learning_rate": 3.1684067931691844e-05, + "loss": 0.1765, + "step": 7310 + }, + { + "epoch": 1.4796599878567092, + "grad_norm": 0.2741389274597168, + "learning_rate": 3.1660843597345135e-05, + "loss": 0.2044, + "step": 7311 + }, + { + "epoch": 1.4798623760372394, + "grad_norm": 0.34997615218162537, + "learning_rate": 3.16376261768372e-05, + "loss": 0.2245, + "step": 7312 + }, + { + "epoch": 1.4800647642177696, + "grad_norm": 0.3089703917503357, + "learning_rate": 3.1614415672516914e-05, + "loss": 0.206, + "step": 7313 + }, + { + "epoch": 1.4802671523983, + "grad_norm": 0.2964693009853363, + "learning_rate": 3.1591212086732504e-05, + "loss": 0.1781, + "step": 7314 + }, + { + "epoch": 1.4804695405788302, + "grad_norm": 0.3101029098033905, + "learning_rate": 3.1568015421831475e-05, + "loss": 0.1944, + "step": 7315 + }, + { + "epoch": 1.4806719287593604, + "grad_norm": 0.3056146502494812, + "learning_rate": 3.154482568016057e-05, + "loss": 0.1865, + "step": 7316 + }, + { + "epoch": 1.4808743169398908, + "grad_norm": 0.2606217861175537, + "learning_rate": 3.15216428640659e-05, + "loss": 0.1797, + "step": 7317 + }, + { + "epoch": 1.481076705120421, + "grad_norm": 0.24992996454238892, + "learning_rate": 3.149846697589288e-05, + "loss": 0.1795, + "step": 7318 + }, + { + "epoch": 1.4812790933009512, + "grad_norm": 0.2621716558933258, + "learning_rate": 3.14752980179862e-05, + "loss": 0.2016, + "step": 7319 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 0.2822463810443878, + "learning_rate": 3.1452135992689836e-05, + "loss": 0.2244, + "step": 7320 + }, + { + "epoch": 1.4816838696620118, + "grad_norm": 0.29446855187416077, + "learning_rate": 3.1428980902347084e-05, + "loss": 0.1815, + "step": 7321 + }, + { + "epoch": 1.481886257842542, + "grad_norm": 0.24313370883464813, + "learning_rate": 3.140583274930055e-05, + "loss": 0.1586, + "step": 7322 + }, + { + "epoch": 1.4820886460230722, + "grad_norm": 0.31904950737953186, + "learning_rate": 3.1382691535892086e-05, + "loss": 0.1789, + "step": 7323 + }, + { + "epoch": 1.4822910342036026, + "grad_norm": 0.3379060626029968, + "learning_rate": 3.135955726446291e-05, + "loss": 0.2011, + "step": 7324 + }, + { + "epoch": 1.4824934223841328, + "grad_norm": 0.29935017228126526, + "learning_rate": 3.133642993735349e-05, + "loss": 0.2265, + "step": 7325 + }, + { + "epoch": 1.482695810564663, + "grad_norm": 0.30864277482032776, + "learning_rate": 3.1313309556903626e-05, + "loss": 0.2143, + "step": 7326 + }, + { + "epoch": 1.4828981987451932, + "grad_norm": 0.2781793475151062, + "learning_rate": 3.1290196125452366e-05, + "loss": 0.1825, + "step": 7327 + }, + { + "epoch": 1.4831005869257234, + "grad_norm": 0.2661908268928528, + "learning_rate": 3.12670896453381e-05, + "loss": 0.1537, + "step": 7328 + }, + { + "epoch": 1.4833029751062539, + "grad_norm": 0.26809945702552795, + "learning_rate": 3.12439901188985e-05, + "loss": 0.1801, + "step": 7329 + }, + { + "epoch": 1.483505363286784, + "grad_norm": 0.24053886532783508, + "learning_rate": 3.1220897548470526e-05, + "loss": 0.1554, + "step": 7330 + }, + { + "epoch": 1.4837077514673143, + "grad_norm": 0.7539495229721069, + "learning_rate": 3.1197811936390456e-05, + "loss": 0.2089, + "step": 7331 + }, + { + "epoch": 1.4839101396478447, + "grad_norm": 0.2663455009460449, + "learning_rate": 3.117473328499384e-05, + "loss": 0.1806, + "step": 7332 + }, + { + "epoch": 1.4841125278283749, + "grad_norm": 0.3148098587989807, + "learning_rate": 3.115166159661553e-05, + "loss": 0.2188, + "step": 7333 + }, + { + "epoch": 1.484314916008905, + "grad_norm": 0.2693808078765869, + "learning_rate": 3.112859687358969e-05, + "loss": 0.2097, + "step": 7334 + }, + { + "epoch": 1.4845173041894353, + "grad_norm": 0.23006850481033325, + "learning_rate": 3.110553911824975e-05, + "loss": 0.1786, + "step": 7335 + }, + { + "epoch": 1.4847196923699655, + "grad_norm": 0.3721303939819336, + "learning_rate": 3.108248833292846e-05, + "loss": 0.2057, + "step": 7336 + }, + { + "epoch": 1.484922080550496, + "grad_norm": 0.2560385763645172, + "learning_rate": 3.105944451995786e-05, + "loss": 0.1713, + "step": 7337 + }, + { + "epoch": 1.485124468731026, + "grad_norm": 0.28081420063972473, + "learning_rate": 3.103640768166928e-05, + "loss": 0.1893, + "step": 7338 + }, + { + "epoch": 1.4853268569115563, + "grad_norm": 0.2766065001487732, + "learning_rate": 3.101337782039334e-05, + "loss": 0.1871, + "step": 7339 + }, + { + "epoch": 1.4855292450920867, + "grad_norm": 0.2626553475856781, + "learning_rate": 3.0990354938459964e-05, + "loss": 0.1924, + "step": 7340 + }, + { + "epoch": 1.485731633272617, + "grad_norm": 0.3107207417488098, + "learning_rate": 3.096733903819837e-05, + "loss": 0.1927, + "step": 7341 + }, + { + "epoch": 1.4859340214531471, + "grad_norm": 0.2315855324268341, + "learning_rate": 3.094433012193706e-05, + "loss": 0.1639, + "step": 7342 + }, + { + "epoch": 1.4861364096336773, + "grad_norm": 0.24030828475952148, + "learning_rate": 3.092132819200383e-05, + "loss": 0.1924, + "step": 7343 + }, + { + "epoch": 1.4863387978142075, + "grad_norm": 0.30377745628356934, + "learning_rate": 3.089833325072578e-05, + "loss": 0.1586, + "step": 7344 + }, + { + "epoch": 1.486541185994738, + "grad_norm": 0.2831876873970032, + "learning_rate": 3.08753453004293e-05, + "loss": 0.2345, + "step": 7345 + }, + { + "epoch": 1.4867435741752681, + "grad_norm": 0.24797627329826355, + "learning_rate": 3.085236434344008e-05, + "loss": 0.1671, + "step": 7346 + }, + { + "epoch": 1.4869459623557983, + "grad_norm": 0.27497178316116333, + "learning_rate": 3.082939038208306e-05, + "loss": 0.183, + "step": 7347 + }, + { + "epoch": 1.4871483505363288, + "grad_norm": 0.3301697075366974, + "learning_rate": 3.080642341868252e-05, + "loss": 0.2028, + "step": 7348 + }, + { + "epoch": 1.487350738716859, + "grad_norm": 0.2448139786720276, + "learning_rate": 3.078346345556202e-05, + "loss": 0.1587, + "step": 7349 + }, + { + "epoch": 1.4875531268973892, + "grad_norm": 0.2885264754295349, + "learning_rate": 3.0760510495044413e-05, + "loss": 0.1975, + "step": 7350 + }, + { + "epoch": 1.4875531268973892, + "eval_loss": 0.26434725522994995, + "eval_runtime": 0.7415, + "eval_samples_per_second": 6.743, + "eval_steps_per_second": 1.349, + "step": 7350 + }, + { + "epoch": 1.4877555150779194, + "grad_norm": 0.2968370318412781, + "learning_rate": 3.0737564539451835e-05, + "loss": 0.1849, + "step": 7351 + }, + { + "epoch": 1.4879579032584498, + "grad_norm": 0.33912134170532227, + "learning_rate": 3.0714625591105704e-05, + "loss": 0.2101, + "step": 7352 + }, + { + "epoch": 1.48816029143898, + "grad_norm": 0.25544485449790955, + "learning_rate": 3.069169365232676e-05, + "loss": 0.1996, + "step": 7353 + }, + { + "epoch": 1.4883626796195102, + "grad_norm": 0.3111973702907562, + "learning_rate": 3.0668768725435004e-05, + "loss": 0.2041, + "step": 7354 + }, + { + "epoch": 1.4885650678000406, + "grad_norm": 0.2621065378189087, + "learning_rate": 3.0645850812749743e-05, + "loss": 0.17, + "step": 7355 + }, + { + "epoch": 1.4887674559805708, + "grad_norm": 0.2855996787548065, + "learning_rate": 3.062293991658958e-05, + "loss": 0.2126, + "step": 7356 + }, + { + "epoch": 1.488969844161101, + "grad_norm": 0.28797435760498047, + "learning_rate": 3.060003603927238e-05, + "loss": 0.1567, + "step": 7357 + }, + { + "epoch": 1.4891722323416312, + "grad_norm": 0.2602500915527344, + "learning_rate": 3.0577139183115346e-05, + "loss": 0.1521, + "step": 7358 + }, + { + "epoch": 1.4893746205221614, + "grad_norm": 0.2664164900779724, + "learning_rate": 3.0554249350434905e-05, + "loss": 0.1871, + "step": 7359 + }, + { + "epoch": 1.4895770087026918, + "grad_norm": 0.3115113377571106, + "learning_rate": 3.053136654354687e-05, + "loss": 0.2127, + "step": 7360 + }, + { + "epoch": 1.489779396883222, + "grad_norm": 0.2899424731731415, + "learning_rate": 3.0508490764766208e-05, + "loss": 0.1914, + "step": 7361 + }, + { + "epoch": 1.4899817850637522, + "grad_norm": 0.29101699590682983, + "learning_rate": 3.0485622016407277e-05, + "loss": 0.1894, + "step": 7362 + }, + { + "epoch": 1.4901841732442827, + "grad_norm": 0.3022737205028534, + "learning_rate": 3.046276030078371e-05, + "loss": 0.1906, + "step": 7363 + }, + { + "epoch": 1.4903865614248129, + "grad_norm": 0.2707030773162842, + "learning_rate": 3.043990562020842e-05, + "loss": 0.1956, + "step": 7364 + }, + { + "epoch": 1.490588949605343, + "grad_norm": 0.2698463201522827, + "learning_rate": 3.0417057976993578e-05, + "loss": 0.1597, + "step": 7365 + }, + { + "epoch": 1.4907913377858732, + "grad_norm": 0.278828501701355, + "learning_rate": 3.0394217373450695e-05, + "loss": 0.1911, + "step": 7366 + }, + { + "epoch": 1.4909937259664034, + "grad_norm": 0.25586068630218506, + "learning_rate": 3.037138381189053e-05, + "loss": 0.1566, + "step": 7367 + }, + { + "epoch": 1.4911961141469339, + "grad_norm": 0.3123515546321869, + "learning_rate": 3.0348557294623136e-05, + "loss": 0.2018, + "step": 7368 + }, + { + "epoch": 1.491398502327464, + "grad_norm": 0.3360455334186554, + "learning_rate": 3.032573782395789e-05, + "loss": 0.1848, + "step": 7369 + }, + { + "epoch": 1.4916008905079943, + "grad_norm": 0.32559528946876526, + "learning_rate": 3.0302925402203396e-05, + "loss": 0.1947, + "step": 7370 + }, + { + "epoch": 1.4918032786885247, + "grad_norm": 0.2493031769990921, + "learning_rate": 3.028012003166758e-05, + "loss": 0.1699, + "step": 7371 + }, + { + "epoch": 1.492005666869055, + "grad_norm": 0.2718972861766815, + "learning_rate": 3.0257321714657673e-05, + "loss": 0.205, + "step": 7372 + }, + { + "epoch": 1.492208055049585, + "grad_norm": 0.3305763900279999, + "learning_rate": 3.0234530453480137e-05, + "loss": 0.1773, + "step": 7373 + }, + { + "epoch": 1.4924104432301153, + "grad_norm": 0.23928435146808624, + "learning_rate": 3.0211746250440775e-05, + "loss": 0.1497, + "step": 7374 + }, + { + "epoch": 1.4926128314106455, + "grad_norm": 0.29377129673957825, + "learning_rate": 3.0188969107844655e-05, + "loss": 0.199, + "step": 7375 + }, + { + "epoch": 1.492815219591176, + "grad_norm": 0.31887951493263245, + "learning_rate": 3.0166199027996113e-05, + "loss": 0.1961, + "step": 7376 + }, + { + "epoch": 1.4930176077717061, + "grad_norm": 0.3151382505893707, + "learning_rate": 3.01434360131988e-05, + "loss": 0.1694, + "step": 7377 + }, + { + "epoch": 1.4932199959522363, + "grad_norm": 0.2822098135948181, + "learning_rate": 3.0120680065755635e-05, + "loss": 0.2054, + "step": 7378 + }, + { + "epoch": 1.4934223841327667, + "grad_norm": 0.2944871485233307, + "learning_rate": 3.009793118796882e-05, + "loss": 0.1962, + "step": 7379 + }, + { + "epoch": 1.493624772313297, + "grad_norm": 0.314345121383667, + "learning_rate": 3.0075189382139856e-05, + "loss": 0.1739, + "step": 7380 + }, + { + "epoch": 1.4938271604938271, + "grad_norm": 0.3003896176815033, + "learning_rate": 3.0052454650569528e-05, + "loss": 0.1954, + "step": 7381 + }, + { + "epoch": 1.4940295486743573, + "grad_norm": 0.29708629846572876, + "learning_rate": 3.0029726995557904e-05, + "loss": 0.1756, + "step": 7382 + }, + { + "epoch": 1.4942319368548878, + "grad_norm": 0.30133432149887085, + "learning_rate": 3.0007006419404283e-05, + "loss": 0.1944, + "step": 7383 + }, + { + "epoch": 1.494434325035418, + "grad_norm": 0.2833723723888397, + "learning_rate": 2.9984292924407332e-05, + "loss": 0.2044, + "step": 7384 + }, + { + "epoch": 1.4946367132159482, + "grad_norm": 0.2950754761695862, + "learning_rate": 2.9961586512864947e-05, + "loss": 0.1766, + "step": 7385 + }, + { + "epoch": 1.4948391013964786, + "grad_norm": 0.2856682240962982, + "learning_rate": 2.9938887187074314e-05, + "loss": 0.1868, + "step": 7386 + }, + { + "epoch": 1.4950414895770088, + "grad_norm": 0.23965318500995636, + "learning_rate": 2.9916194949331956e-05, + "loss": 0.1623, + "step": 7387 + }, + { + "epoch": 1.495243877757539, + "grad_norm": 0.30067017674446106, + "learning_rate": 2.9893509801933615e-05, + "loss": 0.198, + "step": 7388 + }, + { + "epoch": 1.4954462659380692, + "grad_norm": 0.2467557042837143, + "learning_rate": 2.9870831747174333e-05, + "loss": 0.17, + "step": 7389 + }, + { + "epoch": 1.4956486541185994, + "grad_norm": 0.2811994254589081, + "learning_rate": 2.9848160787348435e-05, + "loss": 0.2015, + "step": 7390 + }, + { + "epoch": 1.4958510422991298, + "grad_norm": 0.27143871784210205, + "learning_rate": 2.982549692474954e-05, + "loss": 0.1726, + "step": 7391 + }, + { + "epoch": 1.49605343047966, + "grad_norm": 0.297503799200058, + "learning_rate": 2.980284016167053e-05, + "loss": 0.2209, + "step": 7392 + }, + { + "epoch": 1.4962558186601902, + "grad_norm": 0.27570927143096924, + "learning_rate": 2.978019050040358e-05, + "loss": 0.2072, + "step": 7393 + }, + { + "epoch": 1.4964582068407206, + "grad_norm": 0.2997628450393677, + "learning_rate": 2.975754794324015e-05, + "loss": 0.2031, + "step": 7394 + }, + { + "epoch": 1.4966605950212508, + "grad_norm": 0.27644211053848267, + "learning_rate": 2.9734912492470968e-05, + "loss": 0.2034, + "step": 7395 + }, + { + "epoch": 1.496862983201781, + "grad_norm": 0.25040486454963684, + "learning_rate": 2.971228415038606e-05, + "loss": 0.1694, + "step": 7396 + }, + { + "epoch": 1.4970653713823112, + "grad_norm": 0.30418914556503296, + "learning_rate": 2.9689662919274718e-05, + "loss": 0.2076, + "step": 7397 + }, + { + "epoch": 1.4972677595628414, + "grad_norm": 0.3006996512413025, + "learning_rate": 2.9667048801425536e-05, + "loss": 0.1918, + "step": 7398 + }, + { + "epoch": 1.4974701477433718, + "grad_norm": 0.276265412569046, + "learning_rate": 2.9644441799126345e-05, + "loss": 0.1718, + "step": 7399 + }, + { + "epoch": 1.497672535923902, + "grad_norm": 0.2626464068889618, + "learning_rate": 2.9621841914664307e-05, + "loss": 0.2176, + "step": 7400 + }, + { + "epoch": 1.497672535923902, + "eval_loss": 0.2646274268627167, + "eval_runtime": 0.74, + "eval_samples_per_second": 6.756, + "eval_steps_per_second": 1.351, + "step": 7400 + }, + { + "epoch": 1.4978749241044322, + "grad_norm": 0.24904023110866547, + "learning_rate": 2.9599249150325838e-05, + "loss": 0.1877, + "step": 7401 + }, + { + "epoch": 1.4980773122849627, + "grad_norm": 0.2674250304698944, + "learning_rate": 2.957666350839663e-05, + "loss": 0.1725, + "step": 7402 + }, + { + "epoch": 1.4982797004654929, + "grad_norm": 0.2647687792778015, + "learning_rate": 2.9554084991161666e-05, + "loss": 0.1758, + "step": 7403 + }, + { + "epoch": 1.498482088646023, + "grad_norm": 0.3034684658050537, + "learning_rate": 2.9531513600905236e-05, + "loss": 0.211, + "step": 7404 + }, + { + "epoch": 1.4986844768265533, + "grad_norm": 0.2592996060848236, + "learning_rate": 2.9508949339910807e-05, + "loss": 0.1882, + "step": 7405 + }, + { + "epoch": 1.4988868650070835, + "grad_norm": 0.34100887179374695, + "learning_rate": 2.9486392210461224e-05, + "loss": 0.2016, + "step": 7406 + }, + { + "epoch": 1.4990892531876139, + "grad_norm": 0.29518815875053406, + "learning_rate": 2.94638422148386e-05, + "loss": 0.1946, + "step": 7407 + }, + { + "epoch": 1.499291641368144, + "grad_norm": 0.26552388072013855, + "learning_rate": 2.944129935532428e-05, + "loss": 0.1635, + "step": 7408 + }, + { + "epoch": 1.4994940295486743, + "grad_norm": 0.27012595534324646, + "learning_rate": 2.941876363419893e-05, + "loss": 0.1588, + "step": 7409 + }, + { + "epoch": 1.4996964177292047, + "grad_norm": 0.29524192214012146, + "learning_rate": 2.9396235053742483e-05, + "loss": 0.2042, + "step": 7410 + }, + { + "epoch": 1.499898805909735, + "grad_norm": 0.2819133400917053, + "learning_rate": 2.9373713616234133e-05, + "loss": 0.1783, + "step": 7411 + }, + { + "epoch": 1.500101194090265, + "grad_norm": 0.3101344704627991, + "learning_rate": 2.935119932395236e-05, + "loss": 0.205, + "step": 7412 + }, + { + "epoch": 1.5003035822707953, + "grad_norm": 0.26377272605895996, + "learning_rate": 2.9328692179174933e-05, + "loss": 0.2038, + "step": 7413 + }, + { + "epoch": 1.5005059704513255, + "grad_norm": 0.24763913452625275, + "learning_rate": 2.9306192184178884e-05, + "loss": 0.1719, + "step": 7414 + }, + { + "epoch": 1.500708358631856, + "grad_norm": 0.2799322009086609, + "learning_rate": 2.9283699341240534e-05, + "loss": 0.1862, + "step": 7415 + }, + { + "epoch": 1.5009107468123861, + "grad_norm": 0.3978762626647949, + "learning_rate": 2.9261213652635466e-05, + "loss": 0.2023, + "step": 7416 + }, + { + "epoch": 1.5011131349929165, + "grad_norm": 0.2425425797700882, + "learning_rate": 2.923873512063854e-05, + "loss": 0.1713, + "step": 7417 + }, + { + "epoch": 1.5013155231734467, + "grad_norm": 0.30181166529655457, + "learning_rate": 2.921626374752391e-05, + "loss": 0.2113, + "step": 7418 + }, + { + "epoch": 1.501517911353977, + "grad_norm": 0.33585092425346375, + "learning_rate": 2.9193799535564993e-05, + "loss": 0.2352, + "step": 7419 + }, + { + "epoch": 1.5017202995345071, + "grad_norm": 0.28944873809814453, + "learning_rate": 2.917134248703447e-05, + "loss": 0.1691, + "step": 7420 + }, + { + "epoch": 1.5019226877150373, + "grad_norm": 0.2766030430793762, + "learning_rate": 2.9148892604204325e-05, + "loss": 0.1836, + "step": 7421 + }, + { + "epoch": 1.5021250758955675, + "grad_norm": 0.24718813598155975, + "learning_rate": 2.9126449889345787e-05, + "loss": 0.1926, + "step": 7422 + }, + { + "epoch": 1.502327464076098, + "grad_norm": 0.26735419034957886, + "learning_rate": 2.910401434472938e-05, + "loss": 0.1768, + "step": 7423 + }, + { + "epoch": 1.5025298522566282, + "grad_norm": 0.2996431589126587, + "learning_rate": 2.9081585972624913e-05, + "loss": 0.1888, + "step": 7424 + }, + { + "epoch": 1.5027322404371586, + "grad_norm": 0.27923211455345154, + "learning_rate": 2.905916477530143e-05, + "loss": 0.1834, + "step": 7425 + }, + { + "epoch": 1.5029346286176888, + "grad_norm": 0.2527241110801697, + "learning_rate": 2.903675075502731e-05, + "loss": 0.1671, + "step": 7426 + }, + { + "epoch": 1.503137016798219, + "grad_norm": 0.2642647325992584, + "learning_rate": 2.9014343914070108e-05, + "loss": 0.1937, + "step": 7427 + }, + { + "epoch": 1.5033394049787492, + "grad_norm": 0.27124494314193726, + "learning_rate": 2.8991944254696746e-05, + "loss": 0.195, + "step": 7428 + }, + { + "epoch": 1.5035417931592794, + "grad_norm": 0.2615841031074524, + "learning_rate": 2.8969551779173388e-05, + "loss": 0.1582, + "step": 7429 + }, + { + "epoch": 1.5037441813398098, + "grad_norm": 0.2523174285888672, + "learning_rate": 2.8947166489765465e-05, + "loss": 0.1935, + "step": 7430 + }, + { + "epoch": 1.50394656952034, + "grad_norm": 0.2733636498451233, + "learning_rate": 2.892478838873768e-05, + "loss": 0.1796, + "step": 7431 + }, + { + "epoch": 1.5041489577008704, + "grad_norm": 0.26868680119514465, + "learning_rate": 2.8902417478354037e-05, + "loss": 0.192, + "step": 7432 + }, + { + "epoch": 1.5043513458814006, + "grad_norm": 0.28141334652900696, + "learning_rate": 2.8880053760877767e-05, + "loss": 0.203, + "step": 7433 + }, + { + "epoch": 1.5045537340619308, + "grad_norm": 0.25075894594192505, + "learning_rate": 2.8857697238571402e-05, + "loss": 0.2052, + "step": 7434 + }, + { + "epoch": 1.504756122242461, + "grad_norm": 0.282585084438324, + "learning_rate": 2.883534791369674e-05, + "loss": 0.1852, + "step": 7435 + }, + { + "epoch": 1.5049585104229912, + "grad_norm": 0.3138660490512848, + "learning_rate": 2.881300578851487e-05, + "loss": 0.2139, + "step": 7436 + }, + { + "epoch": 1.5051608986035214, + "grad_norm": 0.257865309715271, + "learning_rate": 2.8790670865286107e-05, + "loss": 0.1865, + "step": 7437 + }, + { + "epoch": 1.5053632867840518, + "grad_norm": 0.2725944519042969, + "learning_rate": 2.8768343146270072e-05, + "loss": 0.187, + "step": 7438 + }, + { + "epoch": 1.505565674964582, + "grad_norm": 0.37270545959472656, + "learning_rate": 2.8746022633725656e-05, + "loss": 0.2131, + "step": 7439 + }, + { + "epoch": 1.5057680631451125, + "grad_norm": 0.2656130790710449, + "learning_rate": 2.8723709329911007e-05, + "loss": 0.1774, + "step": 7440 + }, + { + "epoch": 1.5059704513256427, + "grad_norm": 0.25046414136886597, + "learning_rate": 2.8701403237083557e-05, + "loss": 0.1662, + "step": 7441 + }, + { + "epoch": 1.5061728395061729, + "grad_norm": 0.27029305696487427, + "learning_rate": 2.86791043575e-05, + "loss": 0.1936, + "step": 7442 + }, + { + "epoch": 1.506375227686703, + "grad_norm": 0.2958330512046814, + "learning_rate": 2.86568126934163e-05, + "loss": 0.1896, + "step": 7443 + }, + { + "epoch": 1.5065776158672333, + "grad_norm": 0.2662060856819153, + "learning_rate": 2.8634528247087668e-05, + "loss": 0.17, + "step": 7444 + }, + { + "epoch": 1.5067800040477635, + "grad_norm": 0.3227637708187103, + "learning_rate": 2.8612251020768665e-05, + "loss": 0.2281, + "step": 7445 + }, + { + "epoch": 1.5069823922282939, + "grad_norm": 0.28649061918258667, + "learning_rate": 2.858998101671305e-05, + "loss": 0.1995, + "step": 7446 + }, + { + "epoch": 1.507184780408824, + "grad_norm": 0.2932646870613098, + "learning_rate": 2.8567718237173857e-05, + "loss": 0.1908, + "step": 7447 + }, + { + "epoch": 1.5073871685893545, + "grad_norm": 0.28228962421417236, + "learning_rate": 2.854546268440339e-05, + "loss": 0.1763, + "step": 7448 + }, + { + "epoch": 1.5075895567698847, + "grad_norm": 0.2924324870109558, + "learning_rate": 2.8523214360653293e-05, + "loss": 0.2125, + "step": 7449 + }, + { + "epoch": 1.507791944950415, + "grad_norm": 0.31800493597984314, + "learning_rate": 2.8500973268174324e-05, + "loss": 0.1897, + "step": 7450 + }, + { + "epoch": 1.507791944950415, + "eval_loss": 0.2675067186355591, + "eval_runtime": 0.7394, + "eval_samples_per_second": 6.762, + "eval_steps_per_second": 1.352, + "step": 7450 + }, + { + "epoch": 1.507994333130945, + "grad_norm": 0.3096625804901123, + "learning_rate": 2.847873940921666e-05, + "loss": 0.188, + "step": 7451 + }, + { + "epoch": 1.5081967213114753, + "grad_norm": 0.5369827151298523, + "learning_rate": 2.8456512786029676e-05, + "loss": 0.2071, + "step": 7452 + }, + { + "epoch": 1.5083991094920055, + "grad_norm": 0.2631036639213562, + "learning_rate": 2.8434293400862022e-05, + "loss": 0.1894, + "step": 7453 + }, + { + "epoch": 1.508601497672536, + "grad_norm": 0.2811294198036194, + "learning_rate": 2.8412081255961644e-05, + "loss": 0.1778, + "step": 7454 + }, + { + "epoch": 1.5088038858530661, + "grad_norm": 0.26449280977249146, + "learning_rate": 2.8389876353575705e-05, + "loss": 0.1761, + "step": 7455 + }, + { + "epoch": 1.5090062740335966, + "grad_norm": 0.28584322333335876, + "learning_rate": 2.8367678695950695e-05, + "loss": 0.182, + "step": 7456 + }, + { + "epoch": 1.5092086622141268, + "grad_norm": 0.24409325420856476, + "learning_rate": 2.8345488285332324e-05, + "loss": 0.17, + "step": 7457 + }, + { + "epoch": 1.509411050394657, + "grad_norm": 0.2807093560695648, + "learning_rate": 2.8323305123965583e-05, + "loss": 0.1972, + "step": 7458 + }, + { + "epoch": 1.5096134385751871, + "grad_norm": 0.2984221577644348, + "learning_rate": 2.8301129214094735e-05, + "loss": 0.2116, + "step": 7459 + }, + { + "epoch": 1.5098158267557173, + "grad_norm": 0.27237123250961304, + "learning_rate": 2.8278960557963298e-05, + "loss": 0.1982, + "step": 7460 + }, + { + "epoch": 1.5100182149362478, + "grad_norm": 0.2626419961452484, + "learning_rate": 2.8256799157814074e-05, + "loss": 0.168, + "step": 7461 + }, + { + "epoch": 1.510220603116778, + "grad_norm": 0.26899945735931396, + "learning_rate": 2.8234645015889127e-05, + "loss": 0.2049, + "step": 7462 + }, + { + "epoch": 1.5104229912973084, + "grad_norm": 0.28485849499702454, + "learning_rate": 2.8212498134429766e-05, + "loss": 0.2242, + "step": 7463 + }, + { + "epoch": 1.5106253794778386, + "grad_norm": 0.27648624777793884, + "learning_rate": 2.8190358515676584e-05, + "loss": 0.1857, + "step": 7464 + }, + { + "epoch": 1.5108277676583688, + "grad_norm": 0.20960475504398346, + "learning_rate": 2.816822616186945e-05, + "loss": 0.1252, + "step": 7465 + }, + { + "epoch": 1.511030155838899, + "grad_norm": 0.2709536552429199, + "learning_rate": 2.8146101075247457e-05, + "loss": 0.1989, + "step": 7466 + }, + { + "epoch": 1.5112325440194292, + "grad_norm": 0.2901017963886261, + "learning_rate": 2.8123983258049e-05, + "loss": 0.1991, + "step": 7467 + }, + { + "epoch": 1.5114349321999594, + "grad_norm": 0.28593146800994873, + "learning_rate": 2.8101872712511745e-05, + "loss": 0.1977, + "step": 7468 + }, + { + "epoch": 1.5116373203804898, + "grad_norm": 0.321687787771225, + "learning_rate": 2.8079769440872582e-05, + "loss": 0.1963, + "step": 7469 + }, + { + "epoch": 1.51183970856102, + "grad_norm": 0.2690439522266388, + "learning_rate": 2.8057673445367694e-05, + "loss": 0.1718, + "step": 7470 + }, + { + "epoch": 1.5120420967415504, + "grad_norm": 0.29314038157463074, + "learning_rate": 2.8035584728232557e-05, + "loss": 0.176, + "step": 7471 + }, + { + "epoch": 1.5122444849220806, + "grad_norm": 0.3003440499305725, + "learning_rate": 2.8013503291701813e-05, + "loss": 0.2047, + "step": 7472 + }, + { + "epoch": 1.5124468731026108, + "grad_norm": 0.266367107629776, + "learning_rate": 2.799142913800946e-05, + "loss": 0.1916, + "step": 7473 + }, + { + "epoch": 1.512649261283141, + "grad_norm": 0.2905641198158264, + "learning_rate": 2.7969362269388732e-05, + "loss": 0.1944, + "step": 7474 + }, + { + "epoch": 1.5128516494636712, + "grad_norm": 0.39335983991622925, + "learning_rate": 2.794730268807212e-05, + "loss": 0.2294, + "step": 7475 + }, + { + "epoch": 1.5130540376442014, + "grad_norm": 0.24299614131450653, + "learning_rate": 2.792525039629138e-05, + "loss": 0.1626, + "step": 7476 + }, + { + "epoch": 1.5132564258247319, + "grad_norm": 0.35958167910575867, + "learning_rate": 2.7903205396277542e-05, + "loss": 0.1963, + "step": 7477 + }, + { + "epoch": 1.513458814005262, + "grad_norm": 0.30342257022857666, + "learning_rate": 2.7881167690260867e-05, + "loss": 0.2193, + "step": 7478 + }, + { + "epoch": 1.5136612021857925, + "grad_norm": 0.27842989563941956, + "learning_rate": 2.7859137280470915e-05, + "loss": 0.1961, + "step": 7479 + }, + { + "epoch": 1.5138635903663227, + "grad_norm": 0.30261504650115967, + "learning_rate": 2.783711416913649e-05, + "loss": 0.208, + "step": 7480 + }, + { + "epoch": 1.5140659785468529, + "grad_norm": 0.2848835289478302, + "learning_rate": 2.7815098358485646e-05, + "loss": 0.2145, + "step": 7481 + }, + { + "epoch": 1.514268366727383, + "grad_norm": 0.29721975326538086, + "learning_rate": 2.7793089850745736e-05, + "loss": 0.1871, + "step": 7482 + }, + { + "epoch": 1.5144707549079133, + "grad_norm": 0.30782490968704224, + "learning_rate": 2.777108864814333e-05, + "loss": 0.2136, + "step": 7483 + }, + { + "epoch": 1.5146731430884435, + "grad_norm": 0.23784612119197845, + "learning_rate": 2.7749094752904292e-05, + "loss": 0.1535, + "step": 7484 + }, + { + "epoch": 1.514875531268974, + "grad_norm": 0.27230408787727356, + "learning_rate": 2.7727108167253712e-05, + "loss": 0.216, + "step": 7485 + }, + { + "epoch": 1.515077919449504, + "grad_norm": 0.28563883900642395, + "learning_rate": 2.7705128893415987e-05, + "loss": 0.1972, + "step": 7486 + }, + { + "epoch": 1.5152803076300345, + "grad_norm": 0.300144761800766, + "learning_rate": 2.768315693361474e-05, + "loss": 0.2125, + "step": 7487 + }, + { + "epoch": 1.5154826958105647, + "grad_norm": 0.2823977768421173, + "learning_rate": 2.7661192290072857e-05, + "loss": 0.1948, + "step": 7488 + }, + { + "epoch": 1.515685083991095, + "grad_norm": 0.3386089503765106, + "learning_rate": 2.7639234965012505e-05, + "loss": 0.2125, + "step": 7489 + }, + { + "epoch": 1.5158874721716251, + "grad_norm": 0.3780275881290436, + "learning_rate": 2.7617284960655075e-05, + "loss": 0.2796, + "step": 7490 + }, + { + "epoch": 1.5160898603521553, + "grad_norm": 0.23867158591747284, + "learning_rate": 2.7595342279221258e-05, + "loss": 0.1654, + "step": 7491 + }, + { + "epoch": 1.5162922485326857, + "grad_norm": 0.263566255569458, + "learning_rate": 2.7573406922930978e-05, + "loss": 0.1919, + "step": 7492 + }, + { + "epoch": 1.516494636713216, + "grad_norm": 0.28585851192474365, + "learning_rate": 2.7551478894003413e-05, + "loss": 0.1762, + "step": 7493 + }, + { + "epoch": 1.5166970248937464, + "grad_norm": 0.2942905128002167, + "learning_rate": 2.7529558194657024e-05, + "loss": 0.2138, + "step": 7494 + }, + { + "epoch": 1.5168994130742766, + "grad_norm": 0.285055011510849, + "learning_rate": 2.7507644827109514e-05, + "loss": 0.1866, + "step": 7495 + }, + { + "epoch": 1.5171018012548068, + "grad_norm": 0.3072595000267029, + "learning_rate": 2.748573879357784e-05, + "loss": 0.2122, + "step": 7496 + }, + { + "epoch": 1.517304189435337, + "grad_norm": 0.24192121624946594, + "learning_rate": 2.7463840096278236e-05, + "loss": 0.1836, + "step": 7497 + }, + { + "epoch": 1.5175065776158672, + "grad_norm": 0.2530333697795868, + "learning_rate": 2.7441948737426183e-05, + "loss": 0.1618, + "step": 7498 + }, + { + "epoch": 1.5177089657963974, + "grad_norm": 0.3010926842689514, + "learning_rate": 2.7420064719236404e-05, + "loss": 0.1759, + "step": 7499 + }, + { + "epoch": 1.5179113539769278, + "grad_norm": 0.26380324363708496, + "learning_rate": 2.7398188043922912e-05, + "loss": 0.1671, + "step": 7500 + }, + { + "epoch": 1.5179113539769278, + "eval_loss": 0.26698416471481323, + "eval_runtime": 0.7399, + "eval_samples_per_second": 6.757, + "eval_steps_per_second": 1.351, + "step": 7500 + }, + { + "epoch": 1.518113742157458, + "grad_norm": 0.31068721413612366, + "learning_rate": 2.7376318713698957e-05, + "loss": 0.208, + "step": 7501 + }, + { + "epoch": 1.5183161303379884, + "grad_norm": 0.24927808344364166, + "learning_rate": 2.7354456730777035e-05, + "loss": 0.1741, + "step": 7502 + }, + { + "epoch": 1.5185185185185186, + "grad_norm": 0.2789771258831024, + "learning_rate": 2.733260209736891e-05, + "loss": 0.1886, + "step": 7503 + }, + { + "epoch": 1.5187209066990488, + "grad_norm": 0.28663644194602966, + "learning_rate": 2.7310754815685624e-05, + "loss": 0.159, + "step": 7504 + }, + { + "epoch": 1.518923294879579, + "grad_norm": 0.254955917596817, + "learning_rate": 2.7288914887937456e-05, + "loss": 0.1687, + "step": 7505 + }, + { + "epoch": 1.5191256830601092, + "grad_norm": 0.28779321908950806, + "learning_rate": 2.7267082316333913e-05, + "loss": 0.1851, + "step": 7506 + }, + { + "epoch": 1.5193280712406394, + "grad_norm": 0.2844981551170349, + "learning_rate": 2.724525710308381e-05, + "loss": 0.2195, + "step": 7507 + }, + { + "epoch": 1.5195304594211698, + "grad_norm": 0.3059309124946594, + "learning_rate": 2.7223439250395188e-05, + "loss": 0.198, + "step": 7508 + }, + { + "epoch": 1.5197328476017, + "grad_norm": 0.2609426975250244, + "learning_rate": 2.7201628760475352e-05, + "loss": 0.1752, + "step": 7509 + }, + { + "epoch": 1.5199352357822304, + "grad_norm": 0.2921501398086548, + "learning_rate": 2.717982563553084e-05, + "loss": 0.1863, + "step": 7510 + }, + { + "epoch": 1.5201376239627606, + "grad_norm": 0.3341974914073944, + "learning_rate": 2.715802987776749e-05, + "loss": 0.1851, + "step": 7511 + }, + { + "epoch": 1.5203400121432908, + "grad_norm": 0.27599576115608215, + "learning_rate": 2.7136241489390356e-05, + "loss": 0.187, + "step": 7512 + }, + { + "epoch": 1.520542400323821, + "grad_norm": 0.23091015219688416, + "learning_rate": 2.7114460472603754e-05, + "loss": 0.1639, + "step": 7513 + }, + { + "epoch": 1.5207447885043512, + "grad_norm": 0.2507723569869995, + "learning_rate": 2.709268682961126e-05, + "loss": 0.1831, + "step": 7514 + }, + { + "epoch": 1.5209471766848814, + "grad_norm": 0.29208138585090637, + "learning_rate": 2.7070920562615733e-05, + "loss": 0.2016, + "step": 7515 + }, + { + "epoch": 1.5211495648654119, + "grad_norm": 0.24572382867336273, + "learning_rate": 2.704916167381919e-05, + "loss": 0.1622, + "step": 7516 + }, + { + "epoch": 1.521351953045942, + "grad_norm": 0.2684371769428253, + "learning_rate": 2.7027410165423016e-05, + "loss": 0.1711, + "step": 7517 + }, + { + "epoch": 1.5215543412264725, + "grad_norm": 0.2745975852012634, + "learning_rate": 2.7005666039627788e-05, + "loss": 0.1599, + "step": 7518 + }, + { + "epoch": 1.5217567294070027, + "grad_norm": 0.2740444839000702, + "learning_rate": 2.6983929298633348e-05, + "loss": 0.189, + "step": 7519 + }, + { + "epoch": 1.5219591175875329, + "grad_norm": 0.32473820447921753, + "learning_rate": 2.6962199944638788e-05, + "loss": 0.2143, + "step": 7520 + }, + { + "epoch": 1.522161505768063, + "grad_norm": 0.29582443833351135, + "learning_rate": 2.694047797984247e-05, + "loss": 0.2101, + "step": 7521 + }, + { + "epoch": 1.5223638939485933, + "grad_norm": 0.31342431902885437, + "learning_rate": 2.6918763406441973e-05, + "loss": 0.1809, + "step": 7522 + }, + { + "epoch": 1.5225662821291237, + "grad_norm": 0.2780219614505768, + "learning_rate": 2.6897056226634175e-05, + "loss": 0.1788, + "step": 7523 + }, + { + "epoch": 1.522768670309654, + "grad_norm": 0.27648964524269104, + "learning_rate": 2.6875356442615162e-05, + "loss": 0.1759, + "step": 7524 + }, + { + "epoch": 1.5229710584901843, + "grad_norm": 0.3273850083351135, + "learning_rate": 2.6853664056580285e-05, + "loss": 0.2186, + "step": 7525 + }, + { + "epoch": 1.5231734466707145, + "grad_norm": 0.2797602117061615, + "learning_rate": 2.6831979070724177e-05, + "loss": 0.1941, + "step": 7526 + }, + { + "epoch": 1.5233758348512447, + "grad_norm": 0.3109395503997803, + "learning_rate": 2.6810301487240686e-05, + "loss": 0.1836, + "step": 7527 + }, + { + "epoch": 1.523578223031775, + "grad_norm": 0.25964033603668213, + "learning_rate": 2.6788631308322908e-05, + "loss": 0.1594, + "step": 7528 + }, + { + "epoch": 1.5237806112123051, + "grad_norm": 0.38888150453567505, + "learning_rate": 2.6766968536163218e-05, + "loss": 0.2233, + "step": 7529 + }, + { + "epoch": 1.5239829993928353, + "grad_norm": 0.3075036406517029, + "learning_rate": 2.6745313172953233e-05, + "loss": 0.2083, + "step": 7530 + }, + { + "epoch": 1.5241853875733657, + "grad_norm": 0.2736060619354248, + "learning_rate": 2.6723665220883798e-05, + "loss": 0.1835, + "step": 7531 + }, + { + "epoch": 1.524387775753896, + "grad_norm": 0.2539604902267456, + "learning_rate": 2.6702024682145043e-05, + "loss": 0.1743, + "step": 7532 + }, + { + "epoch": 1.5245901639344264, + "grad_norm": 0.2822546660900116, + "learning_rate": 2.6680391558926333e-05, + "loss": 0.1909, + "step": 7533 + }, + { + "epoch": 1.5247925521149566, + "grad_norm": 0.34847140312194824, + "learning_rate": 2.6658765853416256e-05, + "loss": 0.2242, + "step": 7534 + }, + { + "epoch": 1.5249949402954868, + "grad_norm": 0.3049599528312683, + "learning_rate": 2.663714756780269e-05, + "loss": 0.196, + "step": 7535 + }, + { + "epoch": 1.525197328476017, + "grad_norm": 0.2608265280723572, + "learning_rate": 2.661553670427276e-05, + "loss": 0.1903, + "step": 7536 + }, + { + "epoch": 1.5253997166565472, + "grad_norm": 0.31801798939704895, + "learning_rate": 2.6593933265012794e-05, + "loss": 0.2038, + "step": 7537 + }, + { + "epoch": 1.5256021048370774, + "grad_norm": 0.31101664900779724, + "learning_rate": 2.6572337252208455e-05, + "loss": 0.1776, + "step": 7538 + }, + { + "epoch": 1.5258044930176078, + "grad_norm": 0.25785762071609497, + "learning_rate": 2.6550748668044512e-05, + "loss": 0.2037, + "step": 7539 + }, + { + "epoch": 1.526006881198138, + "grad_norm": 0.3033619523048401, + "learning_rate": 2.6529167514705144e-05, + "loss": 0.1924, + "step": 7540 + }, + { + "epoch": 1.5262092693786684, + "grad_norm": 0.2987357974052429, + "learning_rate": 2.6507593794373696e-05, + "loss": 0.1882, + "step": 7541 + }, + { + "epoch": 1.5264116575591986, + "grad_norm": 0.347606897354126, + "learning_rate": 2.648602750923276e-05, + "loss": 0.2135, + "step": 7542 + }, + { + "epoch": 1.5266140457397288, + "grad_norm": 0.28092533349990845, + "learning_rate": 2.6464468661464183e-05, + "loss": 0.2041, + "step": 7543 + }, + { + "epoch": 1.526816433920259, + "grad_norm": 0.2929527163505554, + "learning_rate": 2.6442917253249065e-05, + "loss": 0.1787, + "step": 7544 + }, + { + "epoch": 1.5270188221007892, + "grad_norm": 0.30891942977905273, + "learning_rate": 2.6421373286767758e-05, + "loss": 0.1835, + "step": 7545 + }, + { + "epoch": 1.5272212102813194, + "grad_norm": 0.331102579832077, + "learning_rate": 2.6399836764199846e-05, + "loss": 0.2143, + "step": 7546 + }, + { + "epoch": 1.5274235984618498, + "grad_norm": 0.34742066264152527, + "learning_rate": 2.637830768772418e-05, + "loss": 0.2306, + "step": 7547 + }, + { + "epoch": 1.5276259866423803, + "grad_norm": 0.2488187551498413, + "learning_rate": 2.6356786059518833e-05, + "loss": 0.1625, + "step": 7548 + }, + { + "epoch": 1.5278283748229105, + "grad_norm": 0.294069766998291, + "learning_rate": 2.6335271881761148e-05, + "loss": 0.1875, + "step": 7549 + }, + { + "epoch": 1.5280307630034407, + "grad_norm": 0.3056446611881256, + "learning_rate": 2.631376515662769e-05, + "loss": 0.2226, + "step": 7550 + }, + { + "epoch": 1.5280307630034407, + "eval_loss": 0.2665191888809204, + "eval_runtime": 0.7394, + "eval_samples_per_second": 6.762, + "eval_steps_per_second": 1.352, + "step": 7550 + }, + { + "epoch": 1.5282331511839709, + "grad_norm": 0.3098391890525818, + "learning_rate": 2.62922658862943e-05, + "loss": 0.2245, + "step": 7551 + }, + { + "epoch": 1.528435539364501, + "grad_norm": 0.2537216544151306, + "learning_rate": 2.6270774072936033e-05, + "loss": 0.1839, + "step": 7552 + }, + { + "epoch": 1.5286379275450312, + "grad_norm": 0.25821807980537415, + "learning_rate": 2.624928971872722e-05, + "loss": 0.1435, + "step": 7553 + }, + { + "epoch": 1.5288403157255617, + "grad_norm": 0.2951766550540924, + "learning_rate": 2.6227812825841412e-05, + "loss": 0.1989, + "step": 7554 + }, + { + "epoch": 1.5290427039060919, + "grad_norm": 0.2677651047706604, + "learning_rate": 2.6206343396451427e-05, + "loss": 0.1952, + "step": 7555 + }, + { + "epoch": 1.5292450920866223, + "grad_norm": 0.25762563943862915, + "learning_rate": 2.6184881432729304e-05, + "loss": 0.2038, + "step": 7556 + }, + { + "epoch": 1.5294474802671525, + "grad_norm": 0.2664134204387665, + "learning_rate": 2.6163426936846346e-05, + "loss": 0.2034, + "step": 7557 + }, + { + "epoch": 1.5296498684476827, + "grad_norm": 0.26822999119758606, + "learning_rate": 2.614197991097309e-05, + "loss": 0.1711, + "step": 7558 + }, + { + "epoch": 1.529852256628213, + "grad_norm": 0.25438639521598816, + "learning_rate": 2.612054035727932e-05, + "loss": 0.2009, + "step": 7559 + }, + { + "epoch": 1.530054644808743, + "grad_norm": 0.2914297878742218, + "learning_rate": 2.6099108277934103e-05, + "loss": 0.1888, + "step": 7560 + }, + { + "epoch": 1.5302570329892733, + "grad_norm": 0.2948731780052185, + "learning_rate": 2.6077683675105645e-05, + "loss": 0.2113, + "step": 7561 + }, + { + "epoch": 1.5304594211698037, + "grad_norm": 0.28212210536003113, + "learning_rate": 2.6056266550961495e-05, + "loss": 0.1551, + "step": 7562 + }, + { + "epoch": 1.530661809350334, + "grad_norm": 0.2819977104663849, + "learning_rate": 2.6034856907668414e-05, + "loss": 0.1761, + "step": 7563 + }, + { + "epoch": 1.5308641975308643, + "grad_norm": 0.24788232147693634, + "learning_rate": 2.6013454747392408e-05, + "loss": 0.1629, + "step": 7564 + }, + { + "epoch": 1.5310665857113945, + "grad_norm": 0.22148968279361725, + "learning_rate": 2.599206007229872e-05, + "loss": 0.1467, + "step": 7565 + }, + { + "epoch": 1.5312689738919247, + "grad_norm": 0.2706809937953949, + "learning_rate": 2.5970672884551826e-05, + "loss": 0.1757, + "step": 7566 + }, + { + "epoch": 1.531471362072455, + "grad_norm": 0.2636460065841675, + "learning_rate": 2.594929318631547e-05, + "loss": 0.1862, + "step": 7567 + }, + { + "epoch": 1.5316737502529851, + "grad_norm": 0.2994755804538727, + "learning_rate": 2.592792097975263e-05, + "loss": 0.1901, + "step": 7568 + }, + { + "epoch": 1.5318761384335153, + "grad_norm": 0.28958430886268616, + "learning_rate": 2.5906556267025517e-05, + "loss": 0.1998, + "step": 7569 + }, + { + "epoch": 1.5320785266140458, + "grad_norm": 0.24085618555545807, + "learning_rate": 2.5885199050295585e-05, + "loss": 0.1703, + "step": 7570 + }, + { + "epoch": 1.532280914794576, + "grad_norm": 0.2558616101741791, + "learning_rate": 2.5863849331723532e-05, + "loss": 0.1694, + "step": 7571 + }, + { + "epoch": 1.5324833029751064, + "grad_norm": 0.32883763313293457, + "learning_rate": 2.5842507113469304e-05, + "loss": 0.2227, + "step": 7572 + }, + { + "epoch": 1.5326856911556366, + "grad_norm": 0.26645487546920776, + "learning_rate": 2.5821172397692085e-05, + "loss": 0.1865, + "step": 7573 + }, + { + "epoch": 1.5328880793361668, + "grad_norm": 0.3352511525154114, + "learning_rate": 2.5799845186550285e-05, + "loss": 0.1929, + "step": 7574 + }, + { + "epoch": 1.533090467516697, + "grad_norm": 0.2677745223045349, + "learning_rate": 2.5778525482201575e-05, + "loss": 0.1783, + "step": 7575 + }, + { + "epoch": 1.5332928556972272, + "grad_norm": 0.2944205105304718, + "learning_rate": 2.5757213286802873e-05, + "loss": 0.1758, + "step": 7576 + }, + { + "epoch": 1.5334952438777574, + "grad_norm": 0.2755123972892761, + "learning_rate": 2.5735908602510294e-05, + "loss": 0.1712, + "step": 7577 + }, + { + "epoch": 1.5336976320582878, + "grad_norm": 0.33217665553092957, + "learning_rate": 2.571461143147925e-05, + "loss": 0.1853, + "step": 7578 + }, + { + "epoch": 1.5339000202388182, + "grad_norm": 0.3142439126968384, + "learning_rate": 2.5693321775864356e-05, + "loss": 0.2122, + "step": 7579 + }, + { + "epoch": 1.5341024084193484, + "grad_norm": 0.2924261689186096, + "learning_rate": 2.5672039637819456e-05, + "loss": 0.1793, + "step": 7580 + }, + { + "epoch": 1.5343047965998786, + "grad_norm": 0.2553829550743103, + "learning_rate": 2.565076501949769e-05, + "loss": 0.1344, + "step": 7581 + }, + { + "epoch": 1.5345071847804088, + "grad_norm": 0.2889065444469452, + "learning_rate": 2.5629497923051404e-05, + "loss": 0.1833, + "step": 7582 + }, + { + "epoch": 1.534709572960939, + "grad_norm": 0.2888411581516266, + "learning_rate": 2.5608238350632118e-05, + "loss": 0.1948, + "step": 7583 + }, + { + "epoch": 1.5349119611414692, + "grad_norm": 0.3007155954837799, + "learning_rate": 2.5586986304390704e-05, + "loss": 0.218, + "step": 7584 + }, + { + "epoch": 1.5351143493219996, + "grad_norm": 0.3101598620414734, + "learning_rate": 2.5565741786477204e-05, + "loss": 0.1716, + "step": 7585 + }, + { + "epoch": 1.5353167375025298, + "grad_norm": 0.29315635561943054, + "learning_rate": 2.5544504799040925e-05, + "loss": 0.1707, + "step": 7586 + }, + { + "epoch": 1.5355191256830603, + "grad_norm": 0.3432648777961731, + "learning_rate": 2.552327534423039e-05, + "loss": 0.2077, + "step": 7587 + }, + { + "epoch": 1.5357215138635905, + "grad_norm": 0.28879377245903015, + "learning_rate": 2.5502053424193384e-05, + "loss": 0.1842, + "step": 7588 + }, + { + "epoch": 1.5359239020441207, + "grad_norm": 0.2736664414405823, + "learning_rate": 2.548083904107692e-05, + "loss": 0.201, + "step": 7589 + }, + { + "epoch": 1.5361262902246509, + "grad_norm": 0.24728530645370483, + "learning_rate": 2.545963219702724e-05, + "loss": 0.1777, + "step": 7590 + }, + { + "epoch": 1.536328678405181, + "grad_norm": 0.3157190978527069, + "learning_rate": 2.5438432894189824e-05, + "loss": 0.2011, + "step": 7591 + }, + { + "epoch": 1.5365310665857113, + "grad_norm": 0.28984534740448, + "learning_rate": 2.5417241134709403e-05, + "loss": 0.2377, + "step": 7592 + }, + { + "epoch": 1.5367334547662417, + "grad_norm": 0.2625545263290405, + "learning_rate": 2.539605692072994e-05, + "loss": 0.2019, + "step": 7593 + }, + { + "epoch": 1.5369358429467719, + "grad_norm": 0.2867405116558075, + "learning_rate": 2.5374880254394628e-05, + "loss": 0.1882, + "step": 7594 + }, + { + "epoch": 1.5371382311273023, + "grad_norm": 0.3031767010688782, + "learning_rate": 2.5353711137845892e-05, + "loss": 0.1957, + "step": 7595 + }, + { + "epoch": 1.5373406193078325, + "grad_norm": 0.254905641078949, + "learning_rate": 2.5332549573225416e-05, + "loss": 0.182, + "step": 7596 + }, + { + "epoch": 1.5375430074883627, + "grad_norm": 0.5628305077552795, + "learning_rate": 2.5311395562674066e-05, + "loss": 0.1782, + "step": 7597 + }, + { + "epoch": 1.537745395668893, + "grad_norm": 0.35798123478889465, + "learning_rate": 2.5290249108332042e-05, + "loss": 0.2021, + "step": 7598 + }, + { + "epoch": 1.537947783849423, + "grad_norm": 0.3951321840286255, + "learning_rate": 2.5269110212338697e-05, + "loss": 0.2244, + "step": 7599 + }, + { + "epoch": 1.5381501720299533, + "grad_norm": 0.3249993622303009, + "learning_rate": 2.5247978876832633e-05, + "loss": 0.1882, + "step": 7600 + }, + { + "epoch": 1.5381501720299533, + "eval_loss": 0.2659483253955841, + "eval_runtime": 0.7356, + "eval_samples_per_second": 6.797, + "eval_steps_per_second": 1.359, + "step": 7600 + }, + { + "epoch": 1.5383525602104837, + "grad_norm": 0.300327330827713, + "learning_rate": 2.5226855103951706e-05, + "loss": 0.1931, + "step": 7601 + }, + { + "epoch": 1.538554948391014, + "grad_norm": 0.2503814995288849, + "learning_rate": 2.5205738895832998e-05, + "loss": 0.189, + "step": 7602 + }, + { + "epoch": 1.5387573365715443, + "grad_norm": 0.27575522661209106, + "learning_rate": 2.5184630254612817e-05, + "loss": 0.2085, + "step": 7603 + }, + { + "epoch": 1.5389597247520745, + "grad_norm": 0.2803657054901123, + "learning_rate": 2.516352918242675e-05, + "loss": 0.1885, + "step": 7604 + }, + { + "epoch": 1.5391621129326047, + "grad_norm": 0.2981926500797272, + "learning_rate": 2.5142435681409516e-05, + "loss": 0.2364, + "step": 7605 + }, + { + "epoch": 1.539364501113135, + "grad_norm": 0.2323596477508545, + "learning_rate": 2.5121349753695168e-05, + "loss": 0.1475, + "step": 7606 + }, + { + "epoch": 1.5395668892936651, + "grad_norm": 0.27811673283576965, + "learning_rate": 2.5100271401416962e-05, + "loss": 0.1532, + "step": 7607 + }, + { + "epoch": 1.5397692774741953, + "grad_norm": 0.3188576102256775, + "learning_rate": 2.5079200626707377e-05, + "loss": 0.2101, + "step": 7608 + }, + { + "epoch": 1.5399716656547258, + "grad_norm": 0.2822002172470093, + "learning_rate": 2.505813743169815e-05, + "loss": 0.2018, + "step": 7609 + }, + { + "epoch": 1.5401740538352562, + "grad_norm": 0.26504015922546387, + "learning_rate": 2.50370818185202e-05, + "loss": 0.1743, + "step": 7610 + }, + { + "epoch": 1.5403764420157864, + "grad_norm": 0.27246421575546265, + "learning_rate": 2.501603378930375e-05, + "loss": 0.2185, + "step": 7611 + }, + { + "epoch": 1.5405788301963166, + "grad_norm": 0.2715609073638916, + "learning_rate": 2.499499334617821e-05, + "loss": 0.2015, + "step": 7612 + }, + { + "epoch": 1.5407812183768468, + "grad_norm": 0.2291431874036789, + "learning_rate": 2.4973960491272207e-05, + "loss": 0.178, + "step": 7613 + }, + { + "epoch": 1.540983606557377, + "grad_norm": 0.34131288528442383, + "learning_rate": 2.495293522671366e-05, + "loss": 0.2127, + "step": 7614 + }, + { + "epoch": 1.5411859947379072, + "grad_norm": 0.24734556674957275, + "learning_rate": 2.4931917554629656e-05, + "loss": 0.1733, + "step": 7615 + }, + { + "epoch": 1.5413883829184376, + "grad_norm": 0.3385285437107086, + "learning_rate": 2.491090747714655e-05, + "loss": 0.2291, + "step": 7616 + }, + { + "epoch": 1.5415907710989678, + "grad_norm": 0.27316218614578247, + "learning_rate": 2.4889904996389936e-05, + "loss": 0.1838, + "step": 7617 + }, + { + "epoch": 1.5417931592794982, + "grad_norm": 0.27246224880218506, + "learning_rate": 2.48689101144846e-05, + "loss": 0.1803, + "step": 7618 + }, + { + "epoch": 1.5419955474600284, + "grad_norm": 0.27257344126701355, + "learning_rate": 2.4847922833554603e-05, + "loss": 0.1823, + "step": 7619 + }, + { + "epoch": 1.5421979356405586, + "grad_norm": 0.2625625431537628, + "learning_rate": 2.4826943155723215e-05, + "loss": 0.1701, + "step": 7620 + }, + { + "epoch": 1.5424003238210888, + "grad_norm": 0.3060097396373749, + "learning_rate": 2.4805971083112933e-05, + "loss": 0.1876, + "step": 7621 + }, + { + "epoch": 1.542602712001619, + "grad_norm": 0.2705547511577606, + "learning_rate": 2.4785006617845497e-05, + "loss": 0.1905, + "step": 7622 + }, + { + "epoch": 1.5428051001821492, + "grad_norm": 0.2615959644317627, + "learning_rate": 2.4764049762041874e-05, + "loss": 0.1679, + "step": 7623 + }, + { + "epoch": 1.5430074883626796, + "grad_norm": 0.2921803891658783, + "learning_rate": 2.474310051782225e-05, + "loss": 0.1873, + "step": 7624 + }, + { + "epoch": 1.5432098765432098, + "grad_norm": 0.2853137254714966, + "learning_rate": 2.4722158887306047e-05, + "loss": 0.1915, + "step": 7625 + }, + { + "epoch": 1.5434122647237403, + "grad_norm": 0.32528531551361084, + "learning_rate": 2.470122487261194e-05, + "loss": 0.2114, + "step": 7626 + }, + { + "epoch": 1.5436146529042705, + "grad_norm": 0.3797064423561096, + "learning_rate": 2.468029847585781e-05, + "loss": 0.2072, + "step": 7627 + }, + { + "epoch": 1.5438170410848007, + "grad_norm": 0.23940971493721008, + "learning_rate": 2.4659379699160746e-05, + "loss": 0.1513, + "step": 7628 + }, + { + "epoch": 1.5440194292653309, + "grad_norm": 0.28272753953933716, + "learning_rate": 2.4638468544637093e-05, + "loss": 0.1674, + "step": 7629 + }, + { + "epoch": 1.544221817445861, + "grad_norm": 0.28421708941459656, + "learning_rate": 2.4617565014402444e-05, + "loss": 0.1915, + "step": 7630 + }, + { + "epoch": 1.5444242056263913, + "grad_norm": 0.2440253049135208, + "learning_rate": 2.459666911057158e-05, + "loss": 0.1747, + "step": 7631 + }, + { + "epoch": 1.5446265938069217, + "grad_norm": 0.26639822125434875, + "learning_rate": 2.4575780835258544e-05, + "loss": 0.1807, + "step": 7632 + }, + { + "epoch": 1.5448289819874519, + "grad_norm": 0.30751553177833557, + "learning_rate": 2.455490019057658e-05, + "loss": 0.2205, + "step": 7633 + }, + { + "epoch": 1.5450313701679823, + "grad_norm": 0.2998270094394684, + "learning_rate": 2.4534027178638184e-05, + "loss": 0.1782, + "step": 7634 + }, + { + "epoch": 1.5452337583485125, + "grad_norm": 0.29298698902130127, + "learning_rate": 2.451316180155505e-05, + "loss": 0.1996, + "step": 7635 + }, + { + "epoch": 1.5454361465290427, + "grad_norm": 0.317030131816864, + "learning_rate": 2.4492304061438143e-05, + "loss": 0.1811, + "step": 7636 + }, + { + "epoch": 1.545638534709573, + "grad_norm": 0.2533614933490753, + "learning_rate": 2.4471453960397617e-05, + "loss": 0.1518, + "step": 7637 + }, + { + "epoch": 1.545840922890103, + "grad_norm": 0.34969180822372437, + "learning_rate": 2.4450611500542864e-05, + "loss": 0.2098, + "step": 7638 + }, + { + "epoch": 1.5460433110706335, + "grad_norm": 0.26460447907447815, + "learning_rate": 2.442977668398251e-05, + "loss": 0.1834, + "step": 7639 + }, + { + "epoch": 1.5462456992511637, + "grad_norm": 0.29677653312683105, + "learning_rate": 2.44089495128244e-05, + "loss": 0.1944, + "step": 7640 + }, + { + "epoch": 1.5464480874316942, + "grad_norm": 0.24911251664161682, + "learning_rate": 2.4388129989175613e-05, + "loss": 0.1702, + "step": 7641 + }, + { + "epoch": 1.5466504756122244, + "grad_norm": 0.28196683526039124, + "learning_rate": 2.4367318115142446e-05, + "loss": 0.1501, + "step": 7642 + }, + { + "epoch": 1.5468528637927546, + "grad_norm": 0.32040929794311523, + "learning_rate": 2.4346513892830423e-05, + "loss": 0.2017, + "step": 7643 + }, + { + "epoch": 1.5470552519732848, + "grad_norm": 0.319196492433548, + "learning_rate": 2.432571732434431e-05, + "loss": 0.2117, + "step": 7644 + }, + { + "epoch": 1.547257640153815, + "grad_norm": 0.3500312268733978, + "learning_rate": 2.4304928411788064e-05, + "loss": 0.2345, + "step": 7645 + }, + { + "epoch": 1.5474600283343452, + "grad_norm": 0.30522117018699646, + "learning_rate": 2.4284147157264913e-05, + "loss": 0.2093, + "step": 7646 + }, + { + "epoch": 1.5476624165148756, + "grad_norm": 0.3220088481903076, + "learning_rate": 2.4263373562877278e-05, + "loss": 0.1842, + "step": 7647 + }, + { + "epoch": 1.5478648046954058, + "grad_norm": 0.2923586368560791, + "learning_rate": 2.42426076307268e-05, + "loss": 0.2043, + "step": 7648 + }, + { + "epoch": 1.5480671928759362, + "grad_norm": 0.2700873911380768, + "learning_rate": 2.4221849362914373e-05, + "loss": 0.1741, + "step": 7649 + }, + { + "epoch": 1.5482695810564664, + "grad_norm": 0.2897738516330719, + "learning_rate": 2.4201098761540098e-05, + "loss": 0.179, + "step": 7650 + }, + { + "epoch": 1.5482695810564664, + "eval_loss": 0.26571616530418396, + "eval_runtime": 0.7387, + "eval_samples_per_second": 6.769, + "eval_steps_per_second": 1.354, + "step": 7650 + }, + { + "epoch": 1.5484719692369966, + "grad_norm": 0.2596552073955536, + "learning_rate": 2.4180355828703303e-05, + "loss": 0.1644, + "step": 7651 + }, + { + "epoch": 1.5486743574175268, + "grad_norm": 0.2775457203388214, + "learning_rate": 2.415962056650254e-05, + "loss": 0.194, + "step": 7652 + }, + { + "epoch": 1.548876745598057, + "grad_norm": 0.257010281085968, + "learning_rate": 2.4138892977035576e-05, + "loss": 0.173, + "step": 7653 + }, + { + "epoch": 1.5490791337785872, + "grad_norm": 0.2591480612754822, + "learning_rate": 2.4118173062399418e-05, + "loss": 0.1568, + "step": 7654 + }, + { + "epoch": 1.5492815219591176, + "grad_norm": 0.2812730371952057, + "learning_rate": 2.40974608246903e-05, + "loss": 0.1773, + "step": 7655 + }, + { + "epoch": 1.5494839101396478, + "grad_norm": 0.3284202814102173, + "learning_rate": 2.4076756266003652e-05, + "loss": 0.1861, + "step": 7656 + }, + { + "epoch": 1.5496862983201782, + "grad_norm": 0.3464701473712921, + "learning_rate": 2.405605938843416e-05, + "loss": 0.1946, + "step": 7657 + }, + { + "epoch": 1.5498886865007084, + "grad_norm": 0.27741116285324097, + "learning_rate": 2.40353701940757e-05, + "loss": 0.199, + "step": 7658 + }, + { + "epoch": 1.5500910746812386, + "grad_norm": 0.2622848451137543, + "learning_rate": 2.4014688685021402e-05, + "loss": 0.1846, + "step": 7659 + }, + { + "epoch": 1.5502934628617688, + "grad_norm": 0.29812902212142944, + "learning_rate": 2.399401486336359e-05, + "loss": 0.1926, + "step": 7660 + }, + { + "epoch": 1.550495851042299, + "grad_norm": 0.2542051672935486, + "learning_rate": 2.3973348731193834e-05, + "loss": 0.1708, + "step": 7661 + }, + { + "epoch": 1.5506982392228292, + "grad_norm": 0.37435442209243774, + "learning_rate": 2.395269029060292e-05, + "loss": 0.1908, + "step": 7662 + }, + { + "epoch": 1.5509006274033597, + "grad_norm": 0.28960543870925903, + "learning_rate": 2.393203954368085e-05, + "loss": 0.1697, + "step": 7663 + }, + { + "epoch": 1.5511030155838899, + "grad_norm": 0.30785804986953735, + "learning_rate": 2.3911396492516836e-05, + "loss": 0.1839, + "step": 7664 + }, + { + "epoch": 1.5513054037644203, + "grad_norm": 0.31761255860328674, + "learning_rate": 2.3890761139199346e-05, + "loss": 0.2377, + "step": 7665 + }, + { + "epoch": 1.5515077919449505, + "grad_norm": 0.28986719250679016, + "learning_rate": 2.387013348581604e-05, + "loss": 0.2144, + "step": 7666 + }, + { + "epoch": 1.5517101801254807, + "grad_norm": 0.25136709213256836, + "learning_rate": 2.3849513534453793e-05, + "loss": 0.1715, + "step": 7667 + }, + { + "epoch": 1.5519125683060109, + "grad_norm": 0.2762148082256317, + "learning_rate": 2.3828901287198746e-05, + "loss": 0.1847, + "step": 7668 + }, + { + "epoch": 1.552114956486541, + "grad_norm": 0.3051958382129669, + "learning_rate": 2.3808296746136195e-05, + "loss": 0.2113, + "step": 7669 + }, + { + "epoch": 1.5523173446670715, + "grad_norm": 0.2701307237148285, + "learning_rate": 2.3787699913350724e-05, + "loss": 0.191, + "step": 7670 + }, + { + "epoch": 1.5525197328476017, + "grad_norm": 0.31726446747779846, + "learning_rate": 2.3767110790926107e-05, + "loss": 0.2092, + "step": 7671 + }, + { + "epoch": 1.5527221210281321, + "grad_norm": 0.27608180046081543, + "learning_rate": 2.3746529380945292e-05, + "loss": 0.156, + "step": 7672 + }, + { + "epoch": 1.5529245092086623, + "grad_norm": 0.2835337519645691, + "learning_rate": 2.372595568549052e-05, + "loss": 0.2033, + "step": 7673 + }, + { + "epoch": 1.5531268973891925, + "grad_norm": 0.2694108188152313, + "learning_rate": 2.370538970664321e-05, + "loss": 0.1905, + "step": 7674 + }, + { + "epoch": 1.5533292855697227, + "grad_norm": 0.2937160134315491, + "learning_rate": 2.3684831446484025e-05, + "loss": 0.1881, + "step": 7675 + }, + { + "epoch": 1.553531673750253, + "grad_norm": 0.27962058782577515, + "learning_rate": 2.366428090709283e-05, + "loss": 0.1992, + "step": 7676 + }, + { + "epoch": 1.5537340619307831, + "grad_norm": 0.31856977939605713, + "learning_rate": 2.3643738090548706e-05, + "loss": 0.2171, + "step": 7677 + }, + { + "epoch": 1.5539364501113135, + "grad_norm": 0.31848418712615967, + "learning_rate": 2.362320299892996e-05, + "loss": 0.2048, + "step": 7678 + }, + { + "epoch": 1.5541388382918437, + "grad_norm": 0.2699210047721863, + "learning_rate": 2.360267563431413e-05, + "loss": 0.2099, + "step": 7679 + }, + { + "epoch": 1.5543412264723742, + "grad_norm": 0.3069915473461151, + "learning_rate": 2.3582155998777954e-05, + "loss": 0.2065, + "step": 7680 + }, + { + "epoch": 1.5545436146529044, + "grad_norm": 0.24691063165664673, + "learning_rate": 2.3561644094397382e-05, + "loss": 0.1647, + "step": 7681 + }, + { + "epoch": 1.5547460028334346, + "grad_norm": 0.29513469338417053, + "learning_rate": 2.3541139923247614e-05, + "loss": 0.2003, + "step": 7682 + }, + { + "epoch": 1.5549483910139648, + "grad_norm": 0.2617926001548767, + "learning_rate": 2.3520643487403026e-05, + "loss": 0.1643, + "step": 7683 + }, + { + "epoch": 1.555150779194495, + "grad_norm": 0.26496002078056335, + "learning_rate": 2.3500154788937244e-05, + "loss": 0.2033, + "step": 7684 + }, + { + "epoch": 1.5553531673750252, + "grad_norm": 0.3225070536136627, + "learning_rate": 2.347967382992309e-05, + "loss": 0.184, + "step": 7685 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.2362879514694214, + "learning_rate": 2.345920061243263e-05, + "loss": 0.1651, + "step": 7686 + }, + { + "epoch": 1.5557579437360858, + "grad_norm": 0.2885676920413971, + "learning_rate": 2.3438735138537116e-05, + "loss": 0.1702, + "step": 7687 + }, + { + "epoch": 1.5559603319166162, + "grad_norm": 0.28008636832237244, + "learning_rate": 2.341827741030702e-05, + "loss": 0.1835, + "step": 7688 + }, + { + "epoch": 1.5561627200971464, + "grad_norm": 0.27259331941604614, + "learning_rate": 2.339782742981207e-05, + "loss": 0.1785, + "step": 7689 + }, + { + "epoch": 1.5563651082776766, + "grad_norm": 0.24857011437416077, + "learning_rate": 2.337738519912115e-05, + "loss": 0.1748, + "step": 7690 + }, + { + "epoch": 1.5565674964582068, + "grad_norm": 0.28130772709846497, + "learning_rate": 2.3356950720302405e-05, + "loss": 0.1955, + "step": 7691 + }, + { + "epoch": 1.556769884638737, + "grad_norm": 0.3307057023048401, + "learning_rate": 2.3336523995423188e-05, + "loss": 0.2333, + "step": 7692 + }, + { + "epoch": 1.5569722728192672, + "grad_norm": 0.2997671067714691, + "learning_rate": 2.331610502655005e-05, + "loss": 0.1984, + "step": 7693 + }, + { + "epoch": 1.5571746609997976, + "grad_norm": 0.25136569142341614, + "learning_rate": 2.3295693815748763e-05, + "loss": 0.1646, + "step": 7694 + }, + { + "epoch": 1.5573770491803278, + "grad_norm": 0.2711356282234192, + "learning_rate": 2.3275290365084336e-05, + "loss": 0.2123, + "step": 7695 + }, + { + "epoch": 1.5575794373608582, + "grad_norm": 0.2791728377342224, + "learning_rate": 2.3254894676620964e-05, + "loss": 0.2155, + "step": 7696 + }, + { + "epoch": 1.5577818255413884, + "grad_norm": 0.24361151456832886, + "learning_rate": 2.323450675242207e-05, + "loss": 0.1759, + "step": 7697 + }, + { + "epoch": 1.5579842137219186, + "grad_norm": 0.2774006426334381, + "learning_rate": 2.321412659455029e-05, + "loss": 0.1939, + "step": 7698 + }, + { + "epoch": 1.5581866019024488, + "grad_norm": 0.29601287841796875, + "learning_rate": 2.3193754205067475e-05, + "loss": 0.2084, + "step": 7699 + }, + { + "epoch": 1.558388990082979, + "grad_norm": 0.2615942656993866, + "learning_rate": 2.31733895860347e-05, + "loss": 0.216, + "step": 7700 + }, + { + "epoch": 1.558388990082979, + "eval_loss": 0.2588045597076416, + "eval_runtime": 0.7405, + "eval_samples_per_second": 6.752, + "eval_steps_per_second": 1.35, + "step": 7700 + }, + { + "epoch": 1.5585913782635095, + "grad_norm": 0.29083025455474854, + "learning_rate": 2.3153032739512226e-05, + "loss": 0.196, + "step": 7701 + }, + { + "epoch": 1.5587937664440397, + "grad_norm": 0.27147766947746277, + "learning_rate": 2.313268366755955e-05, + "loss": 0.1938, + "step": 7702 + }, + { + "epoch": 1.55899615462457, + "grad_norm": 0.2409001886844635, + "learning_rate": 2.3112342372235395e-05, + "loss": 0.1785, + "step": 7703 + }, + { + "epoch": 1.5591985428051003, + "grad_norm": 0.2982461452484131, + "learning_rate": 2.3092008855597657e-05, + "loss": 0.1916, + "step": 7704 + }, + { + "epoch": 1.5594009309856305, + "grad_norm": 0.30464914441108704, + "learning_rate": 2.307168311970347e-05, + "loss": 0.2271, + "step": 7705 + }, + { + "epoch": 1.5596033191661607, + "grad_norm": 0.2842462956905365, + "learning_rate": 2.3051365166609197e-05, + "loss": 0.1751, + "step": 7706 + }, + { + "epoch": 1.5598057073466909, + "grad_norm": 0.2824763357639313, + "learning_rate": 2.303105499837037e-05, + "loss": 0.192, + "step": 7707 + }, + { + "epoch": 1.560008095527221, + "grad_norm": 0.27954190969467163, + "learning_rate": 2.3010752617041786e-05, + "loss": 0.183, + "step": 7708 + }, + { + "epoch": 1.5602104837077515, + "grad_norm": 0.3235276937484741, + "learning_rate": 2.299045802467741e-05, + "loss": 0.2573, + "step": 7709 + }, + { + "epoch": 1.5604128718882817, + "grad_norm": 0.2770065367221832, + "learning_rate": 2.2970171223330438e-05, + "loss": 0.197, + "step": 7710 + }, + { + "epoch": 1.5606152600688121, + "grad_norm": 0.31573495268821716, + "learning_rate": 2.294989221505327e-05, + "loss": 0.219, + "step": 7711 + }, + { + "epoch": 1.5608176482493423, + "grad_norm": 0.31908997893333435, + "learning_rate": 2.292962100189754e-05, + "loss": 0.2058, + "step": 7712 + }, + { + "epoch": 1.5610200364298725, + "grad_norm": 0.2873445451259613, + "learning_rate": 2.290935758591406e-05, + "loss": 0.1732, + "step": 7713 + }, + { + "epoch": 1.5612224246104027, + "grad_norm": 0.2437688261270523, + "learning_rate": 2.2889101969152882e-05, + "loss": 0.1679, + "step": 7714 + }, + { + "epoch": 1.561424812790933, + "grad_norm": 0.2525308132171631, + "learning_rate": 2.2868854153663243e-05, + "loss": 0.1584, + "step": 7715 + }, + { + "epoch": 1.5616272009714631, + "grad_norm": 0.3853948414325714, + "learning_rate": 2.284861414149365e-05, + "loss": 0.2164, + "step": 7716 + }, + { + "epoch": 1.5618295891519935, + "grad_norm": 0.2611543536186218, + "learning_rate": 2.282838193469171e-05, + "loss": 0.1748, + "step": 7717 + }, + { + "epoch": 1.5620319773325237, + "grad_norm": 0.27541646361351013, + "learning_rate": 2.280815753530433e-05, + "loss": 0.1741, + "step": 7718 + }, + { + "epoch": 1.5622343655130542, + "grad_norm": 0.3103647232055664, + "learning_rate": 2.2787940945377604e-05, + "loss": 0.2263, + "step": 7719 + }, + { + "epoch": 1.5624367536935844, + "grad_norm": 0.26702842116355896, + "learning_rate": 2.2767732166956834e-05, + "loss": 0.1878, + "step": 7720 + }, + { + "epoch": 1.5626391418741146, + "grad_norm": 0.24968096613883972, + "learning_rate": 2.2747531202086537e-05, + "loss": 0.1762, + "step": 7721 + }, + { + "epoch": 1.5628415300546448, + "grad_norm": 0.2748836278915405, + "learning_rate": 2.2727338052810433e-05, + "loss": 0.1903, + "step": 7722 + }, + { + "epoch": 1.563043918235175, + "grad_norm": 0.2877596914768219, + "learning_rate": 2.2707152721171455e-05, + "loss": 0.1981, + "step": 7723 + }, + { + "epoch": 1.5632463064157052, + "grad_norm": 0.34964072704315186, + "learning_rate": 2.2686975209211737e-05, + "loss": 0.1866, + "step": 7724 + }, + { + "epoch": 1.5634486945962356, + "grad_norm": 0.3263644874095917, + "learning_rate": 2.2666805518972633e-05, + "loss": 0.2138, + "step": 7725 + }, + { + "epoch": 1.5636510827767658, + "grad_norm": 0.28281792998313904, + "learning_rate": 2.2646643652494692e-05, + "loss": 0.2022, + "step": 7726 + }, + { + "epoch": 1.5638534709572962, + "grad_norm": 0.27802562713623047, + "learning_rate": 2.2626489611817692e-05, + "loss": 0.1902, + "step": 7727 + }, + { + "epoch": 1.5640558591378264, + "grad_norm": 0.28225937485694885, + "learning_rate": 2.26063433989806e-05, + "loss": 0.2079, + "step": 7728 + }, + { + "epoch": 1.5642582473183566, + "grad_norm": 0.2651384472846985, + "learning_rate": 2.2586205016021612e-05, + "loss": 0.1713, + "step": 7729 + }, + { + "epoch": 1.5644606354988868, + "grad_norm": 0.4168740212917328, + "learning_rate": 2.2566074464978092e-05, + "loss": 0.2111, + "step": 7730 + }, + { + "epoch": 1.564663023679417, + "grad_norm": 0.25438109040260315, + "learning_rate": 2.254595174788665e-05, + "loss": 0.1627, + "step": 7731 + }, + { + "epoch": 1.5648654118599474, + "grad_norm": 0.352674275636673, + "learning_rate": 2.25258368667831e-05, + "loss": 0.2266, + "step": 7732 + }, + { + "epoch": 1.5650678000404776, + "grad_norm": 0.2683711647987366, + "learning_rate": 2.2505729823702458e-05, + "loss": 0.1713, + "step": 7733 + }, + { + "epoch": 1.565270188221008, + "grad_norm": 0.2740427553653717, + "learning_rate": 2.2485630620678922e-05, + "loss": 0.1937, + "step": 7734 + }, + { + "epoch": 1.5654725764015383, + "grad_norm": 0.2754979431629181, + "learning_rate": 2.2465539259745937e-05, + "loss": 0.179, + "step": 7735 + }, + { + "epoch": 1.5656749645820685, + "grad_norm": 0.29458338022232056, + "learning_rate": 2.244545574293613e-05, + "loss": 0.2165, + "step": 7736 + }, + { + "epoch": 1.5658773527625987, + "grad_norm": 0.25274816155433655, + "learning_rate": 2.2425380072281332e-05, + "loss": 0.1507, + "step": 7737 + }, + { + "epoch": 1.5660797409431289, + "grad_norm": 0.26597270369529724, + "learning_rate": 2.240531224981264e-05, + "loss": 0.1598, + "step": 7738 + }, + { + "epoch": 1.566282129123659, + "grad_norm": 0.29303765296936035, + "learning_rate": 2.238525227756022e-05, + "loss": 0.2107, + "step": 7739 + }, + { + "epoch": 1.5664845173041895, + "grad_norm": 0.28918832540512085, + "learning_rate": 2.2365200157553577e-05, + "loss": 0.2219, + "step": 7740 + }, + { + "epoch": 1.5666869054847197, + "grad_norm": 0.2689541280269623, + "learning_rate": 2.2345155891821367e-05, + "loss": 0.1744, + "step": 7741 + }, + { + "epoch": 1.56688929366525, + "grad_norm": 0.3268440067768097, + "learning_rate": 2.2325119482391467e-05, + "loss": 0.2066, + "step": 7742 + }, + { + "epoch": 1.5670916818457803, + "grad_norm": 0.3139539659023285, + "learning_rate": 2.230509093129095e-05, + "loss": 0.1822, + "step": 7743 + }, + { + "epoch": 1.5672940700263105, + "grad_norm": 0.27447032928466797, + "learning_rate": 2.228507024054608e-05, + "loss": 0.1818, + "step": 7744 + }, + { + "epoch": 1.5674964582068407, + "grad_norm": 0.2501983940601349, + "learning_rate": 2.2265057412182343e-05, + "loss": 0.1698, + "step": 7745 + }, + { + "epoch": 1.567698846387371, + "grad_norm": 0.240719735622406, + "learning_rate": 2.2245052448224445e-05, + "loss": 0.1456, + "step": 7746 + }, + { + "epoch": 1.567901234567901, + "grad_norm": 0.336001992225647, + "learning_rate": 2.2225055350696267e-05, + "loss": 0.1782, + "step": 7747 + }, + { + "epoch": 1.5681036227484315, + "grad_norm": 0.28484591841697693, + "learning_rate": 2.22050661216209e-05, + "loss": 0.2143, + "step": 7748 + }, + { + "epoch": 1.5683060109289617, + "grad_norm": 0.3016730844974518, + "learning_rate": 2.2185084763020647e-05, + "loss": 0.1828, + "step": 7749 + }, + { + "epoch": 1.5685083991094921, + "grad_norm": 0.30280449986457825, + "learning_rate": 2.2165111276916994e-05, + "loss": 0.1791, + "step": 7750 + }, + { + "epoch": 1.5685083991094921, + "eval_loss": 0.2602272033691406, + "eval_runtime": 0.7375, + "eval_samples_per_second": 6.78, + "eval_steps_per_second": 1.356, + "step": 7750 + }, + { + "epoch": 1.5687107872900223, + "grad_norm": 0.27848196029663086, + "learning_rate": 2.214514566533069e-05, + "loss": 0.1732, + "step": 7751 + }, + { + "epoch": 1.5689131754705525, + "grad_norm": 0.29691216349601746, + "learning_rate": 2.2125187930281633e-05, + "loss": 0.2041, + "step": 7752 + }, + { + "epoch": 1.5691155636510827, + "grad_norm": 0.26868849992752075, + "learning_rate": 2.2105238073788937e-05, + "loss": 0.1715, + "step": 7753 + }, + { + "epoch": 1.569317951831613, + "grad_norm": 0.26445257663726807, + "learning_rate": 2.20852960978709e-05, + "loss": 0.1615, + "step": 7754 + }, + { + "epoch": 1.5695203400121431, + "grad_norm": 0.27450621128082275, + "learning_rate": 2.2065362004545053e-05, + "loss": 0.1829, + "step": 7755 + }, + { + "epoch": 1.5697227281926736, + "grad_norm": 0.2744404673576355, + "learning_rate": 2.2045435795828128e-05, + "loss": 0.1749, + "step": 7756 + }, + { + "epoch": 1.5699251163732038, + "grad_norm": 0.2578592598438263, + "learning_rate": 2.2025517473736035e-05, + "loss": 0.1653, + "step": 7757 + }, + { + "epoch": 1.5701275045537342, + "grad_norm": 0.3351926803588867, + "learning_rate": 2.2005607040283905e-05, + "loss": 0.1907, + "step": 7758 + }, + { + "epoch": 1.5703298927342644, + "grad_norm": 0.30829042196273804, + "learning_rate": 2.198570449748608e-05, + "loss": 0.1687, + "step": 7759 + }, + { + "epoch": 1.5705322809147946, + "grad_norm": 0.34289824962615967, + "learning_rate": 2.19658098473561e-05, + "loss": 0.2137, + "step": 7760 + }, + { + "epoch": 1.5707346690953248, + "grad_norm": 0.24221114814281464, + "learning_rate": 2.194592309190665e-05, + "loss": 0.1739, + "step": 7761 + }, + { + "epoch": 1.570937057275855, + "grad_norm": 0.2792688012123108, + "learning_rate": 2.1926044233149678e-05, + "loss": 0.207, + "step": 7762 + }, + { + "epoch": 1.5711394454563854, + "grad_norm": 0.25298550724983215, + "learning_rate": 2.190617327309634e-05, + "loss": 0.1634, + "step": 7763 + }, + { + "epoch": 1.5713418336369156, + "grad_norm": 0.27479955554008484, + "learning_rate": 2.1886310213756965e-05, + "loss": 0.1746, + "step": 7764 + }, + { + "epoch": 1.571544221817446, + "grad_norm": 0.2710821330547333, + "learning_rate": 2.1866455057141078e-05, + "loss": 0.1784, + "step": 7765 + }, + { + "epoch": 1.5717466099979762, + "grad_norm": 0.3006027340888977, + "learning_rate": 2.1846607805257426e-05, + "loss": 0.1959, + "step": 7766 + }, + { + "epoch": 1.5719489981785064, + "grad_norm": 0.3054922819137573, + "learning_rate": 2.1826768460113943e-05, + "loss": 0.1932, + "step": 7767 + }, + { + "epoch": 1.5721513863590366, + "grad_norm": 0.3049944043159485, + "learning_rate": 2.1806937023717767e-05, + "loss": 0.2066, + "step": 7768 + }, + { + "epoch": 1.5723537745395668, + "grad_norm": 0.2929500639438629, + "learning_rate": 2.178711349807523e-05, + "loss": 0.1834, + "step": 7769 + }, + { + "epoch": 1.572556162720097, + "grad_norm": 0.2628859877586365, + "learning_rate": 2.1767297885191862e-05, + "loss": 0.1958, + "step": 7770 + }, + { + "epoch": 1.5727585509006274, + "grad_norm": 0.30760657787323, + "learning_rate": 2.1747490187072418e-05, + "loss": 0.1956, + "step": 7771 + }, + { + "epoch": 1.5729609390811576, + "grad_norm": 0.2853303551673889, + "learning_rate": 2.1727690405720814e-05, + "loss": 0.1978, + "step": 7772 + }, + { + "epoch": 1.573163327261688, + "grad_norm": 0.2791989743709564, + "learning_rate": 2.1707898543140203e-05, + "loss": 0.1951, + "step": 7773 + }, + { + "epoch": 1.5733657154422183, + "grad_norm": 0.3224547207355499, + "learning_rate": 2.16881146013329e-05, + "loss": 0.1915, + "step": 7774 + }, + { + "epoch": 1.5735681036227485, + "grad_norm": 0.23901736736297607, + "learning_rate": 2.166833858230045e-05, + "loss": 0.1616, + "step": 7775 + }, + { + "epoch": 1.5737704918032787, + "grad_norm": 0.3068159520626068, + "learning_rate": 2.1648570488043575e-05, + "loss": 0.1809, + "step": 7776 + }, + { + "epoch": 1.5739728799838089, + "grad_norm": 0.2887951135635376, + "learning_rate": 2.162881032056221e-05, + "loss": 0.2189, + "step": 7777 + }, + { + "epoch": 1.574175268164339, + "grad_norm": 0.27136558294296265, + "learning_rate": 2.160905808185547e-05, + "loss": 0.194, + "step": 7778 + }, + { + "epoch": 1.5743776563448695, + "grad_norm": 0.26406726241111755, + "learning_rate": 2.1589313773921684e-05, + "loss": 0.1766, + "step": 7779 + }, + { + "epoch": 1.5745800445253997, + "grad_norm": 0.369335800409317, + "learning_rate": 2.156957739875838e-05, + "loss": 0.2335, + "step": 7780 + }, + { + "epoch": 1.57478243270593, + "grad_norm": 0.2897554934024811, + "learning_rate": 2.154984895836227e-05, + "loss": 0.1566, + "step": 7781 + }, + { + "epoch": 1.5749848208864603, + "grad_norm": 0.26279720664024353, + "learning_rate": 2.1530128454729315e-05, + "loss": 0.1961, + "step": 7782 + }, + { + "epoch": 1.5751872090669905, + "grad_norm": 0.32653093338012695, + "learning_rate": 2.1510415889854553e-05, + "loss": 0.2299, + "step": 7783 + }, + { + "epoch": 1.5753895972475207, + "grad_norm": 0.34345322847366333, + "learning_rate": 2.1490711265732332e-05, + "loss": 0.215, + "step": 7784 + }, + { + "epoch": 1.575591985428051, + "grad_norm": 0.30096638202667236, + "learning_rate": 2.147101458435615e-05, + "loss": 0.1803, + "step": 7785 + }, + { + "epoch": 1.575794373608581, + "grad_norm": 0.25723904371261597, + "learning_rate": 2.1451325847718716e-05, + "loss": 0.1529, + "step": 7786 + }, + { + "epoch": 1.5759967617891115, + "grad_norm": 0.30034515261650085, + "learning_rate": 2.1431645057811943e-05, + "loss": 0.1939, + "step": 7787 + }, + { + "epoch": 1.5761991499696417, + "grad_norm": 0.33007681369781494, + "learning_rate": 2.141197221662691e-05, + "loss": 0.216, + "step": 7788 + }, + { + "epoch": 1.5764015381501721, + "grad_norm": 0.2949335277080536, + "learning_rate": 2.1392307326153903e-05, + "loss": 0.2043, + "step": 7789 + }, + { + "epoch": 1.5766039263307023, + "grad_norm": 0.24349358677864075, + "learning_rate": 2.137265038838243e-05, + "loss": 0.158, + "step": 7790 + }, + { + "epoch": 1.5768063145112325, + "grad_norm": 0.2552421987056732, + "learning_rate": 2.1353001405301155e-05, + "loss": 0.1746, + "step": 7791 + }, + { + "epoch": 1.5770087026917627, + "grad_norm": 0.2736150324344635, + "learning_rate": 2.133336037889797e-05, + "loss": 0.1802, + "step": 7792 + }, + { + "epoch": 1.577211090872293, + "grad_norm": 0.31855508685112, + "learning_rate": 2.1313727311159948e-05, + "loss": 0.2008, + "step": 7793 + }, + { + "epoch": 1.5774134790528234, + "grad_norm": 0.2970321476459503, + "learning_rate": 2.129410220407334e-05, + "loss": 0.2028, + "step": 7794 + }, + { + "epoch": 1.5776158672333536, + "grad_norm": 0.25379669666290283, + "learning_rate": 2.127448505962363e-05, + "loss": 0.189, + "step": 7795 + }, + { + "epoch": 1.577818255413884, + "grad_norm": 0.25099897384643555, + "learning_rate": 2.1254875879795454e-05, + "loss": 0.1859, + "step": 7796 + }, + { + "epoch": 1.5780206435944142, + "grad_norm": 0.32417258620262146, + "learning_rate": 2.123527466657268e-05, + "loss": 0.2089, + "step": 7797 + }, + { + "epoch": 1.5782230317749444, + "grad_norm": 0.33070147037506104, + "learning_rate": 2.1215681421938338e-05, + "loss": 0.1779, + "step": 7798 + }, + { + "epoch": 1.5784254199554746, + "grad_norm": 0.312674343585968, + "learning_rate": 2.1196096147874677e-05, + "loss": 0.2029, + "step": 7799 + }, + { + "epoch": 1.5786278081360048, + "grad_norm": 0.33172622323036194, + "learning_rate": 2.1176518846363136e-05, + "loss": 0.2064, + "step": 7800 + }, + { + "epoch": 1.5786278081360048, + "eval_loss": 0.25997477769851685, + "eval_runtime": 0.741, + "eval_samples_per_second": 6.748, + "eval_steps_per_second": 1.35, + "step": 7800 + }, + { + "epoch": 1.578830196316535, + "grad_norm": 0.34337952733039856, + "learning_rate": 2.1156949519384328e-05, + "loss": 0.2133, + "step": 7801 + }, + { + "epoch": 1.5790325844970654, + "grad_norm": 0.27992233633995056, + "learning_rate": 2.113738816891808e-05, + "loss": 0.1891, + "step": 7802 + }, + { + "epoch": 1.5792349726775956, + "grad_norm": 0.35382652282714844, + "learning_rate": 2.1117834796943392e-05, + "loss": 0.1931, + "step": 7803 + }, + { + "epoch": 1.579437360858126, + "grad_norm": 0.24651670455932617, + "learning_rate": 2.1098289405438487e-05, + "loss": 0.1603, + "step": 7804 + }, + { + "epoch": 1.5796397490386562, + "grad_norm": 0.28411322832107544, + "learning_rate": 2.107875199638075e-05, + "loss": 0.1937, + "step": 7805 + }, + { + "epoch": 1.5798421372191864, + "grad_norm": 0.26361873745918274, + "learning_rate": 2.1059222571746785e-05, + "loss": 0.1676, + "step": 7806 + }, + { + "epoch": 1.5800445253997166, + "grad_norm": 0.2438620775938034, + "learning_rate": 2.1039701133512346e-05, + "loss": 0.1646, + "step": 7807 + }, + { + "epoch": 1.5802469135802468, + "grad_norm": 0.27672767639160156, + "learning_rate": 2.102018768365244e-05, + "loss": 0.1771, + "step": 7808 + }, + { + "epoch": 1.580449301760777, + "grad_norm": 0.2786938548088074, + "learning_rate": 2.100068222414121e-05, + "loss": 0.1899, + "step": 7809 + }, + { + "epoch": 1.5806516899413074, + "grad_norm": 0.2908424139022827, + "learning_rate": 2.098118475695202e-05, + "loss": 0.1776, + "step": 7810 + }, + { + "epoch": 1.5808540781218376, + "grad_norm": 0.2723267078399658, + "learning_rate": 2.0961695284057438e-05, + "loss": 0.185, + "step": 7811 + }, + { + "epoch": 1.581056466302368, + "grad_norm": 0.30924347043037415, + "learning_rate": 2.0942213807429166e-05, + "loss": 0.2106, + "step": 7812 + }, + { + "epoch": 1.5812588544828983, + "grad_norm": 0.2639710605144501, + "learning_rate": 2.092274032903817e-05, + "loss": 0.1744, + "step": 7813 + }, + { + "epoch": 1.5814612426634285, + "grad_norm": 0.28179532289505005, + "learning_rate": 2.090327485085456e-05, + "loss": 0.1761, + "step": 7814 + }, + { + "epoch": 1.5816636308439587, + "grad_norm": 0.26997724175453186, + "learning_rate": 2.0883817374847646e-05, + "loss": 0.1628, + "step": 7815 + }, + { + "epoch": 1.5818660190244889, + "grad_norm": 0.26997148990631104, + "learning_rate": 2.0864367902985927e-05, + "loss": 0.197, + "step": 7816 + }, + { + "epoch": 1.582068407205019, + "grad_norm": 0.3067236542701721, + "learning_rate": 2.0844926437237112e-05, + "loss": 0.2307, + "step": 7817 + }, + { + "epoch": 1.5822707953855495, + "grad_norm": 0.3065381944179535, + "learning_rate": 2.082549297956806e-05, + "loss": 0.2012, + "step": 7818 + }, + { + "epoch": 1.5824731835660797, + "grad_norm": 0.28395718336105347, + "learning_rate": 2.0806067531944874e-05, + "loss": 0.1967, + "step": 7819 + }, + { + "epoch": 1.58267557174661, + "grad_norm": 0.27498477697372437, + "learning_rate": 2.0786650096332805e-05, + "loss": 0.1584, + "step": 7820 + }, + { + "epoch": 1.5828779599271403, + "grad_norm": 0.3125120997428894, + "learning_rate": 2.0767240674696297e-05, + "loss": 0.1938, + "step": 7821 + }, + { + "epoch": 1.5830803481076705, + "grad_norm": 0.3111341595649719, + "learning_rate": 2.0747839268998994e-05, + "loss": 0.1993, + "step": 7822 + }, + { + "epoch": 1.5832827362882007, + "grad_norm": 0.28946933150291443, + "learning_rate": 2.072844588120374e-05, + "loss": 0.1821, + "step": 7823 + }, + { + "epoch": 1.583485124468731, + "grad_norm": 0.25271058082580566, + "learning_rate": 2.070906051327254e-05, + "loss": 0.1729, + "step": 7824 + }, + { + "epoch": 1.5836875126492613, + "grad_norm": 0.293929785490036, + "learning_rate": 2.0689683167166597e-05, + "loss": 0.178, + "step": 7825 + }, + { + "epoch": 1.5838899008297915, + "grad_norm": 0.36047351360321045, + "learning_rate": 2.0670313844846335e-05, + "loss": 0.179, + "step": 7826 + }, + { + "epoch": 1.584092289010322, + "grad_norm": 0.2902386486530304, + "learning_rate": 2.065095254827133e-05, + "loss": 0.2103, + "step": 7827 + }, + { + "epoch": 1.5842946771908522, + "grad_norm": 0.26246702671051025, + "learning_rate": 2.0631599279400328e-05, + "loss": 0.1896, + "step": 7828 + }, + { + "epoch": 1.5844970653713824, + "grad_norm": 0.2978105843067169, + "learning_rate": 2.0612254040191314e-05, + "loss": 0.207, + "step": 7829 + }, + { + "epoch": 1.5846994535519126, + "grad_norm": 0.2901962995529175, + "learning_rate": 2.0592916832601428e-05, + "loss": 0.1672, + "step": 7830 + }, + { + "epoch": 1.5849018417324428, + "grad_norm": 0.29052335023880005, + "learning_rate": 2.0573587658587002e-05, + "loss": 0.1933, + "step": 7831 + }, + { + "epoch": 1.585104229912973, + "grad_norm": 0.30138322710990906, + "learning_rate": 2.055426652010356e-05, + "loss": 0.2104, + "step": 7832 + }, + { + "epoch": 1.5853066180935034, + "grad_norm": 0.29424503445625305, + "learning_rate": 2.0534953419105828e-05, + "loss": 0.2047, + "step": 7833 + }, + { + "epoch": 1.5855090062740336, + "grad_norm": 0.25280043482780457, + "learning_rate": 2.051564835754769e-05, + "loss": 0.1862, + "step": 7834 + }, + { + "epoch": 1.585711394454564, + "grad_norm": 0.21730902791023254, + "learning_rate": 2.0496351337382224e-05, + "loss": 0.1382, + "step": 7835 + }, + { + "epoch": 1.5859137826350942, + "grad_norm": 0.26311731338500977, + "learning_rate": 2.0477062360561716e-05, + "loss": 0.1818, + "step": 7836 + }, + { + "epoch": 1.5861161708156244, + "grad_norm": 0.30090418457984924, + "learning_rate": 2.0457781429037604e-05, + "loss": 0.2145, + "step": 7837 + }, + { + "epoch": 1.5863185589961546, + "grad_norm": 0.2743259072303772, + "learning_rate": 2.043850854476055e-05, + "loss": 0.1798, + "step": 7838 + }, + { + "epoch": 1.5865209471766848, + "grad_norm": 0.2695198059082031, + "learning_rate": 2.041924370968037e-05, + "loss": 0.1826, + "step": 7839 + }, + { + "epoch": 1.586723335357215, + "grad_norm": 0.25866490602493286, + "learning_rate": 2.0399986925746072e-05, + "loss": 0.1824, + "step": 7840 + }, + { + "epoch": 1.5869257235377454, + "grad_norm": 0.28438708186149597, + "learning_rate": 2.038073819490587e-05, + "loss": 0.2106, + "step": 7841 + }, + { + "epoch": 1.5871281117182756, + "grad_norm": 0.29071030020713806, + "learning_rate": 2.0361497519107144e-05, + "loss": 0.1937, + "step": 7842 + }, + { + "epoch": 1.587330499898806, + "grad_norm": 0.257099986076355, + "learning_rate": 2.034226490029646e-05, + "loss": 0.1674, + "step": 7843 + }, + { + "epoch": 1.5875328880793362, + "grad_norm": 0.31228503584861755, + "learning_rate": 2.0323040340419575e-05, + "loss": 0.1575, + "step": 7844 + }, + { + "epoch": 1.5877352762598664, + "grad_norm": 0.2626998722553253, + "learning_rate": 2.030382384142142e-05, + "loss": 0.1673, + "step": 7845 + }, + { + "epoch": 1.5879376644403966, + "grad_norm": 0.25823095440864563, + "learning_rate": 2.0284615405246132e-05, + "loss": 0.1646, + "step": 7846 + }, + { + "epoch": 1.5881400526209268, + "grad_norm": 0.3161360025405884, + "learning_rate": 2.026541503383702e-05, + "loss": 0.2215, + "step": 7847 + }, + { + "epoch": 1.588342440801457, + "grad_norm": 0.27048152685165405, + "learning_rate": 2.0246222729136565e-05, + "loss": 0.1853, + "step": 7848 + }, + { + "epoch": 1.5885448289819875, + "grad_norm": 0.3523916006088257, + "learning_rate": 2.022703849308645e-05, + "loss": 0.1951, + "step": 7849 + }, + { + "epoch": 1.5887472171625177, + "grad_norm": 0.2627606987953186, + "learning_rate": 2.0207862327627526e-05, + "loss": 0.1871, + "step": 7850 + }, + { + "epoch": 1.5887472171625177, + "eval_loss": 0.2618047893047333, + "eval_runtime": 0.7404, + "eval_samples_per_second": 6.753, + "eval_steps_per_second": 1.351, + "step": 7850 + }, + { + "epoch": 1.588949605343048, + "grad_norm": 0.33208194375038147, + "learning_rate": 2.0188694234699835e-05, + "loss": 0.1907, + "step": 7851 + }, + { + "epoch": 1.5891519935235783, + "grad_norm": 0.32101473212242126, + "learning_rate": 2.0169534216242626e-05, + "loss": 0.196, + "step": 7852 + }, + { + "epoch": 1.5893543817041085, + "grad_norm": 0.3157752454280853, + "learning_rate": 2.015038227419428e-05, + "loss": 0.1779, + "step": 7853 + }, + { + "epoch": 1.5895567698846387, + "grad_norm": 0.32359275221824646, + "learning_rate": 2.0131238410492416e-05, + "loss": 0.1995, + "step": 7854 + }, + { + "epoch": 1.5897591580651689, + "grad_norm": 0.2650693356990814, + "learning_rate": 2.011210262707379e-05, + "loss": 0.1822, + "step": 7855 + }, + { + "epoch": 1.5899615462456993, + "grad_norm": 0.24113725125789642, + "learning_rate": 2.0092974925874365e-05, + "loss": 0.1699, + "step": 7856 + }, + { + "epoch": 1.5901639344262295, + "grad_norm": 0.2838568687438965, + "learning_rate": 2.007385530882928e-05, + "loss": 0.1952, + "step": 7857 + }, + { + "epoch": 1.59036632260676, + "grad_norm": 0.2544390857219696, + "learning_rate": 2.0054743777872864e-05, + "loss": 0.148, + "step": 7858 + }, + { + "epoch": 1.5905687107872901, + "grad_norm": 0.2745042145252228, + "learning_rate": 2.003564033493862e-05, + "loss": 0.1686, + "step": 7859 + }, + { + "epoch": 1.5907710989678203, + "grad_norm": 0.29645898938179016, + "learning_rate": 2.001654498195922e-05, + "loss": 0.2024, + "step": 7860 + }, + { + "epoch": 1.5909734871483505, + "grad_norm": 0.2774428129196167, + "learning_rate": 1.999745772086655e-05, + "loss": 0.1713, + "step": 7861 + }, + { + "epoch": 1.5911758753288807, + "grad_norm": 0.2696341872215271, + "learning_rate": 1.997837855359165e-05, + "loss": 0.1901, + "step": 7862 + }, + { + "epoch": 1.591378263509411, + "grad_norm": 0.26396724581718445, + "learning_rate": 1.995930748206475e-05, + "loss": 0.177, + "step": 7863 + }, + { + "epoch": 1.5915806516899413, + "grad_norm": 0.2746181786060333, + "learning_rate": 1.9940244508215255e-05, + "loss": 0.1755, + "step": 7864 + }, + { + "epoch": 1.5917830398704715, + "grad_norm": 0.22371596097946167, + "learning_rate": 1.9921189633971772e-05, + "loss": 0.1732, + "step": 7865 + }, + { + "epoch": 1.591985428051002, + "grad_norm": 0.25550487637519836, + "learning_rate": 1.9902142861262063e-05, + "loss": 0.1551, + "step": 7866 + }, + { + "epoch": 1.5921878162315322, + "grad_norm": 0.27655959129333496, + "learning_rate": 1.988310419201308e-05, + "loss": 0.206, + "step": 7867 + }, + { + "epoch": 1.5923902044120624, + "grad_norm": 0.2964775860309601, + "learning_rate": 1.9864073628150958e-05, + "loss": 0.1677, + "step": 7868 + }, + { + "epoch": 1.5925925925925926, + "grad_norm": 0.2599323093891144, + "learning_rate": 1.9845051171601005e-05, + "loss": 0.1946, + "step": 7869 + }, + { + "epoch": 1.5927949807731228, + "grad_norm": 0.27363482117652893, + "learning_rate": 1.982603682428772e-05, + "loss": 0.211, + "step": 7870 + }, + { + "epoch": 1.592997368953653, + "grad_norm": 0.322819322347641, + "learning_rate": 1.98070305881348e-05, + "loss": 0.2174, + "step": 7871 + }, + { + "epoch": 1.5931997571341834, + "grad_norm": 0.3303944170475006, + "learning_rate": 1.9788032465065054e-05, + "loss": 0.1916, + "step": 7872 + }, + { + "epoch": 1.5934021453147136, + "grad_norm": 0.24966974556446075, + "learning_rate": 1.976904245700052e-05, + "loss": 0.1766, + "step": 7873 + }, + { + "epoch": 1.593604533495244, + "grad_norm": 0.27133068442344666, + "learning_rate": 1.9750060565862417e-05, + "loss": 0.1797, + "step": 7874 + }, + { + "epoch": 1.5938069216757742, + "grad_norm": 0.3636033535003662, + "learning_rate": 1.973108679357113e-05, + "loss": 0.2262, + "step": 7875 + }, + { + "epoch": 1.5940093098563044, + "grad_norm": 0.3111708164215088, + "learning_rate": 1.9712121142046237e-05, + "loss": 0.1893, + "step": 7876 + }, + { + "epoch": 1.5942116980368346, + "grad_norm": 0.3064495325088501, + "learning_rate": 1.969316361320647e-05, + "loss": 0.1886, + "step": 7877 + }, + { + "epoch": 1.5944140862173648, + "grad_norm": 0.2953830361366272, + "learning_rate": 1.9674214208969754e-05, + "loss": 0.2083, + "step": 7878 + }, + { + "epoch": 1.594616474397895, + "grad_norm": 0.29661932587623596, + "learning_rate": 1.9655272931253197e-05, + "loss": 0.198, + "step": 7879 + }, + { + "epoch": 1.5948188625784254, + "grad_norm": 0.3036056458950043, + "learning_rate": 1.963633978197308e-05, + "loss": 0.2531, + "step": 7880 + }, + { + "epoch": 1.5950212507589556, + "grad_norm": 0.24966773390769958, + "learning_rate": 1.961741476304486e-05, + "loss": 0.1692, + "step": 7881 + }, + { + "epoch": 1.595223638939486, + "grad_norm": 0.2774880528450012, + "learning_rate": 1.959849787638317e-05, + "loss": 0.2304, + "step": 7882 + }, + { + "epoch": 1.5954260271200162, + "grad_norm": 0.2703644931316376, + "learning_rate": 1.957958912390182e-05, + "loss": 0.207, + "step": 7883 + }, + { + "epoch": 1.5956284153005464, + "grad_norm": 0.308747798204422, + "learning_rate": 1.956068850751379e-05, + "loss": 0.2115, + "step": 7884 + }, + { + "epoch": 1.5958308034810766, + "grad_norm": 0.2647557854652405, + "learning_rate": 1.9541796029131278e-05, + "loss": 0.1739, + "step": 7885 + }, + { + "epoch": 1.5960331916616068, + "grad_norm": 0.3066260814666748, + "learning_rate": 1.9522911690665592e-05, + "loss": 0.2116, + "step": 7886 + }, + { + "epoch": 1.5962355798421373, + "grad_norm": 0.2615651488304138, + "learning_rate": 1.950403549402726e-05, + "loss": 0.1891, + "step": 7887 + }, + { + "epoch": 1.5964379680226675, + "grad_norm": 0.28497937321662903, + "learning_rate": 1.9485167441125995e-05, + "loss": 0.1681, + "step": 7888 + }, + { + "epoch": 1.5966403562031979, + "grad_norm": 0.2624446153640747, + "learning_rate": 1.9466307533870643e-05, + "loss": 0.1786, + "step": 7889 + }, + { + "epoch": 1.596842744383728, + "grad_norm": 0.31461378931999207, + "learning_rate": 1.9447455774169276e-05, + "loss": 0.1967, + "step": 7890 + }, + { + "epoch": 1.5970451325642583, + "grad_norm": 0.31892645359039307, + "learning_rate": 1.9428612163929093e-05, + "loss": 0.1966, + "step": 7891 + }, + { + "epoch": 1.5972475207447885, + "grad_norm": 0.27543261647224426, + "learning_rate": 1.9409776705056516e-05, + "loss": 0.1968, + "step": 7892 + }, + { + "epoch": 1.5974499089253187, + "grad_norm": 0.2699550986289978, + "learning_rate": 1.9390949399457104e-05, + "loss": 0.2061, + "step": 7893 + }, + { + "epoch": 1.5976522971058489, + "grad_norm": 0.269255667924881, + "learning_rate": 1.9372130249035638e-05, + "loss": 0.2118, + "step": 7894 + }, + { + "epoch": 1.5978546852863793, + "grad_norm": 0.26168566942214966, + "learning_rate": 1.935331925569599e-05, + "loss": 0.1675, + "step": 7895 + }, + { + "epoch": 1.5980570734669095, + "grad_norm": 0.25531288981437683, + "learning_rate": 1.9334516421341276e-05, + "loss": 0.1679, + "step": 7896 + }, + { + "epoch": 1.59825946164744, + "grad_norm": 0.2488667219877243, + "learning_rate": 1.931572174787378e-05, + "loss": 0.1667, + "step": 7897 + }, + { + "epoch": 1.5984618498279701, + "grad_norm": 0.2918820381164551, + "learning_rate": 1.929693523719496e-05, + "loss": 0.2051, + "step": 7898 + }, + { + "epoch": 1.5986642380085003, + "grad_norm": 0.3507806360721588, + "learning_rate": 1.927815689120541e-05, + "loss": 0.2313, + "step": 7899 + }, + { + "epoch": 1.5988666261890305, + "grad_norm": 0.26194295287132263, + "learning_rate": 1.925938671180495e-05, + "loss": 0.2001, + "step": 7900 + }, + { + "epoch": 1.5988666261890305, + "eval_loss": 0.2608179450035095, + "eval_runtime": 0.7369, + "eval_samples_per_second": 6.785, + "eval_steps_per_second": 1.357, + "step": 7900 + }, + { + "epoch": 1.5990690143695607, + "grad_norm": 0.29185500741004944, + "learning_rate": 1.924062470089253e-05, + "loss": 0.1777, + "step": 7901 + }, + { + "epoch": 1.599271402550091, + "grad_norm": 0.30840426683425903, + "learning_rate": 1.922187086036632e-05, + "loss": 0.2055, + "step": 7902 + }, + { + "epoch": 1.5994737907306213, + "grad_norm": 0.2773885428905487, + "learning_rate": 1.9203125192123584e-05, + "loss": 0.203, + "step": 7903 + }, + { + "epoch": 1.5996761789111515, + "grad_norm": 0.2688390910625458, + "learning_rate": 1.918438769806088e-05, + "loss": 0.1963, + "step": 7904 + }, + { + "epoch": 1.599878567091682, + "grad_norm": 0.3111615478992462, + "learning_rate": 1.9165658380073838e-05, + "loss": 0.1637, + "step": 7905 + }, + { + "epoch": 1.6000809552722122, + "grad_norm": 0.2527560293674469, + "learning_rate": 1.9146937240057295e-05, + "loss": 0.1517, + "step": 7906 + }, + { + "epoch": 1.6002833434527424, + "grad_norm": 0.3315982222557068, + "learning_rate": 1.912822427990526e-05, + "loss": 0.1985, + "step": 7907 + }, + { + "epoch": 1.6004857316332726, + "grad_norm": 0.3067420423030853, + "learning_rate": 1.9109519501510907e-05, + "loss": 0.198, + "step": 7908 + }, + { + "epoch": 1.6006881198138028, + "grad_norm": 0.2605978846549988, + "learning_rate": 1.9090822906766616e-05, + "loss": 0.1716, + "step": 7909 + }, + { + "epoch": 1.600890507994333, + "grad_norm": 0.25154924392700195, + "learning_rate": 1.9072134497563877e-05, + "loss": 0.1828, + "step": 7910 + }, + { + "epoch": 1.6010928961748634, + "grad_norm": 0.32552117109298706, + "learning_rate": 1.9053454275793403e-05, + "loss": 0.1962, + "step": 7911 + }, + { + "epoch": 1.6012952843553938, + "grad_norm": 0.25949302315711975, + "learning_rate": 1.903478224334507e-05, + "loss": 0.1519, + "step": 7912 + }, + { + "epoch": 1.601497672535924, + "grad_norm": 0.2832423746585846, + "learning_rate": 1.9016118402107907e-05, + "loss": 0.1901, + "step": 7913 + }, + { + "epoch": 1.6017000607164542, + "grad_norm": 0.30006927251815796, + "learning_rate": 1.899746275397014e-05, + "loss": 0.2194, + "step": 7914 + }, + { + "epoch": 1.6019024488969844, + "grad_norm": 0.2572685480117798, + "learning_rate": 1.897881530081913e-05, + "loss": 0.1503, + "step": 7915 + }, + { + "epoch": 1.6021048370775146, + "grad_norm": 0.25450223684310913, + "learning_rate": 1.8960176044541468e-05, + "loss": 0.1649, + "step": 7916 + }, + { + "epoch": 1.6023072252580448, + "grad_norm": 0.25192636251449585, + "learning_rate": 1.894154498702283e-05, + "loss": 0.1719, + "step": 7917 + }, + { + "epoch": 1.6025096134385752, + "grad_norm": 0.2704130709171295, + "learning_rate": 1.8922922130148135e-05, + "loss": 0.2517, + "step": 7918 + }, + { + "epoch": 1.6027120016191054, + "grad_norm": 0.24531984329223633, + "learning_rate": 1.8904307475801453e-05, + "loss": 0.1886, + "step": 7919 + }, + { + "epoch": 1.6029143897996359, + "grad_norm": 0.25487950444221497, + "learning_rate": 1.8885701025865998e-05, + "loss": 0.1867, + "step": 7920 + }, + { + "epoch": 1.603116777980166, + "grad_norm": 0.2579203248023987, + "learning_rate": 1.88671027822242e-05, + "loss": 0.1704, + "step": 7921 + }, + { + "epoch": 1.6033191661606963, + "grad_norm": 0.2408556193113327, + "learning_rate": 1.884851274675763e-05, + "loss": 0.1505, + "step": 7922 + }, + { + "epoch": 1.6035215543412265, + "grad_norm": 0.3046492040157318, + "learning_rate": 1.8829930921347016e-05, + "loss": 0.1735, + "step": 7923 + }, + { + "epoch": 1.6037239425217567, + "grad_norm": 0.272602915763855, + "learning_rate": 1.8811357307872292e-05, + "loss": 0.1881, + "step": 7924 + }, + { + "epoch": 1.6039263307022869, + "grad_norm": 0.2787960469722748, + "learning_rate": 1.8792791908212527e-05, + "loss": 0.1472, + "step": 7925 + }, + { + "epoch": 1.6041287188828173, + "grad_norm": 0.26666730642318726, + "learning_rate": 1.8774234724245977e-05, + "loss": 0.1689, + "step": 7926 + }, + { + "epoch": 1.6043311070633475, + "grad_norm": 0.2414676994085312, + "learning_rate": 1.875568575785007e-05, + "loss": 0.1747, + "step": 7927 + }, + { + "epoch": 1.604533495243878, + "grad_norm": 0.2558421492576599, + "learning_rate": 1.8737145010901392e-05, + "loss": 0.1747, + "step": 7928 + }, + { + "epoch": 1.604735883424408, + "grad_norm": 0.2758699357509613, + "learning_rate": 1.87186124852757e-05, + "loss": 0.2173, + "step": 7929 + }, + { + "epoch": 1.6049382716049383, + "grad_norm": 0.26736441254615784, + "learning_rate": 1.870008818284792e-05, + "loss": 0.1528, + "step": 7930 + }, + { + "epoch": 1.6051406597854685, + "grad_norm": 0.310761034488678, + "learning_rate": 1.868157210549215e-05, + "loss": 0.2001, + "step": 7931 + }, + { + "epoch": 1.6053430479659987, + "grad_norm": 0.299629807472229, + "learning_rate": 1.866306425508164e-05, + "loss": 0.192, + "step": 7932 + }, + { + "epoch": 1.605545436146529, + "grad_norm": 0.29254329204559326, + "learning_rate": 1.8644564633488836e-05, + "loss": 0.1932, + "step": 7933 + }, + { + "epoch": 1.6057478243270593, + "grad_norm": 0.2738923132419586, + "learning_rate": 1.862607324258534e-05, + "loss": 0.213, + "step": 7934 + }, + { + "epoch": 1.6059502125075895, + "grad_norm": 0.26249390840530396, + "learning_rate": 1.860759008424189e-05, + "loss": 0.1909, + "step": 7935 + }, + { + "epoch": 1.60615260068812, + "grad_norm": 0.3084731996059418, + "learning_rate": 1.858911516032844e-05, + "loss": 0.2246, + "step": 7936 + }, + { + "epoch": 1.6063549888686501, + "grad_norm": 0.2792483866214752, + "learning_rate": 1.857064847271409e-05, + "loss": 0.1892, + "step": 7937 + }, + { + "epoch": 1.6065573770491803, + "grad_norm": 0.30546650290489197, + "learning_rate": 1.8552190023267112e-05, + "loss": 0.1726, + "step": 7938 + }, + { + "epoch": 1.6067597652297105, + "grad_norm": 0.2785407304763794, + "learning_rate": 1.8533739813854912e-05, + "loss": 0.1678, + "step": 7939 + }, + { + "epoch": 1.6069621534102407, + "grad_norm": 0.3166220486164093, + "learning_rate": 1.8515297846344093e-05, + "loss": 0.2157, + "step": 7940 + }, + { + "epoch": 1.607164541590771, + "grad_norm": 0.30899468064308167, + "learning_rate": 1.8496864122600434e-05, + "loss": 0.1936, + "step": 7941 + }, + { + "epoch": 1.6073669297713014, + "grad_norm": 0.25260448455810547, + "learning_rate": 1.847843864448886e-05, + "loss": 0.1749, + "step": 7942 + }, + { + "epoch": 1.6075693179518318, + "grad_norm": 0.2828786373138428, + "learning_rate": 1.846002141387346e-05, + "loss": 0.1664, + "step": 7943 + }, + { + "epoch": 1.607771706132362, + "grad_norm": 0.2762792110443115, + "learning_rate": 1.8441612432617517e-05, + "loss": 0.1799, + "step": 7944 + }, + { + "epoch": 1.6079740943128922, + "grad_norm": 0.2897892892360687, + "learning_rate": 1.8423211702583442e-05, + "loss": 0.2078, + "step": 7945 + }, + { + "epoch": 1.6081764824934224, + "grad_norm": 0.2666051685810089, + "learning_rate": 1.840481922563283e-05, + "loss": 0.1653, + "step": 7946 + }, + { + "epoch": 1.6083788706739526, + "grad_norm": 0.2855166792869568, + "learning_rate": 1.8386435003626436e-05, + "loss": 0.1844, + "step": 7947 + }, + { + "epoch": 1.6085812588544828, + "grad_norm": 0.22759123146533966, + "learning_rate": 1.8368059038424192e-05, + "loss": 0.1561, + "step": 7948 + }, + { + "epoch": 1.6087836470350132, + "grad_norm": 0.2589799761772156, + "learning_rate": 1.8349691331885178e-05, + "loss": 0.1857, + "step": 7949 + }, + { + "epoch": 1.6089860352155434, + "grad_norm": 0.3158065974712372, + "learning_rate": 1.8331331885867643e-05, + "loss": 0.2109, + "step": 7950 + }, + { + "epoch": 1.6089860352155434, + "eval_loss": 0.2593998312950134, + "eval_runtime": 0.7395, + "eval_samples_per_second": 6.762, + "eval_steps_per_second": 1.352, + "step": 7950 + }, + { + "epoch": 1.6091884233960738, + "grad_norm": 0.3034374415874481, + "learning_rate": 1.831298070222902e-05, + "loss": 0.205, + "step": 7951 + }, + { + "epoch": 1.609390811576604, + "grad_norm": 0.271475225687027, + "learning_rate": 1.8294637782825875e-05, + "loss": 0.1823, + "step": 7952 + }, + { + "epoch": 1.6095931997571342, + "grad_norm": 0.3029545247554779, + "learning_rate": 1.827630312951395e-05, + "loss": 0.1872, + "step": 7953 + }, + { + "epoch": 1.6097955879376644, + "grad_norm": 0.2891842722892761, + "learning_rate": 1.8257976744148153e-05, + "loss": 0.1942, + "step": 7954 + }, + { + "epoch": 1.6099979761181946, + "grad_norm": 0.30320534110069275, + "learning_rate": 1.8239658628582567e-05, + "loss": 0.2053, + "step": 7955 + }, + { + "epoch": 1.6102003642987248, + "grad_norm": 0.2838584780693054, + "learning_rate": 1.822134878467041e-05, + "loss": 0.1848, + "step": 7956 + }, + { + "epoch": 1.6104027524792552, + "grad_norm": 0.3041428327560425, + "learning_rate": 1.8203047214264103e-05, + "loss": 0.1858, + "step": 7957 + }, + { + "epoch": 1.6106051406597854, + "grad_norm": 0.2750832438468933, + "learning_rate": 1.818475391921518e-05, + "loss": 0.1843, + "step": 7958 + }, + { + "epoch": 1.6108075288403159, + "grad_norm": 0.3491036593914032, + "learning_rate": 1.816646890137439e-05, + "loss": 0.1402, + "step": 7959 + }, + { + "epoch": 1.611009917020846, + "grad_norm": 0.23265735805034637, + "learning_rate": 1.8148192162591605e-05, + "loss": 0.1547, + "step": 7960 + }, + { + "epoch": 1.6112123052013763, + "grad_norm": 0.26172006130218506, + "learning_rate": 1.8129923704715868e-05, + "loss": 0.1739, + "step": 7961 + }, + { + "epoch": 1.6114146933819065, + "grad_norm": 0.25250962376594543, + "learning_rate": 1.81116635295954e-05, + "loss": 0.1906, + "step": 7962 + }, + { + "epoch": 1.6116170815624367, + "grad_norm": 0.3147255480289459, + "learning_rate": 1.8093411639077572e-05, + "loss": 0.2193, + "step": 7963 + }, + { + "epoch": 1.6118194697429669, + "grad_norm": 0.2867109775543213, + "learning_rate": 1.8075168035008917e-05, + "loss": 0.2072, + "step": 7964 + }, + { + "epoch": 1.6120218579234973, + "grad_norm": 0.28083083033561707, + "learning_rate": 1.805693271923514e-05, + "loss": 0.1933, + "step": 7965 + }, + { + "epoch": 1.6122242461040275, + "grad_norm": 0.28652331233024597, + "learning_rate": 1.803870569360109e-05, + "loss": 0.1897, + "step": 7966 + }, + { + "epoch": 1.612426634284558, + "grad_norm": 0.3437660038471222, + "learning_rate": 1.8020486959950777e-05, + "loss": 0.1855, + "step": 7967 + }, + { + "epoch": 1.612629022465088, + "grad_norm": 0.35121816396713257, + "learning_rate": 1.8002276520127405e-05, + "loss": 0.2342, + "step": 7968 + }, + { + "epoch": 1.6128314106456183, + "grad_norm": 0.28837767243385315, + "learning_rate": 1.7984074375973292e-05, + "loss": 0.2186, + "step": 7969 + }, + { + "epoch": 1.6130337988261485, + "grad_norm": 0.3354552984237671, + "learning_rate": 1.796588052932996e-05, + "loss": 0.2227, + "step": 7970 + }, + { + "epoch": 1.6132361870066787, + "grad_norm": 0.3056231439113617, + "learning_rate": 1.7947694982038054e-05, + "loss": 0.214, + "step": 7971 + }, + { + "epoch": 1.613438575187209, + "grad_norm": 0.2902771830558777, + "learning_rate": 1.7929517735937405e-05, + "loss": 0.17, + "step": 7972 + }, + { + "epoch": 1.6136409633677393, + "grad_norm": 0.24810972809791565, + "learning_rate": 1.7911348792867e-05, + "loss": 0.1818, + "step": 7973 + }, + { + "epoch": 1.6138433515482697, + "grad_norm": 0.303475022315979, + "learning_rate": 1.7893188154664984e-05, + "loss": 0.2232, + "step": 7974 + }, + { + "epoch": 1.6140457397288, + "grad_norm": 0.2750754952430725, + "learning_rate": 1.787503582316864e-05, + "loss": 0.1794, + "step": 7975 + }, + { + "epoch": 1.6142481279093301, + "grad_norm": 0.2861912250518799, + "learning_rate": 1.785689180021445e-05, + "loss": 0.2084, + "step": 7976 + }, + { + "epoch": 1.6144505160898603, + "grad_norm": 0.3108361065387726, + "learning_rate": 1.7838756087638032e-05, + "loss": 0.1995, + "step": 7977 + }, + { + "epoch": 1.6146529042703905, + "grad_norm": 0.24518351256847382, + "learning_rate": 1.7820628687274165e-05, + "loss": 0.1889, + "step": 7978 + }, + { + "epoch": 1.6148552924509207, + "grad_norm": 0.2598412036895752, + "learning_rate": 1.7802509600956783e-05, + "loss": 0.1951, + "step": 7979 + }, + { + "epoch": 1.6150576806314512, + "grad_norm": 0.2555294334888458, + "learning_rate": 1.7784398830519e-05, + "loss": 0.1794, + "step": 7980 + }, + { + "epoch": 1.6152600688119814, + "grad_norm": 0.2353818565607071, + "learning_rate": 1.7766296377793058e-05, + "loss": 0.1348, + "step": 7981 + }, + { + "epoch": 1.6154624569925118, + "grad_norm": 0.3107486069202423, + "learning_rate": 1.774820224461038e-05, + "loss": 0.2107, + "step": 7982 + }, + { + "epoch": 1.615664845173042, + "grad_norm": 0.29552415013313293, + "learning_rate": 1.773011643280157e-05, + "loss": 0.1962, + "step": 7983 + }, + { + "epoch": 1.6158672333535722, + "grad_norm": 0.27952340245246887, + "learning_rate": 1.7712038944196296e-05, + "loss": 0.205, + "step": 7984 + }, + { + "epoch": 1.6160696215341024, + "grad_norm": 0.3028942346572876, + "learning_rate": 1.76939697806235e-05, + "loss": 0.1742, + "step": 7985 + }, + { + "epoch": 1.6162720097146326, + "grad_norm": 0.29906564950942993, + "learning_rate": 1.7675908943911202e-05, + "loss": 0.1792, + "step": 7986 + }, + { + "epoch": 1.6164743978951628, + "grad_norm": 0.285399854183197, + "learning_rate": 1.7657856435886623e-05, + "loss": 0.1989, + "step": 7987 + }, + { + "epoch": 1.6166767860756932, + "grad_norm": 0.26634126901626587, + "learning_rate": 1.763981225837612e-05, + "loss": 0.1761, + "step": 7988 + }, + { + "epoch": 1.6168791742562234, + "grad_norm": 0.3051709532737732, + "learning_rate": 1.7621776413205225e-05, + "loss": 0.2189, + "step": 7989 + }, + { + "epoch": 1.6170815624367538, + "grad_norm": 0.29787564277648926, + "learning_rate": 1.7603748902198604e-05, + "loss": 0.1883, + "step": 7990 + }, + { + "epoch": 1.617283950617284, + "grad_norm": 0.30989035964012146, + "learning_rate": 1.75857297271801e-05, + "loss": 0.2233, + "step": 7991 + }, + { + "epoch": 1.6174863387978142, + "grad_norm": 0.256502240896225, + "learning_rate": 1.7567718889972683e-05, + "loss": 0.1822, + "step": 7992 + }, + { + "epoch": 1.6176887269783444, + "grad_norm": 0.26667922735214233, + "learning_rate": 1.754971639239853e-05, + "loss": 0.177, + "step": 7993 + }, + { + "epoch": 1.6178911151588746, + "grad_norm": 0.2559124827384949, + "learning_rate": 1.7531722236278936e-05, + "loss": 0.169, + "step": 7994 + }, + { + "epoch": 1.6180935033394048, + "grad_norm": 0.2702696919441223, + "learning_rate": 1.7513736423434345e-05, + "loss": 0.1685, + "step": 7995 + }, + { + "epoch": 1.6182958915199352, + "grad_norm": 0.2676244080066681, + "learning_rate": 1.7495758955684392e-05, + "loss": 0.1805, + "step": 7996 + }, + { + "epoch": 1.6184982797004654, + "grad_norm": 0.2880995273590088, + "learning_rate": 1.7477789834847837e-05, + "loss": 0.1847, + "step": 7997 + }, + { + "epoch": 1.6187006678809959, + "grad_norm": 0.27174052596092224, + "learning_rate": 1.7459829062742605e-05, + "loss": 0.182, + "step": 7998 + }, + { + "epoch": 1.618903056061526, + "grad_norm": 0.2822988033294678, + "learning_rate": 1.7441876641185795e-05, + "loss": 0.2077, + "step": 7999 + }, + { + "epoch": 1.6191054442420563, + "grad_norm": 0.30251890420913696, + "learning_rate": 1.742393257199363e-05, + "loss": 0.197, + "step": 8000 + }, + { + "epoch": 1.6191054442420563, + "eval_loss": 0.25870126485824585, + "eval_runtime": 0.7362, + "eval_samples_per_second": 6.791, + "eval_steps_per_second": 1.358, + "step": 8000 + }, + { + "epoch": 1.6193078324225865, + "grad_norm": 0.3131512403488159, + "learning_rate": 1.740599685698151e-05, + "loss": 0.2221, + "step": 8001 + }, + { + "epoch": 1.6195102206031167, + "grad_norm": 0.28746527433395386, + "learning_rate": 1.7388069497963967e-05, + "loss": 0.1911, + "step": 8002 + }, + { + "epoch": 1.619712608783647, + "grad_norm": 0.30155429244041443, + "learning_rate": 1.7370150496754722e-05, + "loss": 0.2029, + "step": 8003 + }, + { + "epoch": 1.6199149969641773, + "grad_norm": 0.2872103452682495, + "learning_rate": 1.7352239855166628e-05, + "loss": 0.1808, + "step": 8004 + }, + { + "epoch": 1.6201173851447077, + "grad_norm": 0.32985246181488037, + "learning_rate": 1.7334337575011693e-05, + "loss": 0.2171, + "step": 8005 + }, + { + "epoch": 1.620319773325238, + "grad_norm": 0.24914324283599854, + "learning_rate": 1.731644365810108e-05, + "loss": 0.1695, + "step": 8006 + }, + { + "epoch": 1.620522161505768, + "grad_norm": 0.3195323646068573, + "learning_rate": 1.72985581062451e-05, + "loss": 0.1846, + "step": 8007 + }, + { + "epoch": 1.6207245496862983, + "grad_norm": 0.2776571810245514, + "learning_rate": 1.7280680921253244e-05, + "loss": 0.1696, + "step": 8008 + }, + { + "epoch": 1.6209269378668285, + "grad_norm": 0.2772268056869507, + "learning_rate": 1.7262812104934124e-05, + "loss": 0.2042, + "step": 8009 + }, + { + "epoch": 1.6211293260473587, + "grad_norm": 0.2957817316055298, + "learning_rate": 1.724495165909553e-05, + "loss": 0.173, + "step": 8010 + }, + { + "epoch": 1.6213317142278891, + "grad_norm": 0.27109494805336, + "learning_rate": 1.7227099585544383e-05, + "loss": 0.156, + "step": 8011 + }, + { + "epoch": 1.6215341024084193, + "grad_norm": 0.2688787281513214, + "learning_rate": 1.7209255886086772e-05, + "loss": 0.2092, + "step": 8012 + }, + { + "epoch": 1.6217364905889498, + "grad_norm": 0.2872391641139984, + "learning_rate": 1.7191420562527937e-05, + "loss": 0.2132, + "step": 8013 + }, + { + "epoch": 1.62193887876948, + "grad_norm": 0.2846378684043884, + "learning_rate": 1.717359361667228e-05, + "loss": 0.1788, + "step": 8014 + }, + { + "epoch": 1.6221412669500102, + "grad_norm": 0.2909705638885498, + "learning_rate": 1.715577505032332e-05, + "loss": 0.1824, + "step": 8015 + }, + { + "epoch": 1.6223436551305404, + "grad_norm": 0.3115446865558624, + "learning_rate": 1.7137964865283772e-05, + "loss": 0.1996, + "step": 8016 + }, + { + "epoch": 1.6225460433110706, + "grad_norm": 0.26595601439476013, + "learning_rate": 1.7120163063355477e-05, + "loss": 0.1705, + "step": 8017 + }, + { + "epoch": 1.6227484314916008, + "grad_norm": 0.29795345664024353, + "learning_rate": 1.7102369646339433e-05, + "loss": 0.1812, + "step": 8018 + }, + { + "epoch": 1.6229508196721312, + "grad_norm": 0.30830883979797363, + "learning_rate": 1.7084584616035792e-05, + "loss": 0.214, + "step": 8019 + }, + { + "epoch": 1.6231532078526614, + "grad_norm": 0.24998906254768372, + "learning_rate": 1.706680797424386e-05, + "loss": 0.18, + "step": 8020 + }, + { + "epoch": 1.6233555960331918, + "grad_norm": 0.25060421228408813, + "learning_rate": 1.7049039722762093e-05, + "loss": 0.1782, + "step": 8021 + }, + { + "epoch": 1.623557984213722, + "grad_norm": 0.310921847820282, + "learning_rate": 1.7031279863388083e-05, + "loss": 0.2213, + "step": 8022 + }, + { + "epoch": 1.6237603723942522, + "grad_norm": 0.2564176321029663, + "learning_rate": 1.701352839791861e-05, + "loss": 0.1784, + "step": 8023 + }, + { + "epoch": 1.6239627605747824, + "grad_norm": 0.3001807928085327, + "learning_rate": 1.699578532814955e-05, + "loss": 0.222, + "step": 8024 + }, + { + "epoch": 1.6241651487553126, + "grad_norm": 0.34355929493904114, + "learning_rate": 1.6978050655875987e-05, + "loss": 0.2239, + "step": 8025 + }, + { + "epoch": 1.6243675369358428, + "grad_norm": 0.27204430103302, + "learning_rate": 1.6960324382892123e-05, + "loss": 0.1633, + "step": 8026 + }, + { + "epoch": 1.6245699251163732, + "grad_norm": 0.2537243962287903, + "learning_rate": 1.6942606510991334e-05, + "loss": 0.1327, + "step": 8027 + }, + { + "epoch": 1.6247723132969034, + "grad_norm": 0.2479013055562973, + "learning_rate": 1.692489704196607e-05, + "loss": 0.1683, + "step": 8028 + }, + { + "epoch": 1.6249747014774338, + "grad_norm": 0.28489288687705994, + "learning_rate": 1.6907195977608036e-05, + "loss": 0.1985, + "step": 8029 + }, + { + "epoch": 1.625177089657964, + "grad_norm": 0.25739508867263794, + "learning_rate": 1.6889503319708032e-05, + "loss": 0.1932, + "step": 8030 + }, + { + "epoch": 1.6253794778384942, + "grad_norm": 0.2719423472881317, + "learning_rate": 1.6871819070056017e-05, + "loss": 0.1664, + "step": 8031 + }, + { + "epoch": 1.6255818660190244, + "grad_norm": 0.2762310802936554, + "learning_rate": 1.685414323044109e-05, + "loss": 0.1725, + "step": 8032 + }, + { + "epoch": 1.6257842541995546, + "grad_norm": 0.2826240658760071, + "learning_rate": 1.683647580265151e-05, + "loss": 0.1978, + "step": 8033 + }, + { + "epoch": 1.625986642380085, + "grad_norm": 0.2921895384788513, + "learning_rate": 1.681881678847468e-05, + "loss": 0.162, + "step": 8034 + }, + { + "epoch": 1.6261890305606153, + "grad_norm": 0.2500416338443756, + "learning_rate": 1.680116618969716e-05, + "loss": 0.1404, + "step": 8035 + }, + { + "epoch": 1.6263914187411457, + "grad_norm": 0.3063223659992218, + "learning_rate": 1.6783524008104647e-05, + "loss": 0.1942, + "step": 8036 + }, + { + "epoch": 1.6265938069216759, + "grad_norm": 0.30133017897605896, + "learning_rate": 1.6765890245481997e-05, + "loss": 0.1747, + "step": 8037 + }, + { + "epoch": 1.626796195102206, + "grad_norm": 0.3058010935783386, + "learning_rate": 1.6748264903613208e-05, + "loss": 0.2044, + "step": 8038 + }, + { + "epoch": 1.6269985832827363, + "grad_norm": 0.2959883213043213, + "learning_rate": 1.6730647984281423e-05, + "loss": 0.1969, + "step": 8039 + }, + { + "epoch": 1.6272009714632665, + "grad_norm": 0.29775944352149963, + "learning_rate": 1.6713039489268945e-05, + "loss": 0.2063, + "step": 8040 + }, + { + "epoch": 1.6274033596437967, + "grad_norm": 0.32729557156562805, + "learning_rate": 1.6695439420357206e-05, + "loss": 0.2266, + "step": 8041 + }, + { + "epoch": 1.627605747824327, + "grad_norm": 0.2869478464126587, + "learning_rate": 1.6677847779326805e-05, + "loss": 0.1958, + "step": 8042 + }, + { + "epoch": 1.6278081360048573, + "grad_norm": 0.2807890772819519, + "learning_rate": 1.6660264567957474e-05, + "loss": 0.2006, + "step": 8043 + }, + { + "epoch": 1.6280105241853877, + "grad_norm": 0.278018981218338, + "learning_rate": 1.66426897880281e-05, + "loss": 0.1989, + "step": 8044 + }, + { + "epoch": 1.628212912365918, + "grad_norm": 0.27039635181427, + "learning_rate": 1.6625123441316716e-05, + "loss": 0.1856, + "step": 8045 + }, + { + "epoch": 1.6284153005464481, + "grad_norm": 0.3077291250228882, + "learning_rate": 1.66075655296005e-05, + "loss": 0.1884, + "step": 8046 + }, + { + "epoch": 1.6286176887269783, + "grad_norm": 0.30340608954429626, + "learning_rate": 1.6590016054655766e-05, + "loss": 0.2076, + "step": 8047 + }, + { + "epoch": 1.6288200769075085, + "grad_norm": 0.2628271281719208, + "learning_rate": 1.6572475018258015e-05, + "loss": 0.1698, + "step": 8048 + }, + { + "epoch": 1.6290224650880387, + "grad_norm": 0.31509193778038025, + "learning_rate": 1.6554942422181863e-05, + "loss": 0.2022, + "step": 8049 + }, + { + "epoch": 1.6292248532685691, + "grad_norm": 0.2697297930717468, + "learning_rate": 1.6537418268201034e-05, + "loss": 0.1686, + "step": 8050 + }, + { + "epoch": 1.6292248532685691, + "eval_loss": 0.25894036889076233, + "eval_runtime": 0.7385, + "eval_samples_per_second": 6.77, + "eval_steps_per_second": 1.354, + "step": 8050 + }, + { + "epoch": 1.6294272414490993, + "grad_norm": 0.2741582691669464, + "learning_rate": 1.651990255808845e-05, + "loss": 0.1972, + "step": 8051 + }, + { + "epoch": 1.6296296296296298, + "grad_norm": 0.26430466771125793, + "learning_rate": 1.650239529361619e-05, + "loss": 0.1881, + "step": 8052 + }, + { + "epoch": 1.62983201781016, + "grad_norm": 0.26232969760894775, + "learning_rate": 1.6484896476555445e-05, + "loss": 0.2028, + "step": 8053 + }, + { + "epoch": 1.6300344059906902, + "grad_norm": 0.3205881118774414, + "learning_rate": 1.6467406108676554e-05, + "loss": 0.219, + "step": 8054 + }, + { + "epoch": 1.6302367941712204, + "grad_norm": 0.26163017749786377, + "learning_rate": 1.6449924191749024e-05, + "loss": 0.1615, + "step": 8055 + }, + { + "epoch": 1.6304391823517506, + "grad_norm": 0.2618088722229004, + "learning_rate": 1.643245072754145e-05, + "loss": 0.175, + "step": 8056 + }, + { + "epoch": 1.6306415705322808, + "grad_norm": 0.28531819581985474, + "learning_rate": 1.6414985717821673e-05, + "loss": 0.2005, + "step": 8057 + }, + { + "epoch": 1.6308439587128112, + "grad_norm": 0.27115190029144287, + "learning_rate": 1.6397529164356606e-05, + "loss": 0.1824, + "step": 8058 + }, + { + "epoch": 1.6310463468933414, + "grad_norm": 0.28367385268211365, + "learning_rate": 1.638008106891229e-05, + "loss": 0.1639, + "step": 8059 + }, + { + "epoch": 1.6312487350738718, + "grad_norm": 0.2776460647583008, + "learning_rate": 1.6362641433253968e-05, + "loss": 0.1709, + "step": 8060 + }, + { + "epoch": 1.631451123254402, + "grad_norm": 0.2931157052516937, + "learning_rate": 1.634521025914598e-05, + "loss": 0.2073, + "step": 8061 + }, + { + "epoch": 1.6316535114349322, + "grad_norm": 0.32506170868873596, + "learning_rate": 1.6327787548351848e-05, + "loss": 0.1834, + "step": 8062 + }, + { + "epoch": 1.6318558996154624, + "grad_norm": 0.31267428398132324, + "learning_rate": 1.6310373302634208e-05, + "loss": 0.1932, + "step": 8063 + }, + { + "epoch": 1.6320582877959926, + "grad_norm": 0.30341413617134094, + "learning_rate": 1.6292967523754855e-05, + "loss": 0.2089, + "step": 8064 + }, + { + "epoch": 1.632260675976523, + "grad_norm": 0.2488972693681717, + "learning_rate": 1.627557021347471e-05, + "loss": 0.1712, + "step": 8065 + }, + { + "epoch": 1.6324630641570532, + "grad_norm": 0.25763022899627686, + "learning_rate": 1.625818137355386e-05, + "loss": 0.1739, + "step": 8066 + }, + { + "epoch": 1.6326654523375836, + "grad_norm": 0.27001476287841797, + "learning_rate": 1.6240801005751538e-05, + "loss": 0.1675, + "step": 8067 + }, + { + "epoch": 1.6328678405181138, + "grad_norm": 0.2840779423713684, + "learning_rate": 1.6223429111826083e-05, + "loss": 0.1873, + "step": 8068 + }, + { + "epoch": 1.633070228698644, + "grad_norm": 0.3620816469192505, + "learning_rate": 1.620606569353502e-05, + "loss": 0.2138, + "step": 8069 + }, + { + "epoch": 1.6332726168791742, + "grad_norm": 0.3016754984855652, + "learning_rate": 1.6188710752634985e-05, + "loss": 0.1899, + "step": 8070 + }, + { + "epoch": 1.6334750050597044, + "grad_norm": 0.3155854642391205, + "learning_rate": 1.6171364290881808e-05, + "loss": 0.2162, + "step": 8071 + }, + { + "epoch": 1.6336773932402346, + "grad_norm": 0.3163856565952301, + "learning_rate": 1.6154026310030358e-05, + "loss": 0.2142, + "step": 8072 + }, + { + "epoch": 1.633879781420765, + "grad_norm": 0.3073074221611023, + "learning_rate": 1.6136696811834727e-05, + "loss": 0.1627, + "step": 8073 + }, + { + "epoch": 1.6340821696012953, + "grad_norm": 0.3004322648048401, + "learning_rate": 1.6119375798048163e-05, + "loss": 0.1736, + "step": 8074 + }, + { + "epoch": 1.6342845577818257, + "grad_norm": 0.24818210303783417, + "learning_rate": 1.6102063270422995e-05, + "loss": 0.1756, + "step": 8075 + }, + { + "epoch": 1.6344869459623559, + "grad_norm": 0.33458349108695984, + "learning_rate": 1.6084759230710745e-05, + "loss": 0.1882, + "step": 8076 + }, + { + "epoch": 1.634689334142886, + "grad_norm": 0.24042929708957672, + "learning_rate": 1.6067463680662043e-05, + "loss": 0.1762, + "step": 8077 + }, + { + "epoch": 1.6348917223234163, + "grad_norm": 0.275277704000473, + "learning_rate": 1.605017662202666e-05, + "loss": 0.176, + "step": 8078 + }, + { + "epoch": 1.6350941105039465, + "grad_norm": 0.2901867628097534, + "learning_rate": 1.6032898056553535e-05, + "loss": 0.1828, + "step": 8079 + }, + { + "epoch": 1.6352964986844767, + "grad_norm": 0.2616790235042572, + "learning_rate": 1.6015627985990732e-05, + "loss": 0.1824, + "step": 8080 + }, + { + "epoch": 1.635498886865007, + "grad_norm": 0.2518100440502167, + "learning_rate": 1.5998366412085452e-05, + "loss": 0.1616, + "step": 8081 + }, + { + "epoch": 1.6357012750455373, + "grad_norm": 0.30384114384651184, + "learning_rate": 1.5981113336584043e-05, + "loss": 0.2056, + "step": 8082 + }, + { + "epoch": 1.6359036632260677, + "grad_norm": 0.2821972072124481, + "learning_rate": 1.5963868761231983e-05, + "loss": 0.1708, + "step": 8083 + }, + { + "epoch": 1.636106051406598, + "grad_norm": 0.2594367265701294, + "learning_rate": 1.594663268777389e-05, + "loss": 0.1987, + "step": 8084 + }, + { + "epoch": 1.6363084395871281, + "grad_norm": 0.2959327697753906, + "learning_rate": 1.5929405117953557e-05, + "loss": 0.2098, + "step": 8085 + }, + { + "epoch": 1.6365108277676583, + "grad_norm": 0.2778414785861969, + "learning_rate": 1.5912186053513856e-05, + "loss": 0.1993, + "step": 8086 + }, + { + "epoch": 1.6367132159481885, + "grad_norm": 0.2850133776664734, + "learning_rate": 1.589497549619685e-05, + "loss": 0.1948, + "step": 8087 + }, + { + "epoch": 1.6369156041287187, + "grad_norm": 0.26495492458343506, + "learning_rate": 1.5877773447743725e-05, + "loss": 0.1821, + "step": 8088 + }, + { + "epoch": 1.6371179923092491, + "grad_norm": 0.2508051097393036, + "learning_rate": 1.58605799098948e-05, + "loss": 0.1768, + "step": 8089 + }, + { + "epoch": 1.6373203804897793, + "grad_norm": 0.27083274722099304, + "learning_rate": 1.5843394884389528e-05, + "loss": 0.1993, + "step": 8090 + }, + { + "epoch": 1.6375227686703098, + "grad_norm": 0.2604304850101471, + "learning_rate": 1.5826218372966517e-05, + "loss": 0.1809, + "step": 8091 + }, + { + "epoch": 1.63772515685084, + "grad_norm": 0.3725582957267761, + "learning_rate": 1.580905037736351e-05, + "loss": 0.1671, + "step": 8092 + }, + { + "epoch": 1.6379275450313702, + "grad_norm": 0.2975594401359558, + "learning_rate": 1.5791890899317374e-05, + "loss": 0.1744, + "step": 8093 + }, + { + "epoch": 1.6381299332119004, + "grad_norm": 0.2582997679710388, + "learning_rate": 1.5774739940564165e-05, + "loss": 0.1551, + "step": 8094 + }, + { + "epoch": 1.6383323213924306, + "grad_norm": 0.3007787764072418, + "learning_rate": 1.5757597502838973e-05, + "loss": 0.2484, + "step": 8095 + }, + { + "epoch": 1.638534709572961, + "grad_norm": 0.2605661153793335, + "learning_rate": 1.574046358787612e-05, + "loss": 0.171, + "step": 8096 + }, + { + "epoch": 1.6387370977534912, + "grad_norm": 0.3007476329803467, + "learning_rate": 1.572333819740903e-05, + "loss": 0.1868, + "step": 8097 + }, + { + "epoch": 1.6389394859340216, + "grad_norm": 0.2613348364830017, + "learning_rate": 1.570622133317028e-05, + "loss": 0.1622, + "step": 8098 + }, + { + "epoch": 1.6391418741145518, + "grad_norm": 0.277700275182724, + "learning_rate": 1.5689112996891576e-05, + "loss": 0.184, + "step": 8099 + }, + { + "epoch": 1.639344262295082, + "grad_norm": 0.25784534215927124, + "learning_rate": 1.5672013190303757e-05, + "loss": 0.1875, + "step": 8100 + }, + { + "epoch": 1.639344262295082, + "eval_loss": 0.25882890820503235, + "eval_runtime": 0.7369, + "eval_samples_per_second": 6.785, + "eval_steps_per_second": 1.357, + "step": 8100 + }, + { + "epoch": 1.6395466504756122, + "grad_norm": 0.2594480514526367, + "learning_rate": 1.5654921915136787e-05, + "loss": 0.2041, + "step": 8101 + }, + { + "epoch": 1.6397490386561424, + "grad_norm": 0.2852243185043335, + "learning_rate": 1.5637839173119807e-05, + "loss": 0.2062, + "step": 8102 + }, + { + "epoch": 1.6399514268366726, + "grad_norm": 0.25183483958244324, + "learning_rate": 1.5620764965981048e-05, + "loss": 0.1562, + "step": 8103 + }, + { + "epoch": 1.640153815017203, + "grad_norm": 0.30132558941841125, + "learning_rate": 1.5603699295447916e-05, + "loss": 0.2095, + "step": 8104 + }, + { + "epoch": 1.6403562031977332, + "grad_norm": 0.25491568446159363, + "learning_rate": 1.5586642163246934e-05, + "loss": 0.1582, + "step": 8105 + }, + { + "epoch": 1.6405585913782637, + "grad_norm": 0.26761573553085327, + "learning_rate": 1.5569593571103747e-05, + "loss": 0.1787, + "step": 8106 + }, + { + "epoch": 1.6407609795587939, + "grad_norm": 0.29488828778266907, + "learning_rate": 1.5552553520743163e-05, + "loss": 0.1755, + "step": 8107 + }, + { + "epoch": 1.640963367739324, + "grad_norm": 0.2793586254119873, + "learning_rate": 1.5535522013889125e-05, + "loss": 0.18, + "step": 8108 + }, + { + "epoch": 1.6411657559198543, + "grad_norm": 0.3017469644546509, + "learning_rate": 1.551849905226469e-05, + "loss": 0.1824, + "step": 8109 + }, + { + "epoch": 1.6413681441003845, + "grad_norm": 0.30350354313850403, + "learning_rate": 1.5501484637592067e-05, + "loss": 0.2096, + "step": 8110 + }, + { + "epoch": 1.6415705322809147, + "grad_norm": 0.3905794024467468, + "learning_rate": 1.5484478771592602e-05, + "loss": 0.2198, + "step": 8111 + }, + { + "epoch": 1.641772920461445, + "grad_norm": 0.2384035289287567, + "learning_rate": 1.5467481455986755e-05, + "loss": 0.1817, + "step": 8112 + }, + { + "epoch": 1.6419753086419753, + "grad_norm": 0.3253072500228882, + "learning_rate": 1.5450492692494146e-05, + "loss": 0.1822, + "step": 8113 + }, + { + "epoch": 1.6421776968225057, + "grad_norm": 0.23735348880290985, + "learning_rate": 1.5433512482833523e-05, + "loss": 0.1631, + "step": 8114 + }, + { + "epoch": 1.642380085003036, + "grad_norm": 0.3243256211280823, + "learning_rate": 1.5416540828722738e-05, + "loss": 0.2103, + "step": 8115 + }, + { + "epoch": 1.642582473183566, + "grad_norm": 0.3111126720905304, + "learning_rate": 1.5399577731878867e-05, + "loss": 0.2073, + "step": 8116 + }, + { + "epoch": 1.6427848613640963, + "grad_norm": 0.24906755983829498, + "learning_rate": 1.5382623194017996e-05, + "loss": 0.1634, + "step": 8117 + }, + { + "epoch": 1.6429872495446265, + "grad_norm": 0.24573220312595367, + "learning_rate": 1.5365677216855435e-05, + "loss": 0.1508, + "step": 8118 + }, + { + "epoch": 1.6431896377251567, + "grad_norm": 0.34453287720680237, + "learning_rate": 1.5348739802105592e-05, + "loss": 0.2242, + "step": 8119 + }, + { + "epoch": 1.6433920259056871, + "grad_norm": 0.2754497528076172, + "learning_rate": 1.533181095148203e-05, + "loss": 0.2004, + "step": 8120 + }, + { + "epoch": 1.6435944140862173, + "grad_norm": 0.3048870265483856, + "learning_rate": 1.531489066669741e-05, + "loss": 0.2032, + "step": 8121 + }, + { + "epoch": 1.6437968022667477, + "grad_norm": 0.2695624828338623, + "learning_rate": 1.5297978949463566e-05, + "loss": 0.1528, + "step": 8122 + }, + { + "epoch": 1.643999190447278, + "grad_norm": 0.2954610288143158, + "learning_rate": 1.5281075801491452e-05, + "loss": 0.1739, + "step": 8123 + }, + { + "epoch": 1.6442015786278081, + "grad_norm": 0.26412299275398254, + "learning_rate": 1.5264181224491138e-05, + "loss": 0.168, + "step": 8124 + }, + { + "epoch": 1.6444039668083383, + "grad_norm": 0.27403730154037476, + "learning_rate": 1.524729522017183e-05, + "loss": 0.1737, + "step": 8125 + }, + { + "epoch": 1.6446063549888685, + "grad_norm": 0.24554625153541565, + "learning_rate": 1.5230417790241913e-05, + "loss": 0.1825, + "step": 8126 + }, + { + "epoch": 1.644808743169399, + "grad_norm": 0.30419591069221497, + "learning_rate": 1.5213548936408829e-05, + "loss": 0.1677, + "step": 8127 + }, + { + "epoch": 1.6450111313499292, + "grad_norm": 0.23226524889469147, + "learning_rate": 1.519668866037922e-05, + "loss": 0.1396, + "step": 8128 + }, + { + "epoch": 1.6452135195304596, + "grad_norm": 0.27901968359947205, + "learning_rate": 1.5179836963858818e-05, + "loss": 0.2219, + "step": 8129 + }, + { + "epoch": 1.6454159077109898, + "grad_norm": 0.2586843967437744, + "learning_rate": 1.5162993848552509e-05, + "loss": 0.1851, + "step": 8130 + }, + { + "epoch": 1.64561829589152, + "grad_norm": 0.3563934862613678, + "learning_rate": 1.5146159316164299e-05, + "loss": 0.1968, + "step": 8131 + }, + { + "epoch": 1.6458206840720502, + "grad_norm": 0.2968688905239105, + "learning_rate": 1.5129333368397314e-05, + "loss": 0.1996, + "step": 8132 + }, + { + "epoch": 1.6460230722525804, + "grad_norm": 0.27744680643081665, + "learning_rate": 1.5112516006953858e-05, + "loss": 0.1761, + "step": 8133 + }, + { + "epoch": 1.6462254604331106, + "grad_norm": 0.274127334356308, + "learning_rate": 1.5095707233535306e-05, + "loss": 0.1802, + "step": 8134 + }, + { + "epoch": 1.646427848613641, + "grad_norm": 0.295303612947464, + "learning_rate": 1.5078907049842217e-05, + "loss": 0.2064, + "step": 8135 + }, + { + "epoch": 1.6466302367941712, + "grad_norm": 0.21443642675876617, + "learning_rate": 1.5062115457574232e-05, + "loss": 0.165, + "step": 8136 + }, + { + "epoch": 1.6468326249747016, + "grad_norm": 0.3120724558830261, + "learning_rate": 1.5045332458430173e-05, + "loss": 0.1912, + "step": 8137 + }, + { + "epoch": 1.6470350131552318, + "grad_norm": 0.25102463364601135, + "learning_rate": 1.5028558054107977e-05, + "loss": 0.1789, + "step": 8138 + }, + { + "epoch": 1.647237401335762, + "grad_norm": 0.3505784273147583, + "learning_rate": 1.5011792246304657e-05, + "loss": 0.1845, + "step": 8139 + }, + { + "epoch": 1.6474397895162922, + "grad_norm": 0.2952693998813629, + "learning_rate": 1.499503503671642e-05, + "loss": 0.2515, + "step": 8140 + }, + { + "epoch": 1.6476421776968224, + "grad_norm": 0.2916133999824524, + "learning_rate": 1.4978286427038601e-05, + "loss": 0.1895, + "step": 8141 + }, + { + "epoch": 1.6478445658773526, + "grad_norm": 0.30217990279197693, + "learning_rate": 1.4961546418965633e-05, + "loss": 0.1896, + "step": 8142 + }, + { + "epoch": 1.648046954057883, + "grad_norm": 0.2839777171611786, + "learning_rate": 1.4944815014191104e-05, + "loss": 0.1674, + "step": 8143 + }, + { + "epoch": 1.6482493422384132, + "grad_norm": 0.2538793087005615, + "learning_rate": 1.4928092214407719e-05, + "loss": 0.1737, + "step": 8144 + }, + { + "epoch": 1.6484517304189437, + "grad_norm": 0.2852862775325775, + "learning_rate": 1.491137802130731e-05, + "loss": 0.183, + "step": 8145 + }, + { + "epoch": 1.6486541185994739, + "grad_norm": 0.28725042939186096, + "learning_rate": 1.4894672436580847e-05, + "loss": 0.1602, + "step": 8146 + }, + { + "epoch": 1.648856506780004, + "grad_norm": 0.29489025473594666, + "learning_rate": 1.4877975461918436e-05, + "loss": 0.1929, + "step": 8147 + }, + { + "epoch": 1.6490588949605343, + "grad_norm": 0.25739914178848267, + "learning_rate": 1.486128709900928e-05, + "loss": 0.1816, + "step": 8148 + }, + { + "epoch": 1.6492612831410645, + "grad_norm": 0.2747301757335663, + "learning_rate": 1.484460734954175e-05, + "loss": 0.1734, + "step": 8149 + }, + { + "epoch": 1.6494636713215947, + "grad_norm": 0.3500189483165741, + "learning_rate": 1.482793621520333e-05, + "loss": 0.1936, + "step": 8150 + }, + { + "epoch": 1.6494636713215947, + "eval_loss": 0.26037317514419556, + "eval_runtime": 0.7394, + "eval_samples_per_second": 6.762, + "eval_steps_per_second": 1.352, + "step": 8150 + }, + { + "epoch": 1.649666059502125, + "grad_norm": 0.2614065110683441, + "learning_rate": 1.4811273697680616e-05, + "loss": 0.1686, + "step": 8151 + }, + { + "epoch": 1.6498684476826553, + "grad_norm": 0.31688347458839417, + "learning_rate": 1.4794619798659359e-05, + "loss": 0.1951, + "step": 8152 + }, + { + "epoch": 1.6500708358631857, + "grad_norm": 0.29736804962158203, + "learning_rate": 1.4777974519824411e-05, + "loss": 0.1883, + "step": 8153 + }, + { + "epoch": 1.650273224043716, + "grad_norm": 0.2695557177066803, + "learning_rate": 1.4761337862859782e-05, + "loss": 0.1575, + "step": 8154 + }, + { + "epoch": 1.650475612224246, + "grad_norm": 0.27493923902511597, + "learning_rate": 1.4744709829448588e-05, + "loss": 0.1606, + "step": 8155 + }, + { + "epoch": 1.6506780004047763, + "grad_norm": 0.2712497115135193, + "learning_rate": 1.4728090421273088e-05, + "loss": 0.1817, + "step": 8156 + }, + { + "epoch": 1.6508803885853065, + "grad_norm": 0.3294946551322937, + "learning_rate": 1.4711479640014646e-05, + "loss": 0.1953, + "step": 8157 + }, + { + "epoch": 1.651082776765837, + "grad_norm": 0.32231223583221436, + "learning_rate": 1.4694877487353765e-05, + "loss": 0.2271, + "step": 8158 + }, + { + "epoch": 1.6512851649463671, + "grad_norm": 0.24417562782764435, + "learning_rate": 1.4678283964970096e-05, + "loss": 0.1673, + "step": 8159 + }, + { + "epoch": 1.6514875531268975, + "grad_norm": 0.3897631764411926, + "learning_rate": 1.4661699074542378e-05, + "loss": 0.2331, + "step": 8160 + }, + { + "epoch": 1.6516899413074277, + "grad_norm": 0.25601083040237427, + "learning_rate": 1.4645122817748503e-05, + "loss": 0.1779, + "step": 8161 + }, + { + "epoch": 1.651892329487958, + "grad_norm": 0.2515465021133423, + "learning_rate": 1.4628555196265482e-05, + "loss": 0.1894, + "step": 8162 + }, + { + "epoch": 1.6520947176684881, + "grad_norm": 0.2911391258239746, + "learning_rate": 1.4611996211769452e-05, + "loss": 0.187, + "step": 8163 + }, + { + "epoch": 1.6522971058490183, + "grad_norm": 0.2927611768245697, + "learning_rate": 1.4595445865935676e-05, + "loss": 0.213, + "step": 8164 + }, + { + "epoch": 1.6524994940295485, + "grad_norm": 0.24805690348148346, + "learning_rate": 1.4578904160438555e-05, + "loss": 0.1776, + "step": 8165 + }, + { + "epoch": 1.652701882210079, + "grad_norm": 0.32265952229499817, + "learning_rate": 1.45623710969516e-05, + "loss": 0.2215, + "step": 8166 + }, + { + "epoch": 1.6529042703906092, + "grad_norm": 0.30108514428138733, + "learning_rate": 1.4545846677147445e-05, + "loss": 0.1853, + "step": 8167 + }, + { + "epoch": 1.6531066585711396, + "grad_norm": 0.26640450954437256, + "learning_rate": 1.4529330902697857e-05, + "loss": 0.1889, + "step": 8168 + }, + { + "epoch": 1.6533090467516698, + "grad_norm": 0.2683650553226471, + "learning_rate": 1.4512823775273743e-05, + "loss": 0.1558, + "step": 8169 + }, + { + "epoch": 1.6535114349322, + "grad_norm": 0.2825409173965454, + "learning_rate": 1.449632529654512e-05, + "loss": 0.1778, + "step": 8170 + }, + { + "epoch": 1.6537138231127302, + "grad_norm": 0.26279154419898987, + "learning_rate": 1.4479835468181113e-05, + "loss": 0.1961, + "step": 8171 + }, + { + "epoch": 1.6539162112932604, + "grad_norm": 0.3498830199241638, + "learning_rate": 1.4463354291850007e-05, + "loss": 0.1924, + "step": 8172 + }, + { + "epoch": 1.6541185994737906, + "grad_norm": 0.3345295190811157, + "learning_rate": 1.4446881769219178e-05, + "loss": 0.193, + "step": 8173 + }, + { + "epoch": 1.654320987654321, + "grad_norm": 0.27633896470069885, + "learning_rate": 1.4430417901955163e-05, + "loss": 0.1685, + "step": 8174 + }, + { + "epoch": 1.6545233758348512, + "grad_norm": 0.2868563234806061, + "learning_rate": 1.4413962691723582e-05, + "loss": 0.173, + "step": 8175 + }, + { + "epoch": 1.6547257640153816, + "grad_norm": 0.2706867754459381, + "learning_rate": 1.4397516140189216e-05, + "loss": 0.2111, + "step": 8176 + }, + { + "epoch": 1.6549281521959118, + "grad_norm": 0.25419893860816956, + "learning_rate": 1.4381078249015955e-05, + "loss": 0.1958, + "step": 8177 + }, + { + "epoch": 1.655130540376442, + "grad_norm": 0.3127739429473877, + "learning_rate": 1.4364649019866804e-05, + "loss": 0.2044, + "step": 8178 + }, + { + "epoch": 1.6553329285569722, + "grad_norm": 0.25201788544654846, + "learning_rate": 1.4348228454403912e-05, + "loss": 0.1591, + "step": 8179 + }, + { + "epoch": 1.6555353167375024, + "grad_norm": 0.277378648519516, + "learning_rate": 1.4331816554288524e-05, + "loss": 0.2106, + "step": 8180 + }, + { + "epoch": 1.6557377049180326, + "grad_norm": 0.26922157406806946, + "learning_rate": 1.4315413321181027e-05, + "loss": 0.1854, + "step": 8181 + }, + { + "epoch": 1.655940093098563, + "grad_norm": 0.2691102623939514, + "learning_rate": 1.4299018756740933e-05, + "loss": 0.1687, + "step": 8182 + }, + { + "epoch": 1.6561424812790932, + "grad_norm": 0.28088414669036865, + "learning_rate": 1.4282632862626899e-05, + "loss": 0.1704, + "step": 8183 + }, + { + "epoch": 1.6563448694596237, + "grad_norm": 0.2544093430042267, + "learning_rate": 1.4266255640496629e-05, + "loss": 0.1811, + "step": 8184 + }, + { + "epoch": 1.6565472576401539, + "grad_norm": 0.2784738838672638, + "learning_rate": 1.424988709200702e-05, + "loss": 0.1995, + "step": 8185 + }, + { + "epoch": 1.656749645820684, + "grad_norm": 0.31763532757759094, + "learning_rate": 1.4233527218814058e-05, + "loss": 0.199, + "step": 8186 + }, + { + "epoch": 1.6569520340012143, + "grad_norm": 0.29642024636268616, + "learning_rate": 1.4217176022572887e-05, + "loss": 0.153, + "step": 8187 + }, + { + "epoch": 1.6571544221817445, + "grad_norm": 0.3127974569797516, + "learning_rate": 1.4200833504937727e-05, + "loss": 0.2044, + "step": 8188 + }, + { + "epoch": 1.657356810362275, + "grad_norm": 0.2773086428642273, + "learning_rate": 1.4184499667561956e-05, + "loss": 0.1911, + "step": 8189 + }, + { + "epoch": 1.657559198542805, + "grad_norm": 0.3058187663555145, + "learning_rate": 1.4168174512098064e-05, + "loss": 0.216, + "step": 8190 + }, + { + "epoch": 1.6577615867233355, + "grad_norm": 0.26577994227409363, + "learning_rate": 1.415185804019764e-05, + "loss": 0.1779, + "step": 8191 + }, + { + "epoch": 1.6579639749038657, + "grad_norm": 0.32752305269241333, + "learning_rate": 1.4135550253511432e-05, + "loss": 0.2328, + "step": 8192 + }, + { + "epoch": 1.658166363084396, + "grad_norm": 0.3030085265636444, + "learning_rate": 1.4119251153689283e-05, + "loss": 0.1938, + "step": 8193 + }, + { + "epoch": 1.658368751264926, + "grad_norm": 0.3024362027645111, + "learning_rate": 1.4102960742380167e-05, + "loss": 0.2202, + "step": 8194 + }, + { + "epoch": 1.6585711394454563, + "grad_norm": 0.29923704266548157, + "learning_rate": 1.4086679021232174e-05, + "loss": 0.2046, + "step": 8195 + }, + { + "epoch": 1.6587735276259865, + "grad_norm": 0.3013087213039398, + "learning_rate": 1.407040599189252e-05, + "loss": 0.194, + "step": 8196 + }, + { + "epoch": 1.658975915806517, + "grad_norm": 0.30144721269607544, + "learning_rate": 1.4054141656007536e-05, + "loss": 0.2071, + "step": 8197 + }, + { + "epoch": 1.6591783039870471, + "grad_norm": 0.2825508713722229, + "learning_rate": 1.403788601522268e-05, + "loss": 0.1909, + "step": 8198 + }, + { + "epoch": 1.6593806921675776, + "grad_norm": 0.2649502456188202, + "learning_rate": 1.4021639071182534e-05, + "loss": 0.1847, + "step": 8199 + }, + { + "epoch": 1.6595830803481078, + "grad_norm": 0.26707082986831665, + "learning_rate": 1.4005400825530778e-05, + "loss": 0.1752, + "step": 8200 + }, + { + "epoch": 1.6595830803481078, + "eval_loss": 0.26061928272247314, + "eval_runtime": 0.7392, + "eval_samples_per_second": 6.764, + "eval_steps_per_second": 1.353, + "step": 8200 + }, + { + "epoch": 1.659785468528638, + "grad_norm": 0.30190932750701904, + "learning_rate": 1.398917127991024e-05, + "loss": 0.2433, + "step": 8201 + }, + { + "epoch": 1.6599878567091682, + "grad_norm": 0.2780781090259552, + "learning_rate": 1.397295043596285e-05, + "loss": 0.1669, + "step": 8202 + }, + { + "epoch": 1.6601902448896984, + "grad_norm": 0.2770734131336212, + "learning_rate": 1.3956738295329664e-05, + "loss": 0.2001, + "step": 8203 + }, + { + "epoch": 1.6603926330702286, + "grad_norm": 0.2806360125541687, + "learning_rate": 1.3940534859650844e-05, + "loss": 0.1948, + "step": 8204 + }, + { + "epoch": 1.660595021250759, + "grad_norm": 0.25580450892448425, + "learning_rate": 1.3924340130565727e-05, + "loss": 0.1718, + "step": 8205 + }, + { + "epoch": 1.6607974094312892, + "grad_norm": 0.3279639482498169, + "learning_rate": 1.3908154109712679e-05, + "loss": 0.1913, + "step": 8206 + }, + { + "epoch": 1.6609997976118196, + "grad_norm": 0.3011409640312195, + "learning_rate": 1.3891976798729234e-05, + "loss": 0.1917, + "step": 8207 + }, + { + "epoch": 1.6612021857923498, + "grad_norm": 0.24959488213062286, + "learning_rate": 1.387580819925206e-05, + "loss": 0.1595, + "step": 8208 + }, + { + "epoch": 1.66140457397288, + "grad_norm": 0.2853519916534424, + "learning_rate": 1.3859648312916907e-05, + "loss": 0.1986, + "step": 8209 + }, + { + "epoch": 1.6616069621534102, + "grad_norm": 0.2913047969341278, + "learning_rate": 1.3843497141358685e-05, + "loss": 0.2041, + "step": 8210 + }, + { + "epoch": 1.6618093503339404, + "grad_norm": 0.25920918583869934, + "learning_rate": 1.3827354686211403e-05, + "loss": 0.1602, + "step": 8211 + }, + { + "epoch": 1.6620117385144706, + "grad_norm": 0.28853464126586914, + "learning_rate": 1.3811220949108172e-05, + "loss": 0.183, + "step": 8212 + }, + { + "epoch": 1.662214126695001, + "grad_norm": 0.2855565845966339, + "learning_rate": 1.3795095931681235e-05, + "loss": 0.1666, + "step": 8213 + }, + { + "epoch": 1.6624165148755312, + "grad_norm": 0.29030704498291016, + "learning_rate": 1.3778979635561962e-05, + "loss": 0.211, + "step": 8214 + }, + { + "epoch": 1.6626189030560616, + "grad_norm": 0.23719587922096252, + "learning_rate": 1.3762872062380805e-05, + "loss": 0.1485, + "step": 8215 + }, + { + "epoch": 1.6628212912365918, + "grad_norm": 0.26473090052604675, + "learning_rate": 1.3746773213767394e-05, + "loss": 0.2039, + "step": 8216 + }, + { + "epoch": 1.663023679417122, + "grad_norm": 0.26345399022102356, + "learning_rate": 1.3730683091350415e-05, + "loss": 0.1863, + "step": 8217 + }, + { + "epoch": 1.6632260675976522, + "grad_norm": 0.28619056940078735, + "learning_rate": 1.3714601696757712e-05, + "loss": 0.1959, + "step": 8218 + }, + { + "epoch": 1.6634284557781824, + "grad_norm": 0.2565390467643738, + "learning_rate": 1.3698529031616225e-05, + "loss": 0.1823, + "step": 8219 + }, + { + "epoch": 1.6636308439587129, + "grad_norm": 0.2777693569660187, + "learning_rate": 1.3682465097552021e-05, + "loss": 0.2056, + "step": 8220 + }, + { + "epoch": 1.663833232139243, + "grad_norm": 0.25533127784729004, + "learning_rate": 1.366640989619028e-05, + "loss": 0.1662, + "step": 8221 + }, + { + "epoch": 1.6640356203197735, + "grad_norm": 0.32117244601249695, + "learning_rate": 1.3650363429155288e-05, + "loss": 0.2295, + "step": 8222 + }, + { + "epoch": 1.6642380085003037, + "grad_norm": 0.2592966556549072, + "learning_rate": 1.3634325698070471e-05, + "loss": 0.1592, + "step": 8223 + }, + { + "epoch": 1.6644403966808339, + "grad_norm": 0.3049880862236023, + "learning_rate": 1.3618296704558364e-05, + "loss": 0.2058, + "step": 8224 + }, + { + "epoch": 1.664642784861364, + "grad_norm": 0.28951483964920044, + "learning_rate": 1.3602276450240603e-05, + "loss": 0.1971, + "step": 8225 + }, + { + "epoch": 1.6648451730418943, + "grad_norm": 0.31505659222602844, + "learning_rate": 1.3586264936737936e-05, + "loss": 0.2161, + "step": 8226 + }, + { + "epoch": 1.6650475612224245, + "grad_norm": 0.28962957859039307, + "learning_rate": 1.3570262165670289e-05, + "loss": 0.2002, + "step": 8227 + }, + { + "epoch": 1.665249949402955, + "grad_norm": 0.314058780670166, + "learning_rate": 1.3554268138656589e-05, + "loss": 0.2151, + "step": 8228 + }, + { + "epoch": 1.665452337583485, + "grad_norm": 0.28717002272605896, + "learning_rate": 1.3538282857314988e-05, + "loss": 0.1955, + "step": 8229 + }, + { + "epoch": 1.6656547257640155, + "grad_norm": 0.24509413540363312, + "learning_rate": 1.352230632326268e-05, + "loss": 0.192, + "step": 8230 + }, + { + "epoch": 1.6658571139445457, + "grad_norm": 0.2782445251941681, + "learning_rate": 1.3506338538116025e-05, + "loss": 0.1708, + "step": 8231 + }, + { + "epoch": 1.666059502125076, + "grad_norm": 0.30397579073905945, + "learning_rate": 1.349037950349047e-05, + "loss": 0.1961, + "step": 8232 + }, + { + "epoch": 1.6662618903056061, + "grad_norm": 0.2743746340274811, + "learning_rate": 1.3474429221000573e-05, + "loss": 0.1948, + "step": 8233 + }, + { + "epoch": 1.6664642784861363, + "grad_norm": 0.3238055109977722, + "learning_rate": 1.3458487692260036e-05, + "loss": 0.1831, + "step": 8234 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.268256276845932, + "learning_rate": 1.3442554918881634e-05, + "loss": 0.1774, + "step": 8235 + }, + { + "epoch": 1.666869054847197, + "grad_norm": 0.3051510155200958, + "learning_rate": 1.34266309024773e-05, + "loss": 0.2038, + "step": 8236 + }, + { + "epoch": 1.6670714430277271, + "grad_norm": 0.2659623920917511, + "learning_rate": 1.3410715644658034e-05, + "loss": 0.1772, + "step": 8237 + }, + { + "epoch": 1.6672738312082576, + "grad_norm": 0.2695719003677368, + "learning_rate": 1.3394809147033993e-05, + "loss": 0.1511, + "step": 8238 + }, + { + "epoch": 1.6674762193887878, + "grad_norm": 0.2614977955818176, + "learning_rate": 1.3378911411214435e-05, + "loss": 0.1703, + "step": 8239 + }, + { + "epoch": 1.667678607569318, + "grad_norm": 0.2625097334384918, + "learning_rate": 1.3363022438807704e-05, + "loss": 0.1536, + "step": 8240 + }, + { + "epoch": 1.6678809957498482, + "grad_norm": 0.29641667008399963, + "learning_rate": 1.3347142231421295e-05, + "loss": 0.2007, + "step": 8241 + }, + { + "epoch": 1.6680833839303784, + "grad_norm": 0.2981378436088562, + "learning_rate": 1.3331270790661799e-05, + "loss": 0.1894, + "step": 8242 + }, + { + "epoch": 1.6682857721109086, + "grad_norm": 0.30381447076797485, + "learning_rate": 1.3315408118134909e-05, + "loss": 0.1968, + "step": 8243 + }, + { + "epoch": 1.668488160291439, + "grad_norm": 0.2666470408439636, + "learning_rate": 1.3299554215445464e-05, + "loss": 0.1746, + "step": 8244 + }, + { + "epoch": 1.6686905484719692, + "grad_norm": 0.32338204979896545, + "learning_rate": 1.3283709084197381e-05, + "loss": 0.2023, + "step": 8245 + }, + { + "epoch": 1.6688929366524996, + "grad_norm": 0.31472939252853394, + "learning_rate": 1.3267872725993713e-05, + "loss": 0.2048, + "step": 8246 + }, + { + "epoch": 1.6690953248330298, + "grad_norm": 0.3297794461250305, + "learning_rate": 1.325204514243662e-05, + "loss": 0.2346, + "step": 8247 + }, + { + "epoch": 1.66929771301356, + "grad_norm": 0.3132227957248688, + "learning_rate": 1.3236226335127356e-05, + "loss": 0.2204, + "step": 8248 + }, + { + "epoch": 1.6695001011940902, + "grad_norm": 0.40292102098464966, + "learning_rate": 1.3220416305666328e-05, + "loss": 0.1882, + "step": 8249 + }, + { + "epoch": 1.6697024893746204, + "grad_norm": 0.2750104069709778, + "learning_rate": 1.3204615055652992e-05, + "loss": 0.1921, + "step": 8250 + }, + { + "epoch": 1.6697024893746204, + "eval_loss": 0.2600804567337036, + "eval_runtime": 0.7413, + "eval_samples_per_second": 6.745, + "eval_steps_per_second": 1.349, + "step": 8250 + }, + { + "epoch": 1.6699048775551508, + "grad_norm": 0.31695792078971863, + "learning_rate": 1.3188822586685966e-05, + "loss": 0.1959, + "step": 8251 + }, + { + "epoch": 1.670107265735681, + "grad_norm": 0.24991224706172943, + "learning_rate": 1.3173038900362976e-05, + "loss": 0.1433, + "step": 8252 + }, + { + "epoch": 1.6703096539162114, + "grad_norm": 0.29578897356987, + "learning_rate": 1.3157263998280845e-05, + "loss": 0.207, + "step": 8253 + }, + { + "epoch": 1.6705120420967416, + "grad_norm": 0.2668931782245636, + "learning_rate": 1.3141497882035514e-05, + "loss": 0.1855, + "step": 8254 + }, + { + "epoch": 1.6707144302772718, + "grad_norm": 0.2784649729728699, + "learning_rate": 1.3125740553222032e-05, + "loss": 0.1727, + "step": 8255 + }, + { + "epoch": 1.670916818457802, + "grad_norm": 0.5614887475967407, + "learning_rate": 1.3109992013434557e-05, + "loss": 0.2148, + "step": 8256 + }, + { + "epoch": 1.6711192066383322, + "grad_norm": 0.2848651111125946, + "learning_rate": 1.309425226426636e-05, + "loss": 0.1892, + "step": 8257 + }, + { + "epoch": 1.6713215948188624, + "grad_norm": 0.3137986958026886, + "learning_rate": 1.3078521307309832e-05, + "loss": 0.1973, + "step": 8258 + }, + { + "epoch": 1.6715239829993929, + "grad_norm": 0.25669851899147034, + "learning_rate": 1.3062799144156468e-05, + "loss": 0.2005, + "step": 8259 + }, + { + "epoch": 1.671726371179923, + "grad_norm": 0.2816522717475891, + "learning_rate": 1.304708577639685e-05, + "loss": 0.1853, + "step": 8260 + }, + { + "epoch": 1.6719287593604535, + "grad_norm": 0.270802766084671, + "learning_rate": 1.3031381205620719e-05, + "loss": 0.1676, + "step": 8261 + }, + { + "epoch": 1.6721311475409837, + "grad_norm": 0.2849435806274414, + "learning_rate": 1.301568543341688e-05, + "loss": 0.1769, + "step": 8262 + }, + { + "epoch": 1.6723335357215139, + "grad_norm": 0.2575063705444336, + "learning_rate": 1.2999998461373275e-05, + "loss": 0.1738, + "step": 8263 + }, + { + "epoch": 1.672535923902044, + "grad_norm": 0.2596307694911957, + "learning_rate": 1.2984320291076947e-05, + "loss": 0.1735, + "step": 8264 + }, + { + "epoch": 1.6727383120825743, + "grad_norm": 0.29953983426094055, + "learning_rate": 1.2968650924114045e-05, + "loss": 0.1901, + "step": 8265 + }, + { + "epoch": 1.6729407002631045, + "grad_norm": 0.29430925846099854, + "learning_rate": 1.2952990362069828e-05, + "loss": 0.1973, + "step": 8266 + }, + { + "epoch": 1.673143088443635, + "grad_norm": 0.24846704304218292, + "learning_rate": 1.2937338606528648e-05, + "loss": 0.1797, + "step": 8267 + }, + { + "epoch": 1.673345476624165, + "grad_norm": 0.30034515261650085, + "learning_rate": 1.292169565907404e-05, + "loss": 0.1986, + "step": 8268 + }, + { + "epoch": 1.6735478648046955, + "grad_norm": 0.29777172207832336, + "learning_rate": 1.290606152128856e-05, + "loss": 0.209, + "step": 8269 + }, + { + "epoch": 1.6737502529852257, + "grad_norm": 0.3124600052833557, + "learning_rate": 1.289043619475392e-05, + "loss": 0.1897, + "step": 8270 + }, + { + "epoch": 1.673952641165756, + "grad_norm": 0.3199320435523987, + "learning_rate": 1.2874819681050898e-05, + "loss": 0.1963, + "step": 8271 + }, + { + "epoch": 1.6741550293462861, + "grad_norm": 0.24260728061199188, + "learning_rate": 1.2859211981759455e-05, + "loss": 0.1804, + "step": 8272 + }, + { + "epoch": 1.6743574175268163, + "grad_norm": 0.23964375257492065, + "learning_rate": 1.2843613098458562e-05, + "loss": 0.1645, + "step": 8273 + }, + { + "epoch": 1.6745598057073465, + "grad_norm": 0.27568867802619934, + "learning_rate": 1.2828023032726378e-05, + "loss": 0.1916, + "step": 8274 + }, + { + "epoch": 1.674762193887877, + "grad_norm": 0.2523253858089447, + "learning_rate": 1.2812441786140138e-05, + "loss": 0.1683, + "step": 8275 + }, + { + "epoch": 1.6749645820684074, + "grad_norm": 0.27498844265937805, + "learning_rate": 1.2796869360276187e-05, + "loss": 0.1946, + "step": 8276 + }, + { + "epoch": 1.6751669702489376, + "grad_norm": 0.30603161454200745, + "learning_rate": 1.2781305756709993e-05, + "loss": 0.2068, + "step": 8277 + }, + { + "epoch": 1.6753693584294678, + "grad_norm": 0.2598779797554016, + "learning_rate": 1.276575097701609e-05, + "loss": 0.1516, + "step": 8278 + }, + { + "epoch": 1.675571746609998, + "grad_norm": 0.33063915371894836, + "learning_rate": 1.275020502276818e-05, + "loss": 0.2212, + "step": 8279 + }, + { + "epoch": 1.6757741347905282, + "grad_norm": 0.2730655372142792, + "learning_rate": 1.2734667895539009e-05, + "loss": 0.2018, + "step": 8280 + }, + { + "epoch": 1.6759765229710584, + "grad_norm": 0.26023030281066895, + "learning_rate": 1.2719139596900487e-05, + "loss": 0.1936, + "step": 8281 + }, + { + "epoch": 1.6761789111515888, + "grad_norm": 0.2647170424461365, + "learning_rate": 1.2703620128423588e-05, + "loss": 0.1918, + "step": 8282 + }, + { + "epoch": 1.676381299332119, + "grad_norm": 0.2505474090576172, + "learning_rate": 1.2688109491678412e-05, + "loss": 0.178, + "step": 8283 + }, + { + "epoch": 1.6765836875126494, + "grad_norm": 0.2744555175304413, + "learning_rate": 1.2672607688234172e-05, + "loss": 0.1996, + "step": 8284 + }, + { + "epoch": 1.6767860756931796, + "grad_norm": 0.28227829933166504, + "learning_rate": 1.265711471965917e-05, + "loss": 0.1828, + "step": 8285 + }, + { + "epoch": 1.6769884638737098, + "grad_norm": 0.36905500292778015, + "learning_rate": 1.2641630587520814e-05, + "loss": 0.2228, + "step": 8286 + }, + { + "epoch": 1.67719085205424, + "grad_norm": 0.259802907705307, + "learning_rate": 1.2626155293385633e-05, + "loss": 0.1658, + "step": 8287 + }, + { + "epoch": 1.6773932402347702, + "grad_norm": 0.2774128317832947, + "learning_rate": 1.2610688838819262e-05, + "loss": 0.1956, + "step": 8288 + }, + { + "epoch": 1.6775956284153004, + "grad_norm": 0.2562331557273865, + "learning_rate": 1.2595231225386429e-05, + "loss": 0.1705, + "step": 8289 + }, + { + "epoch": 1.6777980165958308, + "grad_norm": 0.29600250720977783, + "learning_rate": 1.2579782454650967e-05, + "loss": 0.1926, + "step": 8290 + }, + { + "epoch": 1.678000404776361, + "grad_norm": 0.25907132029533386, + "learning_rate": 1.2564342528175832e-05, + "loss": 0.1906, + "step": 8291 + }, + { + "epoch": 1.6782027929568915, + "grad_norm": 0.27086886763572693, + "learning_rate": 1.254891144752307e-05, + "loss": 0.1804, + "step": 8292 + }, + { + "epoch": 1.6784051811374217, + "grad_norm": 0.28162238001823425, + "learning_rate": 1.2533489214253836e-05, + "loss": 0.1815, + "step": 8293 + }, + { + "epoch": 1.6786075693179519, + "grad_norm": 0.3076886236667633, + "learning_rate": 1.2518075829928399e-05, + "loss": 0.2031, + "step": 8294 + }, + { + "epoch": 1.678809957498482, + "grad_norm": 0.2625339925289154, + "learning_rate": 1.2502671296106095e-05, + "loss": 0.1572, + "step": 8295 + }, + { + "epoch": 1.6790123456790123, + "grad_norm": 0.31220224499702454, + "learning_rate": 1.2487275614345405e-05, + "loss": 0.1773, + "step": 8296 + }, + { + "epoch": 1.6792147338595425, + "grad_norm": 0.2704111337661743, + "learning_rate": 1.2471888786203922e-05, + "loss": 0.1797, + "step": 8297 + }, + { + "epoch": 1.6794171220400729, + "grad_norm": 0.23155251145362854, + "learning_rate": 1.2456510813238299e-05, + "loss": 0.1614, + "step": 8298 + }, + { + "epoch": 1.679619510220603, + "grad_norm": 0.2626052796840668, + "learning_rate": 1.2441141697004333e-05, + "loss": 0.1935, + "step": 8299 + }, + { + "epoch": 1.6798218984011335, + "grad_norm": 0.2648791968822479, + "learning_rate": 1.2425781439056894e-05, + "loss": 0.1787, + "step": 8300 + }, + { + "epoch": 1.6798218984011335, + "eval_loss": 0.2585418224334717, + "eval_runtime": 0.7402, + "eval_samples_per_second": 6.755, + "eval_steps_per_second": 1.351, + "step": 8300 + }, + { + "epoch": 1.6800242865816637, + "grad_norm": 0.2991698384284973, + "learning_rate": 1.2410430040949994e-05, + "loss": 0.1971, + "step": 8301 + }, + { + "epoch": 1.680226674762194, + "grad_norm": 0.2974299490451813, + "learning_rate": 1.2395087504236713e-05, + "loss": 0.2, + "step": 8302 + }, + { + "epoch": 1.680429062942724, + "grad_norm": 0.2599434554576874, + "learning_rate": 1.2379753830469255e-05, + "loss": 0.19, + "step": 8303 + }, + { + "epoch": 1.6806314511232543, + "grad_norm": 0.25205034017562866, + "learning_rate": 1.236442902119891e-05, + "loss": 0.1584, + "step": 8304 + }, + { + "epoch": 1.6808338393037845, + "grad_norm": 0.28435245156288147, + "learning_rate": 1.2349113077976094e-05, + "loss": 0.166, + "step": 8305 + }, + { + "epoch": 1.681036227484315, + "grad_norm": 0.30040034651756287, + "learning_rate": 1.2333806002350301e-05, + "loss": 0.2013, + "step": 8306 + }, + { + "epoch": 1.6812386156648453, + "grad_norm": 0.2764068841934204, + "learning_rate": 1.2318507795870138e-05, + "loss": 0.2164, + "step": 8307 + }, + { + "epoch": 1.6814410038453755, + "grad_norm": 0.2434784173965454, + "learning_rate": 1.2303218460083332e-05, + "loss": 0.1585, + "step": 8308 + }, + { + "epoch": 1.6816433920259057, + "grad_norm": 0.3237009048461914, + "learning_rate": 1.2287937996536691e-05, + "loss": 0.2349, + "step": 8309 + }, + { + "epoch": 1.681845780206436, + "grad_norm": 0.2595462203025818, + "learning_rate": 1.2272666406776135e-05, + "loss": 0.1746, + "step": 8310 + }, + { + "epoch": 1.6820481683869661, + "grad_norm": 0.3123394846916199, + "learning_rate": 1.2257403692346681e-05, + "loss": 0.1873, + "step": 8311 + }, + { + "epoch": 1.6822505565674963, + "grad_norm": 0.26531022787094116, + "learning_rate": 1.224214985479244e-05, + "loss": 0.1724, + "step": 8312 + }, + { + "epoch": 1.6824529447480268, + "grad_norm": 0.2745008170604706, + "learning_rate": 1.2226904895656644e-05, + "loss": 0.1652, + "step": 8313 + }, + { + "epoch": 1.682655332928557, + "grad_norm": 0.29717713594436646, + "learning_rate": 1.2211668816481625e-05, + "loss": 0.2099, + "step": 8314 + }, + { + "epoch": 1.6828577211090874, + "grad_norm": 0.3275054097175598, + "learning_rate": 1.2196441618808796e-05, + "loss": 0.2069, + "step": 8315 + }, + { + "epoch": 1.6830601092896176, + "grad_norm": 0.3260860741138458, + "learning_rate": 1.2181223304178702e-05, + "loss": 0.1786, + "step": 8316 + }, + { + "epoch": 1.6832624974701478, + "grad_norm": 0.615013599395752, + "learning_rate": 1.2166013874130955e-05, + "loss": 0.1554, + "step": 8317 + }, + { + "epoch": 1.683464885650678, + "grad_norm": 0.25483933091163635, + "learning_rate": 1.2150813330204291e-05, + "loss": 0.1671, + "step": 8318 + }, + { + "epoch": 1.6836672738312082, + "grad_norm": 0.25844165682792664, + "learning_rate": 1.2135621673936549e-05, + "loss": 0.171, + "step": 8319 + }, + { + "epoch": 1.6838696620117384, + "grad_norm": 0.2545812726020813, + "learning_rate": 1.212043890686465e-05, + "loss": 0.1661, + "step": 8320 + }, + { + "epoch": 1.6840720501922688, + "grad_norm": 0.3012816309928894, + "learning_rate": 1.2105265030524626e-05, + "loss": 0.2202, + "step": 8321 + }, + { + "epoch": 1.684274438372799, + "grad_norm": 0.274930864572525, + "learning_rate": 1.2090100046451635e-05, + "loss": 0.1861, + "step": 8322 + }, + { + "epoch": 1.6844768265533294, + "grad_norm": 0.27070969343185425, + "learning_rate": 1.2074943956179884e-05, + "loss": 0.1767, + "step": 8323 + }, + { + "epoch": 1.6846792147338596, + "grad_norm": 0.35230904817581177, + "learning_rate": 1.2059796761242714e-05, + "loss": 0.2147, + "step": 8324 + }, + { + "epoch": 1.6848816029143898, + "grad_norm": 0.2666962742805481, + "learning_rate": 1.2044658463172564e-05, + "loss": 0.1985, + "step": 8325 + }, + { + "epoch": 1.68508399109492, + "grad_norm": 0.262483149766922, + "learning_rate": 1.2029529063500966e-05, + "loss": 0.1689, + "step": 8326 + }, + { + "epoch": 1.6852863792754502, + "grad_norm": 0.31115856766700745, + "learning_rate": 1.201440856375855e-05, + "loss": 0.199, + "step": 8327 + }, + { + "epoch": 1.6854887674559804, + "grad_norm": 0.2889840304851532, + "learning_rate": 1.199929696547506e-05, + "loss": 0.2209, + "step": 8328 + }, + { + "epoch": 1.6856911556365108, + "grad_norm": 0.2528282403945923, + "learning_rate": 1.1984194270179317e-05, + "loss": 0.1417, + "step": 8329 + }, + { + "epoch": 1.685893543817041, + "grad_norm": 0.28934213519096375, + "learning_rate": 1.1969100479399254e-05, + "loss": 0.1745, + "step": 8330 + }, + { + "epoch": 1.6860959319975715, + "grad_norm": 0.2569589614868164, + "learning_rate": 1.1954015594661915e-05, + "loss": 0.1518, + "step": 8331 + }, + { + "epoch": 1.6862983201781017, + "grad_norm": 0.2754018008708954, + "learning_rate": 1.1938939617493427e-05, + "loss": 0.1938, + "step": 8332 + }, + { + "epoch": 1.6865007083586319, + "grad_norm": 0.23252278566360474, + "learning_rate": 1.1923872549419002e-05, + "loss": 0.1469, + "step": 8333 + }, + { + "epoch": 1.686703096539162, + "grad_norm": 0.3326863944530487, + "learning_rate": 1.1908814391962985e-05, + "loss": 0.23, + "step": 8334 + }, + { + "epoch": 1.6869054847196923, + "grad_norm": 0.26767897605895996, + "learning_rate": 1.1893765146648795e-05, + "loss": 0.1816, + "step": 8335 + }, + { + "epoch": 1.6871078729002225, + "grad_norm": 0.2568624019622803, + "learning_rate": 1.1878724814998965e-05, + "loss": 0.1793, + "step": 8336 + }, + { + "epoch": 1.6873102610807529, + "grad_norm": 0.2331872135400772, + "learning_rate": 1.1863693398535114e-05, + "loss": 0.1522, + "step": 8337 + }, + { + "epoch": 1.6875126492612833, + "grad_norm": 0.26635870337486267, + "learning_rate": 1.184867089877797e-05, + "loss": 0.157, + "step": 8338 + }, + { + "epoch": 1.6877150374418135, + "grad_norm": 0.2556881606578827, + "learning_rate": 1.1833657317247338e-05, + "loss": 0.1765, + "step": 8339 + }, + { + "epoch": 1.6879174256223437, + "grad_norm": 0.2685004770755768, + "learning_rate": 1.1818652655462126e-05, + "loss": 0.1696, + "step": 8340 + }, + { + "epoch": 1.688119813802874, + "grad_norm": 0.2972007989883423, + "learning_rate": 1.1803656914940364e-05, + "loss": 0.1859, + "step": 8341 + }, + { + "epoch": 1.688322201983404, + "grad_norm": 0.25820642709732056, + "learning_rate": 1.1788670097199173e-05, + "loss": 0.1738, + "step": 8342 + }, + { + "epoch": 1.6885245901639343, + "grad_norm": 0.28410157561302185, + "learning_rate": 1.1773692203754738e-05, + "loss": 0.204, + "step": 8343 + }, + { + "epoch": 1.6887269783444647, + "grad_norm": 0.29454171657562256, + "learning_rate": 1.1758723236122382e-05, + "loss": 0.1929, + "step": 8344 + }, + { + "epoch": 1.688929366524995, + "grad_norm": 0.2966192066669464, + "learning_rate": 1.1743763195816504e-05, + "loss": 0.195, + "step": 8345 + }, + { + "epoch": 1.6891317547055253, + "grad_norm": 0.28926682472229004, + "learning_rate": 1.1728812084350605e-05, + "loss": 0.1836, + "step": 8346 + }, + { + "epoch": 1.6893341428860555, + "grad_norm": 0.28553107380867004, + "learning_rate": 1.1713869903237273e-05, + "loss": 0.18, + "step": 8347 + }, + { + "epoch": 1.6895365310665857, + "grad_norm": 0.3032236397266388, + "learning_rate": 1.1698936653988214e-05, + "loss": 0.2032, + "step": 8348 + }, + { + "epoch": 1.689738919247116, + "grad_norm": 0.3131108582019806, + "learning_rate": 1.1684012338114214e-05, + "loss": 0.1725, + "step": 8349 + }, + { + "epoch": 1.6899413074276461, + "grad_norm": 0.30871301889419556, + "learning_rate": 1.1669096957125159e-05, + "loss": 0.2258, + "step": 8350 + }, + { + "epoch": 1.6899413074276461, + "eval_loss": 0.25871673226356506, + "eval_runtime": 0.739, + "eval_samples_per_second": 6.766, + "eval_steps_per_second": 1.353, + "step": 8350 + }, + { + "epoch": 1.6901436956081763, + "grad_norm": 0.2908981442451477, + "learning_rate": 1.1654190512530016e-05, + "loss": 0.1896, + "step": 8351 + }, + { + "epoch": 1.6903460837887068, + "grad_norm": 0.3076784014701843, + "learning_rate": 1.1639293005836894e-05, + "loss": 0.2111, + "step": 8352 + }, + { + "epoch": 1.690548471969237, + "grad_norm": 0.2984131872653961, + "learning_rate": 1.1624404438552927e-05, + "loss": 0.2285, + "step": 8353 + }, + { + "epoch": 1.6907508601497674, + "grad_norm": 0.28389042615890503, + "learning_rate": 1.1609524812184413e-05, + "loss": 0.1759, + "step": 8354 + }, + { + "epoch": 1.6909532483302976, + "grad_norm": 0.29734140634536743, + "learning_rate": 1.1594654128236714e-05, + "loss": 0.2161, + "step": 8355 + }, + { + "epoch": 1.6911556365108278, + "grad_norm": 0.2875990569591522, + "learning_rate": 1.1579792388214272e-05, + "loss": 0.1972, + "step": 8356 + }, + { + "epoch": 1.691358024691358, + "grad_norm": 0.27701500058174133, + "learning_rate": 1.156493959362066e-05, + "loss": 0.1898, + "step": 8357 + }, + { + "epoch": 1.6915604128718882, + "grad_norm": 0.27099698781967163, + "learning_rate": 1.1550095745958523e-05, + "loss": 0.2021, + "step": 8358 + }, + { + "epoch": 1.6917628010524184, + "grad_norm": 0.2977381944656372, + "learning_rate": 1.1535260846729601e-05, + "loss": 0.1998, + "step": 8359 + }, + { + "epoch": 1.6919651892329488, + "grad_norm": 0.2615094482898712, + "learning_rate": 1.152043489743474e-05, + "loss": 0.1895, + "step": 8360 + }, + { + "epoch": 1.692167577413479, + "grad_norm": 0.27318063378334045, + "learning_rate": 1.1505617899573885e-05, + "loss": 0.1829, + "step": 8361 + }, + { + "epoch": 1.6923699655940094, + "grad_norm": 0.25262323021888733, + "learning_rate": 1.1490809854646011e-05, + "loss": 0.161, + "step": 8362 + }, + { + "epoch": 1.6925723537745396, + "grad_norm": 0.3081194758415222, + "learning_rate": 1.1476010764149304e-05, + "loss": 0.1924, + "step": 8363 + }, + { + "epoch": 1.6927747419550698, + "grad_norm": 0.29487472772598267, + "learning_rate": 1.146122062958095e-05, + "loss": 0.2032, + "step": 8364 + }, + { + "epoch": 1.6929771301356, + "grad_norm": 0.3395232558250427, + "learning_rate": 1.1446439452437275e-05, + "loss": 0.2292, + "step": 8365 + }, + { + "epoch": 1.6931795183161302, + "grad_norm": 0.3127206563949585, + "learning_rate": 1.143166723421366e-05, + "loss": 0.1867, + "step": 8366 + }, + { + "epoch": 1.6933819064966606, + "grad_norm": 0.30758431553840637, + "learning_rate": 1.1416903976404625e-05, + "loss": 0.2042, + "step": 8367 + }, + { + "epoch": 1.6935842946771908, + "grad_norm": 0.3078838884830475, + "learning_rate": 1.140214968050376e-05, + "loss": 0.1806, + "step": 8368 + }, + { + "epoch": 1.6937866828577213, + "grad_norm": 0.27097728848457336, + "learning_rate": 1.1387404348003739e-05, + "loss": 0.1924, + "step": 8369 + }, + { + "epoch": 1.6939890710382515, + "grad_norm": 0.2731929123401642, + "learning_rate": 1.1372667980396345e-05, + "loss": 0.1853, + "step": 8370 + }, + { + "epoch": 1.6941914592187817, + "grad_norm": 0.31753164529800415, + "learning_rate": 1.1357940579172443e-05, + "loss": 0.1928, + "step": 8371 + }, + { + "epoch": 1.6943938473993119, + "grad_norm": 0.26757803559303284, + "learning_rate": 1.1343222145822008e-05, + "loss": 0.1857, + "step": 8372 + }, + { + "epoch": 1.694596235579842, + "grad_norm": 0.2824253439903259, + "learning_rate": 1.1328512681834092e-05, + "loss": 0.2152, + "step": 8373 + }, + { + "epoch": 1.6947986237603723, + "grad_norm": 0.27644097805023193, + "learning_rate": 1.1313812188696838e-05, + "loss": 0.2132, + "step": 8374 + }, + { + "epoch": 1.6950010119409027, + "grad_norm": 0.30260854959487915, + "learning_rate": 1.12991206678975e-05, + "loss": 0.207, + "step": 8375 + }, + { + "epoch": 1.695203400121433, + "grad_norm": 0.2823565900325775, + "learning_rate": 1.1284438120922402e-05, + "loss": 0.1841, + "step": 8376 + }, + { + "epoch": 1.6954057883019633, + "grad_norm": 0.2845193147659302, + "learning_rate": 1.1269764549256978e-05, + "loss": 0.1704, + "step": 8377 + }, + { + "epoch": 1.6956081764824935, + "grad_norm": 0.24924218654632568, + "learning_rate": 1.1255099954385727e-05, + "loss": 0.1994, + "step": 8378 + }, + { + "epoch": 1.6958105646630237, + "grad_norm": 0.25757744908332825, + "learning_rate": 1.1240444337792288e-05, + "loss": 0.1841, + "step": 8379 + }, + { + "epoch": 1.696012952843554, + "grad_norm": 0.3529128432273865, + "learning_rate": 1.122579770095934e-05, + "loss": 0.2325, + "step": 8380 + }, + { + "epoch": 1.6962153410240841, + "grad_norm": 0.26317161321640015, + "learning_rate": 1.1211160045368685e-05, + "loss": 0.1558, + "step": 8381 + }, + { + "epoch": 1.6964177292046143, + "grad_norm": 0.28297486901283264, + "learning_rate": 1.1196531372501207e-05, + "loss": 0.1951, + "step": 8382 + }, + { + "epoch": 1.6966201173851447, + "grad_norm": 0.32655608654022217, + "learning_rate": 1.1181911683836899e-05, + "loss": 0.1911, + "step": 8383 + }, + { + "epoch": 1.696822505565675, + "grad_norm": 0.27955129742622375, + "learning_rate": 1.1167300980854789e-05, + "loss": 0.2055, + "step": 8384 + }, + { + "epoch": 1.6970248937462054, + "grad_norm": 0.3174886405467987, + "learning_rate": 1.1152699265033062e-05, + "loss": 0.1973, + "step": 8385 + }, + { + "epoch": 1.6972272819267356, + "grad_norm": 0.3020794093608856, + "learning_rate": 1.1138106537848948e-05, + "loss": 0.2151, + "step": 8386 + }, + { + "epoch": 1.6974296701072658, + "grad_norm": 0.27277594804763794, + "learning_rate": 1.1123522800778807e-05, + "loss": 0.1897, + "step": 8387 + }, + { + "epoch": 1.697632058287796, + "grad_norm": 0.3051762282848358, + "learning_rate": 1.1108948055298052e-05, + "loss": 0.1935, + "step": 8388 + }, + { + "epoch": 1.6978344464683262, + "grad_norm": 0.2831827998161316, + "learning_rate": 1.1094382302881212e-05, + "loss": 0.1899, + "step": 8389 + }, + { + "epoch": 1.6980368346488564, + "grad_norm": 0.3473316729068756, + "learning_rate": 1.1079825545001888e-05, + "loss": 0.2047, + "step": 8390 + }, + { + "epoch": 1.6982392228293868, + "grad_norm": 0.2632426917552948, + "learning_rate": 1.10652777831328e-05, + "loss": 0.1827, + "step": 8391 + }, + { + "epoch": 1.698441611009917, + "grad_norm": 0.3207499384880066, + "learning_rate": 1.1050739018745716e-05, + "loss": 0.2424, + "step": 8392 + }, + { + "epoch": 1.6986439991904474, + "grad_norm": 0.2827729880809784, + "learning_rate": 1.1036209253311524e-05, + "loss": 0.2034, + "step": 8393 + }, + { + "epoch": 1.6988463873709776, + "grad_norm": 0.3075752854347229, + "learning_rate": 1.1021688488300197e-05, + "loss": 0.1949, + "step": 8394 + }, + { + "epoch": 1.6990487755515078, + "grad_norm": 0.2706870138645172, + "learning_rate": 1.100717672518078e-05, + "loss": 0.1794, + "step": 8395 + }, + { + "epoch": 1.699251163732038, + "grad_norm": 0.2784644663333893, + "learning_rate": 1.0992673965421441e-05, + "loss": 0.1848, + "step": 8396 + }, + { + "epoch": 1.6994535519125682, + "grad_norm": 0.27086934447288513, + "learning_rate": 1.0978180210489408e-05, + "loss": 0.1771, + "step": 8397 + }, + { + "epoch": 1.6996559400930986, + "grad_norm": 0.2761807441711426, + "learning_rate": 1.0963695461850997e-05, + "loss": 0.1799, + "step": 8398 + }, + { + "epoch": 1.6998583282736288, + "grad_norm": 0.2793600559234619, + "learning_rate": 1.0949219720971638e-05, + "loss": 0.1586, + "step": 8399 + }, + { + "epoch": 1.7000607164541592, + "grad_norm": 0.29326415061950684, + "learning_rate": 1.0934752989315834e-05, + "loss": 0.1972, + "step": 8400 + }, + { + "epoch": 1.7000607164541592, + "eval_loss": 0.25876384973526, + "eval_runtime": 0.7391, + "eval_samples_per_second": 6.765, + "eval_steps_per_second": 1.353, + "step": 8400 + }, + { + "epoch": 1.7002631046346894, + "grad_norm": 0.2465113401412964, + "learning_rate": 1.0920295268347159e-05, + "loss": 0.1682, + "step": 8401 + }, + { + "epoch": 1.7004654928152196, + "grad_norm": 0.24070094525814056, + "learning_rate": 1.090584655952831e-05, + "loss": 0.1522, + "step": 8402 + }, + { + "epoch": 1.7006678809957498, + "grad_norm": 0.2528705298900604, + "learning_rate": 1.0891406864321053e-05, + "loss": 0.1996, + "step": 8403 + }, + { + "epoch": 1.70087026917628, + "grad_norm": 0.31882789731025696, + "learning_rate": 1.087697618418625e-05, + "loss": 0.2033, + "step": 8404 + }, + { + "epoch": 1.7010726573568102, + "grad_norm": 0.2811528444290161, + "learning_rate": 1.0862554520583857e-05, + "loss": 0.199, + "step": 8405 + }, + { + "epoch": 1.7012750455373407, + "grad_norm": 0.27912142872810364, + "learning_rate": 1.0848141874972862e-05, + "loss": 0.1795, + "step": 8406 + }, + { + "epoch": 1.7014774337178709, + "grad_norm": 0.2798633873462677, + "learning_rate": 1.083373824881142e-05, + "loss": 0.2137, + "step": 8407 + }, + { + "epoch": 1.7016798218984013, + "grad_norm": 0.24729759991168976, + "learning_rate": 1.0819343643556723e-05, + "loss": 0.1795, + "step": 8408 + }, + { + "epoch": 1.7018822100789315, + "grad_norm": 0.2917996942996979, + "learning_rate": 1.0804958060665082e-05, + "loss": 0.199, + "step": 8409 + }, + { + "epoch": 1.7020845982594617, + "grad_norm": 0.291407972574234, + "learning_rate": 1.0790581501591867e-05, + "loss": 0.1901, + "step": 8410 + }, + { + "epoch": 1.7022869864399919, + "grad_norm": 0.26372459530830383, + "learning_rate": 1.0776213967791549e-05, + "loss": 0.1769, + "step": 8411 + }, + { + "epoch": 1.702489374620522, + "grad_norm": 0.25174322724342346, + "learning_rate": 1.0761855460717696e-05, + "loss": 0.1688, + "step": 8412 + }, + { + "epoch": 1.7026917628010523, + "grad_norm": 0.27877235412597656, + "learning_rate": 1.0747505981822937e-05, + "loss": 0.1756, + "step": 8413 + }, + { + "epoch": 1.7028941509815827, + "grad_norm": 0.2647832930088043, + "learning_rate": 1.0733165532558998e-05, + "loss": 0.184, + "step": 8414 + }, + { + "epoch": 1.703096539162113, + "grad_norm": 0.26928800344467163, + "learning_rate": 1.0718834114376718e-05, + "loss": 0.1955, + "step": 8415 + }, + { + "epoch": 1.7032989273426433, + "grad_norm": 0.2908056378364563, + "learning_rate": 1.0704511728725986e-05, + "loss": 0.2094, + "step": 8416 + }, + { + "epoch": 1.7035013155231735, + "grad_norm": 0.2552296817302704, + "learning_rate": 1.0690198377055782e-05, + "loss": 0.1833, + "step": 8417 + }, + { + "epoch": 1.7037037037037037, + "grad_norm": 0.27076640725135803, + "learning_rate": 1.0675894060814185e-05, + "loss": 0.1714, + "step": 8418 + }, + { + "epoch": 1.703906091884234, + "grad_norm": 0.3016811013221741, + "learning_rate": 1.0661598781448368e-05, + "loss": 0.1685, + "step": 8419 + }, + { + "epoch": 1.7041084800647641, + "grad_norm": 0.33398357033729553, + "learning_rate": 1.0647312540404552e-05, + "loss": 0.2415, + "step": 8420 + }, + { + "epoch": 1.7043108682452943, + "grad_norm": 0.4259779453277588, + "learning_rate": 1.0633035339128106e-05, + "loss": 0.213, + "step": 8421 + }, + { + "epoch": 1.7045132564258247, + "grad_norm": 0.272946298122406, + "learning_rate": 1.0618767179063416e-05, + "loss": 0.201, + "step": 8422 + }, + { + "epoch": 1.704715644606355, + "grad_norm": 0.2644243538379669, + "learning_rate": 1.0604508061654005e-05, + "loss": 0.1957, + "step": 8423 + }, + { + "epoch": 1.7049180327868854, + "grad_norm": 0.2648584842681885, + "learning_rate": 1.0590257988342456e-05, + "loss": 0.1774, + "step": 8424 + }, + { + "epoch": 1.7051204209674156, + "grad_norm": 0.30881166458129883, + "learning_rate": 1.0576016960570433e-05, + "loss": 0.1931, + "step": 8425 + }, + { + "epoch": 1.7053228091479458, + "grad_norm": 0.22526279091835022, + "learning_rate": 1.05617849797787e-05, + "loss": 0.1567, + "step": 8426 + }, + { + "epoch": 1.705525197328476, + "grad_norm": 0.31192898750305176, + "learning_rate": 1.054756204740711e-05, + "loss": 0.2126, + "step": 8427 + }, + { + "epoch": 1.7057275855090062, + "grad_norm": 0.2907617688179016, + "learning_rate": 1.0533348164894575e-05, + "loss": 0.2089, + "step": 8428 + }, + { + "epoch": 1.7059299736895366, + "grad_norm": 0.31975242495536804, + "learning_rate": 1.0519143333679094e-05, + "loss": 0.2075, + "step": 8429 + }, + { + "epoch": 1.7061323618700668, + "grad_norm": 0.2925901412963867, + "learning_rate": 1.0504947555197786e-05, + "loss": 0.1919, + "step": 8430 + }, + { + "epoch": 1.7063347500505972, + "grad_norm": 0.2986370623111725, + "learning_rate": 1.0490760830886826e-05, + "loss": 0.1932, + "step": 8431 + }, + { + "epoch": 1.7065371382311274, + "grad_norm": 0.31750813126564026, + "learning_rate": 1.0476583162181464e-05, + "loss": 0.1895, + "step": 8432 + }, + { + "epoch": 1.7067395264116576, + "grad_norm": 0.3126259446144104, + "learning_rate": 1.0462414550516064e-05, + "loss": 0.2198, + "step": 8433 + }, + { + "epoch": 1.7069419145921878, + "grad_norm": 0.3027331829071045, + "learning_rate": 1.0448254997324058e-05, + "loss": 0.2191, + "step": 8434 + }, + { + "epoch": 1.707144302772718, + "grad_norm": 0.256282240152359, + "learning_rate": 1.0434104504037956e-05, + "loss": 0.1846, + "step": 8435 + }, + { + "epoch": 1.7073466909532482, + "grad_norm": 0.2230484038591385, + "learning_rate": 1.0419963072089355e-05, + "loss": 0.167, + "step": 8436 + }, + { + "epoch": 1.7075490791337786, + "grad_norm": 0.2401764690876007, + "learning_rate": 1.0405830702908936e-05, + "loss": 0.1602, + "step": 8437 + }, + { + "epoch": 1.7077514673143088, + "grad_norm": 0.2753002941608429, + "learning_rate": 1.0391707397926465e-05, + "loss": 0.1878, + "step": 8438 + }, + { + "epoch": 1.7079538554948392, + "grad_norm": 0.31785663962364197, + "learning_rate": 1.0377593158570786e-05, + "loss": 0.2253, + "step": 8439 + }, + { + "epoch": 1.7081562436753694, + "grad_norm": 0.26879486441612244, + "learning_rate": 1.0363487986269837e-05, + "loss": 0.1646, + "step": 8440 + }, + { + "epoch": 1.7083586318558996, + "grad_norm": 0.3231867253780365, + "learning_rate": 1.0349391882450632e-05, + "loss": 0.1762, + "step": 8441 + }, + { + "epoch": 1.7085610200364298, + "grad_norm": 0.306441068649292, + "learning_rate": 1.0335304848539262e-05, + "loss": 0.199, + "step": 8442 + }, + { + "epoch": 1.70876340821696, + "grad_norm": 0.325333833694458, + "learning_rate": 1.0321226885960899e-05, + "loss": 0.2153, + "step": 8443 + }, + { + "epoch": 1.7089657963974902, + "grad_norm": 0.2622414529323578, + "learning_rate": 1.0307157996139815e-05, + "loss": 0.1683, + "step": 8444 + }, + { + "epoch": 1.7091681845780207, + "grad_norm": 0.2952292263507843, + "learning_rate": 1.0293098180499361e-05, + "loss": 0.183, + "step": 8445 + }, + { + "epoch": 1.7093705727585509, + "grad_norm": 0.2727248966693878, + "learning_rate": 1.0279047440461931e-05, + "loss": 0.2091, + "step": 8446 + }, + { + "epoch": 1.7095729609390813, + "grad_norm": 0.302756667137146, + "learning_rate": 1.0265005777449066e-05, + "loss": 0.2054, + "step": 8447 + }, + { + "epoch": 1.7097753491196115, + "grad_norm": 0.35711240768432617, + "learning_rate": 1.025097319288133e-05, + "loss": 0.2223, + "step": 8448 + }, + { + "epoch": 1.7099777373001417, + "grad_norm": 0.2822362780570984, + "learning_rate": 1.0236949688178399e-05, + "loss": 0.1611, + "step": 8449 + }, + { + "epoch": 1.7101801254806719, + "grad_norm": 0.2875375747680664, + "learning_rate": 1.0222935264759037e-05, + "loss": 0.2048, + "step": 8450 + }, + { + "epoch": 1.7101801254806719, + "eval_loss": 0.2587939500808716, + "eval_runtime": 0.7393, + "eval_samples_per_second": 6.764, + "eval_steps_per_second": 1.353, + "step": 8450 + }, + { + "epoch": 1.710382513661202, + "grad_norm": 0.29503369331359863, + "learning_rate": 1.0208929924041055e-05, + "loss": 0.2009, + "step": 8451 + }, + { + "epoch": 1.7105849018417323, + "grad_norm": 0.2740275263786316, + "learning_rate": 1.0194933667441386e-05, + "loss": 0.1941, + "step": 8452 + }, + { + "epoch": 1.7107872900222627, + "grad_norm": 0.2855680584907532, + "learning_rate": 1.0180946496375998e-05, + "loss": 0.2248, + "step": 8453 + }, + { + "epoch": 1.710989678202793, + "grad_norm": 0.28390875458717346, + "learning_rate": 1.0166968412259992e-05, + "loss": 0.1819, + "step": 8454 + }, + { + "epoch": 1.7111920663833233, + "grad_norm": 0.2848486006259918, + "learning_rate": 1.0152999416507513e-05, + "loss": 0.1901, + "step": 8455 + }, + { + "epoch": 1.7113944545638535, + "grad_norm": 0.2505626082420349, + "learning_rate": 1.01390395105318e-05, + "loss": 0.1658, + "step": 8456 + }, + { + "epoch": 1.7115968427443837, + "grad_norm": 0.2542025148868561, + "learning_rate": 1.0125088695745166e-05, + "loss": 0.1931, + "step": 8457 + }, + { + "epoch": 1.711799230924914, + "grad_norm": 0.2455524504184723, + "learning_rate": 1.0111146973559015e-05, + "loss": 0.198, + "step": 8458 + }, + { + "epoch": 1.7120016191054441, + "grad_norm": 0.2576819360256195, + "learning_rate": 1.009721434538381e-05, + "loss": 0.1706, + "step": 8459 + }, + { + "epoch": 1.7122040072859745, + "grad_norm": 0.26860541105270386, + "learning_rate": 1.0083290812629132e-05, + "loss": 0.2004, + "step": 8460 + }, + { + "epoch": 1.7124063954665047, + "grad_norm": 0.25479787588119507, + "learning_rate": 1.0069376376703598e-05, + "loss": 0.1626, + "step": 8461 + }, + { + "epoch": 1.7126087836470352, + "grad_norm": 0.3089302182197571, + "learning_rate": 1.005547103901493e-05, + "loss": 0.2, + "step": 8462 + }, + { + "epoch": 1.7128111718275654, + "grad_norm": 0.3235393166542053, + "learning_rate": 1.0041574800969921e-05, + "loss": 0.1868, + "step": 8463 + }, + { + "epoch": 1.7130135600080956, + "grad_norm": 0.2871195673942566, + "learning_rate": 1.0027687663974462e-05, + "loss": 0.1798, + "step": 8464 + }, + { + "epoch": 1.7132159481886258, + "grad_norm": 0.33283573389053345, + "learning_rate": 1.001380962943349e-05, + "loss": 0.223, + "step": 8465 + }, + { + "epoch": 1.713418336369156, + "grad_norm": 0.27454549074172974, + "learning_rate": 9.999940698751043e-06, + "loss": 0.1968, + "step": 8466 + }, + { + "epoch": 1.7136207245496862, + "grad_norm": 0.2364848405122757, + "learning_rate": 9.98608087333024e-06, + "loss": 0.1666, + "step": 8467 + }, + { + "epoch": 1.7138231127302166, + "grad_norm": 0.3056480586528778, + "learning_rate": 9.97223015457327e-06, + "loss": 0.2041, + "step": 8468 + }, + { + "epoch": 1.7140255009107468, + "grad_norm": 0.3281700909137726, + "learning_rate": 9.958388543881392e-06, + "loss": 0.1717, + "step": 8469 + }, + { + "epoch": 1.7142278890912772, + "grad_norm": 0.28150680661201477, + "learning_rate": 9.944556042654973e-06, + "loss": 0.199, + "step": 8470 + }, + { + "epoch": 1.7144302772718074, + "grad_norm": 0.2600691020488739, + "learning_rate": 9.93073265229344e-06, + "loss": 0.186, + "step": 8471 + }, + { + "epoch": 1.7146326654523376, + "grad_norm": 0.2786303758621216, + "learning_rate": 9.916918374195282e-06, + "loss": 0.1903, + "step": 8472 + }, + { + "epoch": 1.7148350536328678, + "grad_norm": 0.24803856015205383, + "learning_rate": 9.903113209758096e-06, + "loss": 0.1542, + "step": 8473 + }, + { + "epoch": 1.715037441813398, + "grad_norm": 0.3093421459197998, + "learning_rate": 9.889317160378531e-06, + "loss": 0.2187, + "step": 8474 + }, + { + "epoch": 1.7152398299939282, + "grad_norm": 0.27181512117385864, + "learning_rate": 9.875530227452345e-06, + "loss": 0.2113, + "step": 8475 + }, + { + "epoch": 1.7154422181744586, + "grad_norm": 0.3223206102848053, + "learning_rate": 9.861752412374336e-06, + "loss": 0.2015, + "step": 8476 + }, + { + "epoch": 1.7156446063549888, + "grad_norm": 0.24381880462169647, + "learning_rate": 9.847983716538423e-06, + "loss": 0.1781, + "step": 8477 + }, + { + "epoch": 1.7158469945355193, + "grad_norm": 0.2983056902885437, + "learning_rate": 9.834224141337544e-06, + "loss": 0.178, + "step": 8478 + }, + { + "epoch": 1.7160493827160495, + "grad_norm": 0.3364439308643341, + "learning_rate": 9.820473688163778e-06, + "loss": 0.1767, + "step": 8479 + }, + { + "epoch": 1.7162517708965797, + "grad_norm": 0.26381823420524597, + "learning_rate": 9.806732358408244e-06, + "loss": 0.1845, + "step": 8480 + }, + { + "epoch": 1.7164541590771099, + "grad_norm": 0.3094126880168915, + "learning_rate": 9.793000153461141e-06, + "loss": 0.2032, + "step": 8481 + }, + { + "epoch": 1.71665654725764, + "grad_norm": 0.30394935607910156, + "learning_rate": 9.779277074711745e-06, + "loss": 0.1906, + "step": 8482 + }, + { + "epoch": 1.7168589354381703, + "grad_norm": 0.289957731962204, + "learning_rate": 9.765563123548426e-06, + "loss": 0.1838, + "step": 8483 + }, + { + "epoch": 1.7170613236187007, + "grad_norm": 0.2532503008842468, + "learning_rate": 9.751858301358607e-06, + "loss": 0.1635, + "step": 8484 + }, + { + "epoch": 1.7172637117992309, + "grad_norm": 0.26653948426246643, + "learning_rate": 9.73816260952881e-06, + "loss": 0.1798, + "step": 8485 + }, + { + "epoch": 1.7174660999797613, + "grad_norm": 0.30379506945610046, + "learning_rate": 9.724476049444609e-06, + "loss": 0.1908, + "step": 8486 + }, + { + "epoch": 1.7176684881602915, + "grad_norm": 0.4283871054649353, + "learning_rate": 9.710798622490669e-06, + "loss": 0.1931, + "step": 8487 + }, + { + "epoch": 1.7178708763408217, + "grad_norm": 0.24685965478420258, + "learning_rate": 9.69713033005073e-06, + "loss": 0.1501, + "step": 8488 + }, + { + "epoch": 1.718073264521352, + "grad_norm": 0.31098735332489014, + "learning_rate": 9.683471173507608e-06, + "loss": 0.1989, + "step": 8489 + }, + { + "epoch": 1.718275652701882, + "grad_norm": 0.2871131896972656, + "learning_rate": 9.669821154243186e-06, + "loss": 0.1812, + "step": 8490 + }, + { + "epoch": 1.7184780408824125, + "grad_norm": 0.3463563323020935, + "learning_rate": 9.656180273638448e-06, + "loss": 0.2067, + "step": 8491 + }, + { + "epoch": 1.7186804290629427, + "grad_norm": 0.34777534008026123, + "learning_rate": 9.642548533073415e-06, + "loss": 0.2103, + "step": 8492 + }, + { + "epoch": 1.7188828172434731, + "grad_norm": 0.30363330245018005, + "learning_rate": 9.628925933927213e-06, + "loss": 0.1695, + "step": 8493 + }, + { + "epoch": 1.7190852054240033, + "grad_norm": 0.281427264213562, + "learning_rate": 9.615312477578054e-06, + "loss": 0.173, + "step": 8494 + }, + { + "epoch": 1.7192875936045335, + "grad_norm": 0.32801055908203125, + "learning_rate": 9.601708165403156e-06, + "loss": 0.2195, + "step": 8495 + }, + { + "epoch": 1.7194899817850637, + "grad_norm": 0.25589799880981445, + "learning_rate": 9.588112998778898e-06, + "loss": 0.1961, + "step": 8496 + }, + { + "epoch": 1.719692369965594, + "grad_norm": 0.26079970598220825, + "learning_rate": 9.574526979080678e-06, + "loss": 0.1715, + "step": 8497 + }, + { + "epoch": 1.7198947581461241, + "grad_norm": 0.2752712070941925, + "learning_rate": 9.560950107682997e-06, + "loss": 0.1846, + "step": 8498 + }, + { + "epoch": 1.7200971463266546, + "grad_norm": 0.29999780654907227, + "learning_rate": 9.547382385959414e-06, + "loss": 0.215, + "step": 8499 + }, + { + "epoch": 1.7202995345071848, + "grad_norm": 0.2767726480960846, + "learning_rate": 9.533823815282583e-06, + "loss": 0.1907, + "step": 8500 + }, + { + "epoch": 1.7202995345071848, + "eval_loss": 0.25887531042099, + "eval_runtime": 0.7403, + "eval_samples_per_second": 6.754, + "eval_steps_per_second": 1.351, + "step": 8500 + }, + { + "epoch": 1.7205019226877152, + "grad_norm": 0.26701119542121887, + "learning_rate": 9.520274397024198e-06, + "loss": 0.1854, + "step": 8501 + }, + { + "epoch": 1.7207043108682454, + "grad_norm": 0.334595650434494, + "learning_rate": 9.506734132555062e-06, + "loss": 0.2115, + "step": 8502 + }, + { + "epoch": 1.7209066990487756, + "grad_norm": 0.29382118582725525, + "learning_rate": 9.493203023245023e-06, + "loss": 0.1966, + "step": 8503 + }, + { + "epoch": 1.7211090872293058, + "grad_norm": 0.30095425248146057, + "learning_rate": 9.47968107046303e-06, + "loss": 0.1959, + "step": 8504 + }, + { + "epoch": 1.721311475409836, + "grad_norm": 0.24783647060394287, + "learning_rate": 9.466168275577092e-06, + "loss": 0.195, + "step": 8505 + }, + { + "epoch": 1.7215138635903662, + "grad_norm": 0.26918861269950867, + "learning_rate": 9.452664639954278e-06, + "loss": 0.1892, + "step": 8506 + }, + { + "epoch": 1.7217162517708966, + "grad_norm": 0.30738553404808044, + "learning_rate": 9.439170164960765e-06, + "loss": 0.1901, + "step": 8507 + }, + { + "epoch": 1.7219186399514268, + "grad_norm": 0.2554987967014313, + "learning_rate": 9.425684851961757e-06, + "loss": 0.1827, + "step": 8508 + }, + { + "epoch": 1.7221210281319572, + "grad_norm": 0.288388192653656, + "learning_rate": 9.41220870232158e-06, + "loss": 0.1896, + "step": 8509 + }, + { + "epoch": 1.7223234163124874, + "grad_norm": 0.28056037425994873, + "learning_rate": 9.398741717403604e-06, + "loss": 0.2062, + "step": 8510 + }, + { + "epoch": 1.7225258044930176, + "grad_norm": 0.35663676261901855, + "learning_rate": 9.385283898570264e-06, + "loss": 0.2208, + "step": 8511 + }, + { + "epoch": 1.7227281926735478, + "grad_norm": 0.3021637499332428, + "learning_rate": 9.371835247183092e-06, + "loss": 0.1946, + "step": 8512 + }, + { + "epoch": 1.722930580854078, + "grad_norm": 0.3164548873901367, + "learning_rate": 9.358395764602679e-06, + "loss": 0.2, + "step": 8513 + }, + { + "epoch": 1.7231329690346082, + "grad_norm": 0.2893165051937103, + "learning_rate": 9.344965452188692e-06, + "loss": 0.1828, + "step": 8514 + }, + { + "epoch": 1.7233353572151386, + "grad_norm": 0.25707072019577026, + "learning_rate": 9.331544311299867e-06, + "loss": 0.1798, + "step": 8515 + }, + { + "epoch": 1.7235377453956688, + "grad_norm": 0.2197255641222, + "learning_rate": 9.318132343294018e-06, + "loss": 0.1516, + "step": 8516 + }, + { + "epoch": 1.7237401335761993, + "grad_norm": 0.2677488625049591, + "learning_rate": 9.304729549528014e-06, + "loss": 0.1883, + "step": 8517 + }, + { + "epoch": 1.7239425217567295, + "grad_norm": 0.2830187976360321, + "learning_rate": 9.291335931357826e-06, + "loss": 0.1925, + "step": 8518 + }, + { + "epoch": 1.7241449099372597, + "grad_norm": 0.2824142873287201, + "learning_rate": 9.27795149013848e-06, + "loss": 0.1801, + "step": 8519 + }, + { + "epoch": 1.7243472981177899, + "grad_norm": 0.2455061674118042, + "learning_rate": 9.264576227224064e-06, + "loss": 0.1358, + "step": 8520 + }, + { + "epoch": 1.72454968629832, + "grad_norm": 0.2879447937011719, + "learning_rate": 9.251210143967737e-06, + "loss": 0.1981, + "step": 8521 + }, + { + "epoch": 1.7247520744788505, + "grad_norm": 0.33884865045547485, + "learning_rate": 9.237853241721761e-06, + "loss": 0.1773, + "step": 8522 + }, + { + "epoch": 1.7249544626593807, + "grad_norm": 0.26604849100112915, + "learning_rate": 9.224505521837446e-06, + "loss": 0.1719, + "step": 8523 + }, + { + "epoch": 1.725156850839911, + "grad_norm": 0.3080281615257263, + "learning_rate": 9.211166985665154e-06, + "loss": 0.1899, + "step": 8524 + }, + { + "epoch": 1.7253592390204413, + "grad_norm": 0.25215914845466614, + "learning_rate": 9.197837634554351e-06, + "loss": 0.1578, + "step": 8525 + }, + { + "epoch": 1.7255616272009715, + "grad_norm": 0.2496306151151657, + "learning_rate": 9.18451746985356e-06, + "loss": 0.1613, + "step": 8526 + }, + { + "epoch": 1.7257640153815017, + "grad_norm": 0.3078269958496094, + "learning_rate": 9.171206492910377e-06, + "loss": 0.2016, + "step": 8527 + }, + { + "epoch": 1.725966403562032, + "grad_norm": 0.2878892719745636, + "learning_rate": 9.157904705071474e-06, + "loss": 0.1953, + "step": 8528 + }, + { + "epoch": 1.726168791742562, + "grad_norm": 0.3165808916091919, + "learning_rate": 9.14461210768257e-06, + "loss": 0.1478, + "step": 8529 + }, + { + "epoch": 1.7263711799230925, + "grad_norm": 0.2814152240753174, + "learning_rate": 9.131328702088471e-06, + "loss": 0.2018, + "step": 8530 + }, + { + "epoch": 1.7265735681036227, + "grad_norm": 0.34386932849884033, + "learning_rate": 9.118054489633066e-06, + "loss": 0.2185, + "step": 8531 + }, + { + "epoch": 1.7267759562841531, + "grad_norm": 0.26644062995910645, + "learning_rate": 9.104789471659303e-06, + "loss": 0.1746, + "step": 8532 + }, + { + "epoch": 1.7269783444646833, + "grad_norm": 0.2888523042201996, + "learning_rate": 9.091533649509177e-06, + "loss": 0.1801, + "step": 8533 + }, + { + "epoch": 1.7271807326452135, + "grad_norm": 0.29055801033973694, + "learning_rate": 9.078287024523791e-06, + "loss": 0.1747, + "step": 8534 + }, + { + "epoch": 1.7273831208257437, + "grad_norm": 0.2960166037082672, + "learning_rate": 9.065049598043285e-06, + "loss": 0.1804, + "step": 8535 + }, + { + "epoch": 1.727585509006274, + "grad_norm": 0.3112858235836029, + "learning_rate": 9.0518213714069e-06, + "loss": 0.1795, + "step": 8536 + }, + { + "epoch": 1.7277878971868041, + "grad_norm": 0.24035745859146118, + "learning_rate": 9.038602345952919e-06, + "loss": 0.1646, + "step": 8537 + }, + { + "epoch": 1.7279902853673346, + "grad_norm": 0.25147929787635803, + "learning_rate": 9.025392523018706e-06, + "loss": 0.1391, + "step": 8538 + }, + { + "epoch": 1.7281926735478648, + "grad_norm": 0.3143143653869629, + "learning_rate": 9.012191903940704e-06, + "loss": 0.1979, + "step": 8539 + }, + { + "epoch": 1.7283950617283952, + "grad_norm": 0.2703983783721924, + "learning_rate": 8.999000490054388e-06, + "loss": 0.1675, + "step": 8540 + }, + { + "epoch": 1.7285974499089254, + "grad_norm": 0.3867993652820587, + "learning_rate": 8.985818282694336e-06, + "loss": 0.1804, + "step": 8541 + }, + { + "epoch": 1.7287998380894556, + "grad_norm": 0.2795158922672272, + "learning_rate": 8.972645283194193e-06, + "loss": 0.1768, + "step": 8542 + }, + { + "epoch": 1.7290022262699858, + "grad_norm": 0.2464820295572281, + "learning_rate": 8.959481492886657e-06, + "loss": 0.1665, + "step": 8543 + }, + { + "epoch": 1.729204614450516, + "grad_norm": 0.24622751772403717, + "learning_rate": 8.946326913103508e-06, + "loss": 0.1711, + "step": 8544 + }, + { + "epoch": 1.7294070026310462, + "grad_norm": 0.3368050754070282, + "learning_rate": 8.933181545175585e-06, + "loss": 0.2069, + "step": 8545 + }, + { + "epoch": 1.7296093908115766, + "grad_norm": 0.270673930644989, + "learning_rate": 8.920045390432796e-06, + "loss": 0.2033, + "step": 8546 + }, + { + "epoch": 1.7298117789921068, + "grad_norm": 0.26693806052207947, + "learning_rate": 8.906918450204138e-06, + "loss": 0.1797, + "step": 8547 + }, + { + "epoch": 1.7300141671726372, + "grad_norm": 0.3291761875152588, + "learning_rate": 8.893800725817624e-06, + "loss": 0.1935, + "step": 8548 + }, + { + "epoch": 1.7302165553531674, + "grad_norm": 0.35645949840545654, + "learning_rate": 8.880692218600406e-06, + "loss": 0.1857, + "step": 8549 + }, + { + "epoch": 1.7304189435336976, + "grad_norm": 0.2925495505332947, + "learning_rate": 8.867592929878632e-06, + "loss": 0.1572, + "step": 8550 + }, + { + "epoch": 1.7304189435336976, + "eval_loss": 0.2594282627105713, + "eval_runtime": 0.7383, + "eval_samples_per_second": 6.773, + "eval_steps_per_second": 1.355, + "step": 8550 + }, + { + "epoch": 1.7306213317142278, + "grad_norm": 0.2810535430908203, + "learning_rate": 8.854502860977564e-06, + "loss": 0.2246, + "step": 8551 + }, + { + "epoch": 1.730823719894758, + "grad_norm": 0.3040057122707367, + "learning_rate": 8.841422013221524e-06, + "loss": 0.1888, + "step": 8552 + }, + { + "epoch": 1.7310261080752884, + "grad_norm": 0.2596018314361572, + "learning_rate": 8.828350387933882e-06, + "loss": 0.1542, + "step": 8553 + }, + { + "epoch": 1.7312284962558186, + "grad_norm": 0.2669846713542938, + "learning_rate": 8.815287986437092e-06, + "loss": 0.1916, + "step": 8554 + }, + { + "epoch": 1.731430884436349, + "grad_norm": 0.2962510883808136, + "learning_rate": 8.80223481005269e-06, + "loss": 0.211, + "step": 8555 + }, + { + "epoch": 1.7316332726168793, + "grad_norm": 0.2679612934589386, + "learning_rate": 8.789190860101225e-06, + "loss": 0.1475, + "step": 8556 + }, + { + "epoch": 1.7318356607974095, + "grad_norm": 0.2798098027706146, + "learning_rate": 8.77615613790237e-06, + "loss": 0.2033, + "step": 8557 + }, + { + "epoch": 1.7320380489779397, + "grad_norm": 0.311907559633255, + "learning_rate": 8.763130644774842e-06, + "loss": 0.2081, + "step": 8558 + }, + { + "epoch": 1.7322404371584699, + "grad_norm": 0.2620576024055481, + "learning_rate": 8.750114382036412e-06, + "loss": 0.1779, + "step": 8559 + }, + { + "epoch": 1.732442825339, + "grad_norm": 0.30197739601135254, + "learning_rate": 8.737107351003937e-06, + "loss": 0.2017, + "step": 8560 + }, + { + "epoch": 1.7326452135195305, + "grad_norm": 0.26769980788230896, + "learning_rate": 8.724109552993342e-06, + "loss": 0.1938, + "step": 8561 + }, + { + "epoch": 1.7328476017000607, + "grad_norm": 0.27940577268600464, + "learning_rate": 8.711120989319588e-06, + "loss": 0.1882, + "step": 8562 + }, + { + "epoch": 1.7330499898805911, + "grad_norm": 0.31509724259376526, + "learning_rate": 8.69814166129672e-06, + "loss": 0.1933, + "step": 8563 + }, + { + "epoch": 1.7332523780611213, + "grad_norm": 0.3496498763561249, + "learning_rate": 8.68517157023786e-06, + "loss": 0.205, + "step": 8564 + }, + { + "epoch": 1.7334547662416515, + "grad_norm": 0.25049978494644165, + "learning_rate": 8.672210717455187e-06, + "loss": 0.1646, + "step": 8565 + }, + { + "epoch": 1.7336571544221817, + "grad_norm": 0.29622167348861694, + "learning_rate": 8.659259104259942e-06, + "loss": 0.1671, + "step": 8566 + }, + { + "epoch": 1.733859542602712, + "grad_norm": 0.27434036135673523, + "learning_rate": 8.646316731962433e-06, + "loss": 0.19, + "step": 8567 + }, + { + "epoch": 1.7340619307832421, + "grad_norm": 0.2887204885482788, + "learning_rate": 8.633383601872035e-06, + "loss": 0.1957, + "step": 8568 + }, + { + "epoch": 1.7342643189637725, + "grad_norm": 0.3375066816806793, + "learning_rate": 8.62045971529718e-06, + "loss": 0.2209, + "step": 8569 + }, + { + "epoch": 1.7344667071443027, + "grad_norm": 0.32264482975006104, + "learning_rate": 8.607545073545375e-06, + "loss": 0.1748, + "step": 8570 + }, + { + "epoch": 1.7346690953248332, + "grad_norm": 0.33347657322883606, + "learning_rate": 8.5946396779232e-06, + "loss": 0.1992, + "step": 8571 + }, + { + "epoch": 1.7348714835053634, + "grad_norm": 0.2814638614654541, + "learning_rate": 8.581743529736274e-06, + "loss": 0.2136, + "step": 8572 + }, + { + "epoch": 1.7350738716858936, + "grad_norm": 0.3195917010307312, + "learning_rate": 8.568856630289268e-06, + "loss": 0.2252, + "step": 8573 + }, + { + "epoch": 1.7352762598664238, + "grad_norm": 0.2509499192237854, + "learning_rate": 8.555978980886004e-06, + "loss": 0.1712, + "step": 8574 + }, + { + "epoch": 1.735478648046954, + "grad_norm": 0.32808899879455566, + "learning_rate": 8.543110582829272e-06, + "loss": 0.2164, + "step": 8575 + }, + { + "epoch": 1.7356810362274842, + "grad_norm": 0.244186133146286, + "learning_rate": 8.530251437420954e-06, + "loss": 0.1678, + "step": 8576 + }, + { + "epoch": 1.7358834244080146, + "grad_norm": 0.2884461581707001, + "learning_rate": 8.517401545962034e-06, + "loss": 0.1972, + "step": 8577 + }, + { + "epoch": 1.7360858125885448, + "grad_norm": 0.3048520088195801, + "learning_rate": 8.50456090975249e-06, + "loss": 0.218, + "step": 8578 + }, + { + "epoch": 1.7362882007690752, + "grad_norm": 0.34312018752098083, + "learning_rate": 8.49172953009143e-06, + "loss": 0.2004, + "step": 8579 + }, + { + "epoch": 1.7364905889496054, + "grad_norm": 0.28323104977607727, + "learning_rate": 8.478907408276993e-06, + "loss": 0.1645, + "step": 8580 + }, + { + "epoch": 1.7366929771301356, + "grad_norm": 0.26150673627853394, + "learning_rate": 8.466094545606385e-06, + "loss": 0.1472, + "step": 8581 + }, + { + "epoch": 1.7368953653106658, + "grad_norm": 0.30239489674568176, + "learning_rate": 8.45329094337588e-06, + "loss": 0.1921, + "step": 8582 + }, + { + "epoch": 1.737097753491196, + "grad_norm": 0.2655140459537506, + "learning_rate": 8.44049660288082e-06, + "loss": 0.2039, + "step": 8583 + }, + { + "epoch": 1.7373001416717264, + "grad_norm": 0.28127583861351013, + "learning_rate": 8.427711525415571e-06, + "loss": 0.2167, + "step": 8584 + }, + { + "epoch": 1.7375025298522566, + "grad_norm": 0.2773891091346741, + "learning_rate": 8.41493571227362e-06, + "loss": 0.1755, + "step": 8585 + }, + { + "epoch": 1.737704918032787, + "grad_norm": 0.2599252760410309, + "learning_rate": 8.402169164747475e-06, + "loss": 0.1728, + "step": 8586 + }, + { + "epoch": 1.7379073062133172, + "grad_norm": 0.2744457721710205, + "learning_rate": 8.389411884128728e-06, + "loss": 0.1987, + "step": 8587 + }, + { + "epoch": 1.7381096943938474, + "grad_norm": 0.26484882831573486, + "learning_rate": 8.376663871708035e-06, + "loss": 0.1871, + "step": 8588 + }, + { + "epoch": 1.7383120825743776, + "grad_norm": 0.2672707438468933, + "learning_rate": 8.363925128775096e-06, + "loss": 0.2017, + "step": 8589 + }, + { + "epoch": 1.7385144707549078, + "grad_norm": 0.32294702529907227, + "learning_rate": 8.351195656618682e-06, + "loss": 0.2062, + "step": 8590 + }, + { + "epoch": 1.738716858935438, + "grad_norm": 0.27408814430236816, + "learning_rate": 8.33847545652664e-06, + "loss": 0.1722, + "step": 8591 + }, + { + "epoch": 1.7389192471159685, + "grad_norm": 0.2947283089160919, + "learning_rate": 8.325764529785851e-06, + "loss": 0.2286, + "step": 8592 + }, + { + "epoch": 1.7391216352964987, + "grad_norm": 0.2431199848651886, + "learning_rate": 8.313062877682287e-06, + "loss": 0.1926, + "step": 8593 + }, + { + "epoch": 1.739324023477029, + "grad_norm": 0.30300426483154297, + "learning_rate": 8.300370501500953e-06, + "loss": 0.2268, + "step": 8594 + }, + { + "epoch": 1.7395264116575593, + "grad_norm": 0.2920367121696472, + "learning_rate": 8.287687402525945e-06, + "loss": 0.1849, + "step": 8595 + }, + { + "epoch": 1.7397287998380895, + "grad_norm": 0.2584496736526489, + "learning_rate": 8.275013582040392e-06, + "loss": 0.18, + "step": 8596 + }, + { + "epoch": 1.7399311880186197, + "grad_norm": 0.26713255047798157, + "learning_rate": 8.262349041326512e-06, + "loss": 0.1572, + "step": 8597 + }, + { + "epoch": 1.7401335761991499, + "grad_norm": 0.2660084664821625, + "learning_rate": 8.249693781665557e-06, + "loss": 0.2091, + "step": 8598 + }, + { + "epoch": 1.74033596437968, + "grad_norm": 0.32999715209007263, + "learning_rate": 8.237047804337861e-06, + "loss": 0.1907, + "step": 8599 + }, + { + "epoch": 1.7405383525602105, + "grad_norm": 0.2914731502532959, + "learning_rate": 8.22441111062282e-06, + "loss": 0.2153, + "step": 8600 + }, + { + "epoch": 1.7405383525602105, + "eval_loss": 0.2593904137611389, + "eval_runtime": 0.7381, + "eval_samples_per_second": 6.774, + "eval_steps_per_second": 1.355, + "step": 8600 + }, + { + "epoch": 1.7407407407407407, + "grad_norm": 0.27393868565559387, + "learning_rate": 8.211783701798859e-06, + "loss": 0.19, + "step": 8601 + }, + { + "epoch": 1.7409431289212711, + "grad_norm": 0.24766801297664642, + "learning_rate": 8.199165579143508e-06, + "loss": 0.1699, + "step": 8602 + }, + { + "epoch": 1.7411455171018013, + "grad_norm": 0.2624143064022064, + "learning_rate": 8.186556743933327e-06, + "loss": 0.1918, + "step": 8603 + }, + { + "epoch": 1.7413479052823315, + "grad_norm": 0.2595481872558594, + "learning_rate": 8.173957197443948e-06, + "loss": 0.1906, + "step": 8604 + }, + { + "epoch": 1.7415502934628617, + "grad_norm": 0.2655414938926697, + "learning_rate": 8.161366940950076e-06, + "loss": 0.17, + "step": 8605 + }, + { + "epoch": 1.741752681643392, + "grad_norm": 0.29584211111068726, + "learning_rate": 8.148785975725437e-06, + "loss": 0.1749, + "step": 8606 + }, + { + "epoch": 1.7419550698239221, + "grad_norm": 0.28779736161231995, + "learning_rate": 8.136214303042834e-06, + "loss": 0.175, + "step": 8607 + }, + { + "epoch": 1.7421574580044525, + "grad_norm": 0.27830713987350464, + "learning_rate": 8.123651924174158e-06, + "loss": 0.1795, + "step": 8608 + }, + { + "epoch": 1.742359846184983, + "grad_norm": 0.302734375, + "learning_rate": 8.111098840390341e-06, + "loss": 0.2105, + "step": 8609 + }, + { + "epoch": 1.7425622343655132, + "grad_norm": 0.28851422667503357, + "learning_rate": 8.098555052961354e-06, + "loss": 0.1874, + "step": 8610 + }, + { + "epoch": 1.7427646225460434, + "grad_norm": 0.2846405804157257, + "learning_rate": 8.08602056315626e-06, + "loss": 0.1796, + "step": 8611 + }, + { + "epoch": 1.7429670107265736, + "grad_norm": 0.2728175222873688, + "learning_rate": 8.073495372243156e-06, + "loss": 0.1574, + "step": 8612 + }, + { + "epoch": 1.7431693989071038, + "grad_norm": 0.30502745509147644, + "learning_rate": 8.060979481489228e-06, + "loss": 0.1851, + "step": 8613 + }, + { + "epoch": 1.743371787087634, + "grad_norm": 0.2595095932483673, + "learning_rate": 8.048472892160685e-06, + "loss": 0.1707, + "step": 8614 + }, + { + "epoch": 1.7435741752681644, + "grad_norm": 0.3026995062828064, + "learning_rate": 8.035975605522816e-06, + "loss": 0.1626, + "step": 8615 + }, + { + "epoch": 1.7437765634486946, + "grad_norm": 0.3065376579761505, + "learning_rate": 8.023487622839975e-06, + "loss": 0.2179, + "step": 8616 + }, + { + "epoch": 1.743978951629225, + "grad_norm": 0.34945711493492126, + "learning_rate": 8.011008945375553e-06, + "loss": 0.1956, + "step": 8617 + }, + { + "epoch": 1.7441813398097552, + "grad_norm": 0.3083902597427368, + "learning_rate": 7.998539574392017e-06, + "loss": 0.2114, + "step": 8618 + }, + { + "epoch": 1.7443837279902854, + "grad_norm": 0.31957361102104187, + "learning_rate": 7.986079511150879e-06, + "loss": 0.2174, + "step": 8619 + }, + { + "epoch": 1.7445861161708156, + "grad_norm": 0.2879070043563843, + "learning_rate": 7.973628756912732e-06, + "loss": 0.2101, + "step": 8620 + }, + { + "epoch": 1.7447885043513458, + "grad_norm": 0.2841523289680481, + "learning_rate": 7.961187312937202e-06, + "loss": 0.1844, + "step": 8621 + }, + { + "epoch": 1.744990892531876, + "grad_norm": 0.2686194181442261, + "learning_rate": 7.948755180482991e-06, + "loss": 0.1757, + "step": 8622 + }, + { + "epoch": 1.7451932807124064, + "grad_norm": 0.4323246479034424, + "learning_rate": 7.936332360807853e-06, + "loss": 0.1904, + "step": 8623 + }, + { + "epoch": 1.7453956688929366, + "grad_norm": 0.29777491092681885, + "learning_rate": 7.923918855168588e-06, + "loss": 0.1798, + "step": 8624 + }, + { + "epoch": 1.745598057073467, + "grad_norm": 0.2588173747062683, + "learning_rate": 7.911514664821073e-06, + "loss": 0.1872, + "step": 8625 + }, + { + "epoch": 1.7458004452539972, + "grad_norm": 0.24862785637378693, + "learning_rate": 7.899119791020226e-06, + "loss": 0.1694, + "step": 8626 + }, + { + "epoch": 1.7460028334345274, + "grad_norm": 0.2731352746486664, + "learning_rate": 7.886734235020033e-06, + "loss": 0.163, + "step": 8627 + }, + { + "epoch": 1.7462052216150576, + "grad_norm": 0.3019119203090668, + "learning_rate": 7.874357998073544e-06, + "loss": 0.1828, + "step": 8628 + }, + { + "epoch": 1.7464076097955878, + "grad_norm": 0.29366788268089294, + "learning_rate": 7.86199108143284e-06, + "loss": 0.1737, + "step": 8629 + }, + { + "epoch": 1.746609997976118, + "grad_norm": 0.27854087948799133, + "learning_rate": 7.849633486349095e-06, + "loss": 0.1978, + "step": 8630 + }, + { + "epoch": 1.7468123861566485, + "grad_norm": 0.3060019016265869, + "learning_rate": 7.83728521407251e-06, + "loss": 0.1997, + "step": 8631 + }, + { + "epoch": 1.7470147743371787, + "grad_norm": 0.29798224568367004, + "learning_rate": 7.82494626585235e-06, + "loss": 0.213, + "step": 8632 + }, + { + "epoch": 1.747217162517709, + "grad_norm": 0.2958841621875763, + "learning_rate": 7.812616642936943e-06, + "loss": 0.216, + "step": 8633 + }, + { + "epoch": 1.7474195506982393, + "grad_norm": 0.3002054989337921, + "learning_rate": 7.800296346573677e-06, + "loss": 0.1936, + "step": 8634 + }, + { + "epoch": 1.7476219388787695, + "grad_norm": 0.3035990297794342, + "learning_rate": 7.787985378008988e-06, + "loss": 0.188, + "step": 8635 + }, + { + "epoch": 1.7478243270592997, + "grad_norm": 0.29013994336128235, + "learning_rate": 7.775683738488371e-06, + "loss": 0.1609, + "step": 8636 + }, + { + "epoch": 1.7480267152398299, + "grad_norm": 0.3334468901157379, + "learning_rate": 7.763391429256373e-06, + "loss": 0.1858, + "step": 8637 + }, + { + "epoch": 1.74822910342036, + "grad_norm": 0.2830390930175781, + "learning_rate": 7.7511084515566e-06, + "loss": 0.1803, + "step": 8638 + }, + { + "epoch": 1.7484314916008905, + "grad_norm": 0.2534598410129547, + "learning_rate": 7.738834806631711e-06, + "loss": 0.1873, + "step": 8639 + }, + { + "epoch": 1.748633879781421, + "grad_norm": 0.25773561000823975, + "learning_rate": 7.72657049572344e-06, + "loss": 0.1664, + "step": 8640 + }, + { + "epoch": 1.7488362679619511, + "grad_norm": 0.2659550905227661, + "learning_rate": 7.714315520072545e-06, + "loss": 0.1795, + "step": 8641 + }, + { + "epoch": 1.7490386561424813, + "grad_norm": 0.2660394012928009, + "learning_rate": 7.702069880918872e-06, + "loss": 0.1714, + "step": 8642 + }, + { + "epoch": 1.7492410443230115, + "grad_norm": 0.22891388833522797, + "learning_rate": 7.689833579501293e-06, + "loss": 0.171, + "step": 8643 + }, + { + "epoch": 1.7494434325035417, + "grad_norm": 0.2488303780555725, + "learning_rate": 7.677606617057743e-06, + "loss": 0.1725, + "step": 8644 + }, + { + "epoch": 1.749645820684072, + "grad_norm": 0.29489654302597046, + "learning_rate": 7.66538899482524e-06, + "loss": 0.2015, + "step": 8645 + }, + { + "epoch": 1.7498482088646024, + "grad_norm": 0.2691362500190735, + "learning_rate": 7.65318071403982e-06, + "loss": 0.1774, + "step": 8646 + }, + { + "epoch": 1.7500505970451325, + "grad_norm": 0.33754125237464905, + "learning_rate": 7.640981775936595e-06, + "loss": 0.2412, + "step": 8647 + }, + { + "epoch": 1.750252985225663, + "grad_norm": 0.27895069122314453, + "learning_rate": 7.628792181749711e-06, + "loss": 0.222, + "step": 8648 + }, + { + "epoch": 1.7504553734061932, + "grad_norm": 0.31739845871925354, + "learning_rate": 7.616611932712403e-06, + "loss": 0.1908, + "step": 8649 + }, + { + "epoch": 1.7506577615867234, + "grad_norm": 0.28091490268707275, + "learning_rate": 7.604441030056941e-06, + "loss": 0.1811, + "step": 8650 + }, + { + "epoch": 1.7506577615867234, + "eval_loss": 0.2593732178211212, + "eval_runtime": 0.7369, + "eval_samples_per_second": 6.785, + "eval_steps_per_second": 1.357, + "step": 8650 + }, + { + "epoch": 1.7508601497672536, + "grad_norm": 0.2957996428012848, + "learning_rate": 7.5922794750146294e-06, + "loss": 0.2139, + "step": 8651 + }, + { + "epoch": 1.7510625379477838, + "grad_norm": 0.349030464887619, + "learning_rate": 7.580127268815862e-06, + "loss": 0.2125, + "step": 8652 + }, + { + "epoch": 1.751264926128314, + "grad_norm": 0.24564795196056366, + "learning_rate": 7.567984412690055e-06, + "loss": 0.1514, + "step": 8653 + }, + { + "epoch": 1.7514673143088444, + "grad_norm": 0.28310883045196533, + "learning_rate": 7.555850907865713e-06, + "loss": 0.2011, + "step": 8654 + }, + { + "epoch": 1.7516697024893746, + "grad_norm": 0.3396984338760376, + "learning_rate": 7.543726755570368e-06, + "loss": 0.2358, + "step": 8655 + }, + { + "epoch": 1.751872090669905, + "grad_norm": 0.32286304235458374, + "learning_rate": 7.531611957030626e-06, + "loss": 0.2065, + "step": 8656 + }, + { + "epoch": 1.7520744788504352, + "grad_norm": 0.29195284843444824, + "learning_rate": 7.519506513472118e-06, + "loss": 0.2033, + "step": 8657 + }, + { + "epoch": 1.7522768670309654, + "grad_norm": 0.2819693386554718, + "learning_rate": 7.507410426119554e-06, + "loss": 0.1663, + "step": 8658 + }, + { + "epoch": 1.7524792552114956, + "grad_norm": 0.28465232253074646, + "learning_rate": 7.4953236961966874e-06, + "loss": 0.192, + "step": 8659 + }, + { + "epoch": 1.7526816433920258, + "grad_norm": 0.2673487365245819, + "learning_rate": 7.48324632492634e-06, + "loss": 0.1682, + "step": 8660 + }, + { + "epoch": 1.752884031572556, + "grad_norm": 0.30271047353744507, + "learning_rate": 7.471178313530347e-06, + "loss": 0.1794, + "step": 8661 + }, + { + "epoch": 1.7530864197530864, + "grad_norm": 0.2480824738740921, + "learning_rate": 7.459119663229652e-06, + "loss": 0.1852, + "step": 8662 + }, + { + "epoch": 1.7532888079336166, + "grad_norm": 0.2820073962211609, + "learning_rate": 7.447070375244203e-06, + "loss": 0.1962, + "step": 8663 + }, + { + "epoch": 1.753491196114147, + "grad_norm": 0.2992934584617615, + "learning_rate": 7.435030450793024e-06, + "loss": 0.1752, + "step": 8664 + }, + { + "epoch": 1.7536935842946773, + "grad_norm": 0.299672931432724, + "learning_rate": 7.422999891094196e-06, + "loss": 0.2217, + "step": 8665 + }, + { + "epoch": 1.7538959724752075, + "grad_norm": 0.26297396421432495, + "learning_rate": 7.410978697364834e-06, + "loss": 0.1846, + "step": 8666 + }, + { + "epoch": 1.7540983606557377, + "grad_norm": 0.42981722950935364, + "learning_rate": 7.398966870821122e-06, + "loss": 0.1864, + "step": 8667 + }, + { + "epoch": 1.7543007488362679, + "grad_norm": 0.33034002780914307, + "learning_rate": 7.386964412678299e-06, + "loss": 0.2026, + "step": 8668 + }, + { + "epoch": 1.754503137016798, + "grad_norm": 0.33484843373298645, + "learning_rate": 7.374971324150637e-06, + "loss": 0.1514, + "step": 8669 + }, + { + "epoch": 1.7547055251973285, + "grad_norm": 0.31231656670570374, + "learning_rate": 7.362987606451466e-06, + "loss": 0.1844, + "step": 8670 + }, + { + "epoch": 1.754907913377859, + "grad_norm": 0.28507134318351746, + "learning_rate": 7.351013260793183e-06, + "loss": 0.1737, + "step": 8671 + }, + { + "epoch": 1.755110301558389, + "grad_norm": 0.31598010659217834, + "learning_rate": 7.339048288387229e-06, + "loss": 0.2043, + "step": 8672 + }, + { + "epoch": 1.7553126897389193, + "grad_norm": 0.2558382749557495, + "learning_rate": 7.327092690444082e-06, + "loss": 0.166, + "step": 8673 + }, + { + "epoch": 1.7555150779194495, + "grad_norm": 0.33906546235084534, + "learning_rate": 7.315146468173295e-06, + "loss": 0.1969, + "step": 8674 + }, + { + "epoch": 1.7557174660999797, + "grad_norm": 0.2782249450683594, + "learning_rate": 7.303209622783446e-06, + "loss": 0.1898, + "step": 8675 + }, + { + "epoch": 1.75591985428051, + "grad_norm": 0.26307907700538635, + "learning_rate": 7.2912821554822046e-06, + "loss": 0.182, + "step": 8676 + }, + { + "epoch": 1.7561222424610403, + "grad_norm": 0.2771715223789215, + "learning_rate": 7.279364067476246e-06, + "loss": 0.1882, + "step": 8677 + }, + { + "epoch": 1.7563246306415705, + "grad_norm": 0.2941603362560272, + "learning_rate": 7.2674553599713315e-06, + "loss": 0.1756, + "step": 8678 + }, + { + "epoch": 1.756527018822101, + "grad_norm": 0.23980875313282013, + "learning_rate": 7.25555603417224e-06, + "loss": 0.1701, + "step": 8679 + }, + { + "epoch": 1.7567294070026311, + "grad_norm": 0.30672362446784973, + "learning_rate": 7.243666091282841e-06, + "loss": 0.2077, + "step": 8680 + }, + { + "epoch": 1.7569317951831613, + "grad_norm": 0.29822811484336853, + "learning_rate": 7.231785532506031e-06, + "loss": 0.1857, + "step": 8681 + }, + { + "epoch": 1.7571341833636915, + "grad_norm": 0.2909514904022217, + "learning_rate": 7.219914359043744e-06, + "loss": 0.1987, + "step": 8682 + }, + { + "epoch": 1.7573365715442217, + "grad_norm": 0.2535093128681183, + "learning_rate": 7.208052572097001e-06, + "loss": 0.1722, + "step": 8683 + }, + { + "epoch": 1.757538959724752, + "grad_norm": 0.28953081369400024, + "learning_rate": 7.196200172865841e-06, + "loss": 0.1771, + "step": 8684 + }, + { + "epoch": 1.7577413479052824, + "grad_norm": 0.2795270085334778, + "learning_rate": 7.18435716254936e-06, + "loss": 0.1846, + "step": 8685 + }, + { + "epoch": 1.7579437360858126, + "grad_norm": 0.3224479854106903, + "learning_rate": 7.172523542345733e-06, + "loss": 0.219, + "step": 8686 + }, + { + "epoch": 1.758146124266343, + "grad_norm": 0.28203216195106506, + "learning_rate": 7.160699313452135e-06, + "loss": 0.1738, + "step": 8687 + }, + { + "epoch": 1.7583485124468732, + "grad_norm": 0.2619762718677521, + "learning_rate": 7.1488844770648325e-06, + "loss": 0.1747, + "step": 8688 + }, + { + "epoch": 1.7585509006274034, + "grad_norm": 0.2592688202857971, + "learning_rate": 7.137079034379124e-06, + "loss": 0.1529, + "step": 8689 + }, + { + "epoch": 1.7587532888079336, + "grad_norm": 0.31665951013565063, + "learning_rate": 7.125282986589355e-06, + "loss": 0.1954, + "step": 8690 + }, + { + "epoch": 1.7589556769884638, + "grad_norm": 0.26507389545440674, + "learning_rate": 7.113496334888936e-06, + "loss": 0.1584, + "step": 8691 + }, + { + "epoch": 1.759158065168994, + "grad_norm": 0.3036040961742401, + "learning_rate": 7.101719080470304e-06, + "loss": 0.1909, + "step": 8692 + }, + { + "epoch": 1.7593604533495244, + "grad_norm": 0.2305152714252472, + "learning_rate": 7.089951224524971e-06, + "loss": 0.1737, + "step": 8693 + }, + { + "epoch": 1.7595628415300546, + "grad_norm": 0.2953859567642212, + "learning_rate": 7.078192768243486e-06, + "loss": 0.1938, + "step": 8694 + }, + { + "epoch": 1.759765229710585, + "grad_norm": 0.28796306252479553, + "learning_rate": 7.066443712815429e-06, + "loss": 0.1981, + "step": 8695 + }, + { + "epoch": 1.7599676178911152, + "grad_norm": 0.25642630457878113, + "learning_rate": 7.054704059429463e-06, + "loss": 0.181, + "step": 8696 + }, + { + "epoch": 1.7601700060716454, + "grad_norm": 0.28918367624282837, + "learning_rate": 7.0429738092732676e-06, + "loss": 0.1828, + "step": 8697 + }, + { + "epoch": 1.7603723942521756, + "grad_norm": 0.3338787853717804, + "learning_rate": 7.0312529635335965e-06, + "loss": 0.1814, + "step": 8698 + }, + { + "epoch": 1.7605747824327058, + "grad_norm": 0.24299395084381104, + "learning_rate": 7.019541523396245e-06, + "loss": 0.139, + "step": 8699 + }, + { + "epoch": 1.7607771706132362, + "grad_norm": 0.27827152609825134, + "learning_rate": 7.007839490046042e-06, + "loss": 0.1963, + "step": 8700 + }, + { + "epoch": 1.7607771706132362, + "eval_loss": 0.2589094042778015, + "eval_runtime": 0.7364, + "eval_samples_per_second": 6.789, + "eval_steps_per_second": 1.358, + "step": 8700 + }, + { + "epoch": 1.7609795587937664, + "grad_norm": 0.2616543769836426, + "learning_rate": 6.9961468646668855e-06, + "loss": 0.1759, + "step": 8701 + }, + { + "epoch": 1.7611819469742969, + "grad_norm": 0.27737903594970703, + "learning_rate": 6.984463648441719e-06, + "loss": 0.2049, + "step": 8702 + }, + { + "epoch": 1.761384335154827, + "grad_norm": 0.2969575524330139, + "learning_rate": 6.9727898425525185e-06, + "loss": 0.2156, + "step": 8703 + }, + { + "epoch": 1.7615867233353573, + "grad_norm": 0.31687629222869873, + "learning_rate": 6.9611254481803055e-06, + "loss": 0.2173, + "step": 8704 + }, + { + "epoch": 1.7617891115158875, + "grad_norm": 0.22732824087142944, + "learning_rate": 6.949470466505181e-06, + "loss": 0.1607, + "step": 8705 + }, + { + "epoch": 1.7619914996964177, + "grad_norm": 0.28792738914489746, + "learning_rate": 6.937824898706269e-06, + "loss": 0.1731, + "step": 8706 + }, + { + "epoch": 1.7621938878769479, + "grad_norm": 0.28398075699806213, + "learning_rate": 6.926188745961748e-06, + "loss": 0.1807, + "step": 8707 + }, + { + "epoch": 1.7623962760574783, + "grad_norm": 0.24871675670146942, + "learning_rate": 6.914562009448833e-06, + "loss": 0.1634, + "step": 8708 + }, + { + "epoch": 1.7625986642380085, + "grad_norm": 0.2863733470439911, + "learning_rate": 6.9029446903437934e-06, + "loss": 0.1672, + "step": 8709 + }, + { + "epoch": 1.762801052418539, + "grad_norm": 0.28869694471359253, + "learning_rate": 6.8913367898219565e-06, + "loss": 0.1901, + "step": 8710 + }, + { + "epoch": 1.763003440599069, + "grad_norm": 0.28518444299697876, + "learning_rate": 6.879738309057693e-06, + "loss": 0.1773, + "step": 8711 + }, + { + "epoch": 1.7632058287795993, + "grad_norm": 0.27658623456954956, + "learning_rate": 6.868149249224398e-06, + "loss": 0.1697, + "step": 8712 + }, + { + "epoch": 1.7634082169601295, + "grad_norm": 0.2947976887226105, + "learning_rate": 6.856569611494546e-06, + "loss": 0.2189, + "step": 8713 + }, + { + "epoch": 1.7636106051406597, + "grad_norm": 0.3152853846549988, + "learning_rate": 6.844999397039631e-06, + "loss": 0.2387, + "step": 8714 + }, + { + "epoch": 1.76381299332119, + "grad_norm": 0.27520832419395447, + "learning_rate": 6.833438607030218e-06, + "loss": 0.1728, + "step": 8715 + }, + { + "epoch": 1.7640153815017203, + "grad_norm": 0.3301842510700226, + "learning_rate": 6.821887242635905e-06, + "loss": 0.1953, + "step": 8716 + }, + { + "epoch": 1.7642177696822505, + "grad_norm": 0.2489853948354721, + "learning_rate": 6.810345305025345e-06, + "loss": 0.1564, + "step": 8717 + }, + { + "epoch": 1.764420157862781, + "grad_norm": 0.2562078833580017, + "learning_rate": 6.798812795366205e-06, + "loss": 0.1522, + "step": 8718 + }, + { + "epoch": 1.7646225460433111, + "grad_norm": 0.2957938015460968, + "learning_rate": 6.7872897148252294e-06, + "loss": 0.1752, + "step": 8719 + }, + { + "epoch": 1.7648249342238413, + "grad_norm": 0.2993956208229065, + "learning_rate": 6.775776064568218e-06, + "loss": 0.1718, + "step": 8720 + }, + { + "epoch": 1.7650273224043715, + "grad_norm": 0.45438745617866516, + "learning_rate": 6.764271845759995e-06, + "loss": 0.1941, + "step": 8721 + }, + { + "epoch": 1.7652297105849017, + "grad_norm": 0.2617250978946686, + "learning_rate": 6.75277705956443e-06, + "loss": 0.1881, + "step": 8722 + }, + { + "epoch": 1.765432098765432, + "grad_norm": 0.31186604499816895, + "learning_rate": 6.74129170714446e-06, + "loss": 0.1943, + "step": 8723 + }, + { + "epoch": 1.7656344869459624, + "grad_norm": 0.27825167775154114, + "learning_rate": 6.729815789662031e-06, + "loss": 0.162, + "step": 8724 + }, + { + "epoch": 1.7658368751264926, + "grad_norm": 0.3025761842727661, + "learning_rate": 6.718349308278171e-06, + "loss": 0.2153, + "step": 8725 + }, + { + "epoch": 1.766039263307023, + "grad_norm": 0.3043777048587799, + "learning_rate": 6.70689226415292e-06, + "loss": 0.183, + "step": 8726 + }, + { + "epoch": 1.7662416514875532, + "grad_norm": 0.25967058539390564, + "learning_rate": 6.695444658445416e-06, + "loss": 0.1866, + "step": 8727 + }, + { + "epoch": 1.7664440396680834, + "grad_norm": 0.34056445956230164, + "learning_rate": 6.684006492313788e-06, + "loss": 0.2396, + "step": 8728 + }, + { + "epoch": 1.7666464278486136, + "grad_norm": 0.2622250020503998, + "learning_rate": 6.672577766915222e-06, + "loss": 0.1957, + "step": 8729 + }, + { + "epoch": 1.7668488160291438, + "grad_norm": 0.3250180184841156, + "learning_rate": 6.661158483405971e-06, + "loss": 0.2404, + "step": 8730 + }, + { + "epoch": 1.7670512042096742, + "grad_norm": 0.3211231529712677, + "learning_rate": 6.6497486429413e-06, + "loss": 0.2132, + "step": 8731 + }, + { + "epoch": 1.7672535923902044, + "grad_norm": 0.29395386576652527, + "learning_rate": 6.638348246675563e-06, + "loss": 0.1855, + "step": 8732 + }, + { + "epoch": 1.7674559805707348, + "grad_norm": 0.26442763209342957, + "learning_rate": 6.626957295762115e-06, + "loss": 0.1919, + "step": 8733 + }, + { + "epoch": 1.767658368751265, + "grad_norm": 0.32076603174209595, + "learning_rate": 6.6155757913533675e-06, + "loss": 0.1912, + "step": 8734 + }, + { + "epoch": 1.7678607569317952, + "grad_norm": 0.25481972098350525, + "learning_rate": 6.604203734600789e-06, + "loss": 0.1663, + "step": 8735 + }, + { + "epoch": 1.7680631451123254, + "grad_norm": 0.260008305311203, + "learning_rate": 6.592841126654892e-06, + "loss": 0.164, + "step": 8736 + }, + { + "epoch": 1.7682655332928556, + "grad_norm": 0.2986699342727661, + "learning_rate": 6.581487968665223e-06, + "loss": 0.2153, + "step": 8737 + }, + { + "epoch": 1.7684679214733858, + "grad_norm": 0.2634279429912567, + "learning_rate": 6.570144261780364e-06, + "loss": 0.1995, + "step": 8738 + }, + { + "epoch": 1.7686703096539163, + "grad_norm": 0.334778368473053, + "learning_rate": 6.558810007147986e-06, + "loss": 0.2261, + "step": 8739 + }, + { + "epoch": 1.7688726978344465, + "grad_norm": 0.2399263083934784, + "learning_rate": 6.547485205914716e-06, + "loss": 0.1512, + "step": 8740 + }, + { + "epoch": 1.7690750860149769, + "grad_norm": 0.28150245547294617, + "learning_rate": 6.536169859226316e-06, + "loss": 0.2154, + "step": 8741 + }, + { + "epoch": 1.769277474195507, + "grad_norm": 0.3357497453689575, + "learning_rate": 6.524863968227535e-06, + "loss": 0.2347, + "step": 8742 + }, + { + "epoch": 1.7694798623760373, + "grad_norm": 0.3222461938858032, + "learning_rate": 6.5135675340622035e-06, + "loss": 0.1789, + "step": 8743 + }, + { + "epoch": 1.7696822505565675, + "grad_norm": 0.27976444363594055, + "learning_rate": 6.502280557873164e-06, + "loss": 0.1678, + "step": 8744 + }, + { + "epoch": 1.7698846387370977, + "grad_norm": 0.30489134788513184, + "learning_rate": 6.491003040802323e-06, + "loss": 0.1747, + "step": 8745 + }, + { + "epoch": 1.7700870269176279, + "grad_norm": 0.28980520367622375, + "learning_rate": 6.479734983990604e-06, + "loss": 0.1881, + "step": 8746 + }, + { + "epoch": 1.7702894150981583, + "grad_norm": 0.2970348596572876, + "learning_rate": 6.468476388578016e-06, + "loss": 0.2024, + "step": 8747 + }, + { + "epoch": 1.7704918032786885, + "grad_norm": 0.2925608158111572, + "learning_rate": 6.4572272557035575e-06, + "loss": 0.1726, + "step": 8748 + }, + { + "epoch": 1.770694191459219, + "grad_norm": 0.3097609281539917, + "learning_rate": 6.44598758650532e-06, + "loss": 0.2162, + "step": 8749 + }, + { + "epoch": 1.7708965796397491, + "grad_norm": 0.30766546726226807, + "learning_rate": 6.4347573821204044e-06, + "loss": 0.2104, + "step": 8750 + }, + { + "epoch": 1.7708965796397491, + "eval_loss": 0.2586788237094879, + "eval_runtime": 0.7386, + "eval_samples_per_second": 6.77, + "eval_steps_per_second": 1.354, + "step": 8750 + }, + { + "epoch": 1.7710989678202793, + "grad_norm": 0.28390100598335266, + "learning_rate": 6.423536643684969e-06, + "loss": 0.1976, + "step": 8751 + }, + { + "epoch": 1.7713013560008095, + "grad_norm": 0.2914179265499115, + "learning_rate": 6.412325372334216e-06, + "loss": 0.2039, + "step": 8752 + }, + { + "epoch": 1.7715037441813397, + "grad_norm": 0.30227139592170715, + "learning_rate": 6.401123569202372e-06, + "loss": 0.2108, + "step": 8753 + }, + { + "epoch": 1.77170613236187, + "grad_norm": 0.29538655281066895, + "learning_rate": 6.389931235422719e-06, + "loss": 0.218, + "step": 8754 + }, + { + "epoch": 1.7719085205424003, + "grad_norm": 0.3247475326061249, + "learning_rate": 6.378748372127585e-06, + "loss": 0.2053, + "step": 8755 + }, + { + "epoch": 1.7721109087229305, + "grad_norm": 0.2719709575176239, + "learning_rate": 6.367574980448343e-06, + "loss": 0.1944, + "step": 8756 + }, + { + "epoch": 1.772313296903461, + "grad_norm": 0.24502572417259216, + "learning_rate": 6.356411061515377e-06, + "loss": 0.1592, + "step": 8757 + }, + { + "epoch": 1.7725156850839912, + "grad_norm": 0.26764917373657227, + "learning_rate": 6.345256616458162e-06, + "loss": 0.1708, + "step": 8758 + }, + { + "epoch": 1.7727180732645214, + "grad_norm": 0.2626522183418274, + "learning_rate": 6.334111646405172e-06, + "loss": 0.1885, + "step": 8759 + }, + { + "epoch": 1.7729204614450516, + "grad_norm": 0.3055856227874756, + "learning_rate": 6.322976152483928e-06, + "loss": 0.1749, + "step": 8760 + }, + { + "epoch": 1.7731228496255818, + "grad_norm": 0.25192734599113464, + "learning_rate": 6.311850135821052e-06, + "loss": 0.1478, + "step": 8761 + }, + { + "epoch": 1.7733252378061122, + "grad_norm": 0.2746264934539795, + "learning_rate": 6.300733597542085e-06, + "loss": 0.1972, + "step": 8762 + }, + { + "epoch": 1.7735276259866424, + "grad_norm": 0.24976009130477905, + "learning_rate": 6.289626538771731e-06, + "loss": 0.1871, + "step": 8763 + }, + { + "epoch": 1.7737300141671728, + "grad_norm": 0.29063618183135986, + "learning_rate": 6.278528960633667e-06, + "loss": 0.1936, + "step": 8764 + }, + { + "epoch": 1.773932402347703, + "grad_norm": 0.27964475750923157, + "learning_rate": 6.267440864250629e-06, + "loss": 0.1969, + "step": 8765 + }, + { + "epoch": 1.7741347905282332, + "grad_norm": 0.3473265767097473, + "learning_rate": 6.256362250744407e-06, + "loss": 0.2297, + "step": 8766 + }, + { + "epoch": 1.7743371787087634, + "grad_norm": 0.28343695402145386, + "learning_rate": 6.2452931212358064e-06, + "loss": 0.1899, + "step": 8767 + }, + { + "epoch": 1.7745395668892936, + "grad_norm": 0.2353745698928833, + "learning_rate": 6.234233476844697e-06, + "loss": 0.1692, + "step": 8768 + }, + { + "epoch": 1.7747419550698238, + "grad_norm": 0.2326369434595108, + "learning_rate": 6.223183318689973e-06, + "loss": 0.1342, + "step": 8769 + }, + { + "epoch": 1.7749443432503542, + "grad_norm": 0.29546186327934265, + "learning_rate": 6.212142647889563e-06, + "loss": 0.2019, + "step": 8770 + }, + { + "epoch": 1.7751467314308844, + "grad_norm": 0.32325753569602966, + "learning_rate": 6.201111465560461e-06, + "loss": 0.2141, + "step": 8771 + }, + { + "epoch": 1.7753491196114148, + "grad_norm": 0.275481253862381, + "learning_rate": 6.190089772818674e-06, + "loss": 0.1821, + "step": 8772 + }, + { + "epoch": 1.775551507791945, + "grad_norm": 0.29665717482566833, + "learning_rate": 6.179077570779279e-06, + "loss": 0.1749, + "step": 8773 + }, + { + "epoch": 1.7757538959724752, + "grad_norm": 0.3191758990287781, + "learning_rate": 6.168074860556361e-06, + "loss": 0.1951, + "step": 8774 + }, + { + "epoch": 1.7759562841530054, + "grad_norm": 0.24474631249904633, + "learning_rate": 6.1570816432630515e-06, + "loss": 0.1552, + "step": 8775 + }, + { + "epoch": 1.7761586723335356, + "grad_norm": 0.25271105766296387, + "learning_rate": 6.1460979200115505e-06, + "loss": 0.1534, + "step": 8776 + }, + { + "epoch": 1.7763610605140658, + "grad_norm": 0.31084853410720825, + "learning_rate": 6.135123691913059e-06, + "loss": 0.2258, + "step": 8777 + }, + { + "epoch": 1.7765634486945963, + "grad_norm": 0.2757551968097687, + "learning_rate": 6.12415896007783e-06, + "loss": 0.1842, + "step": 8778 + }, + { + "epoch": 1.7767658368751265, + "grad_norm": 0.24306604266166687, + "learning_rate": 6.11320372561518e-06, + "loss": 0.1484, + "step": 8779 + }, + { + "epoch": 1.7769682250556569, + "grad_norm": 0.27842000126838684, + "learning_rate": 6.102257989633431e-06, + "loss": 0.1912, + "step": 8780 + }, + { + "epoch": 1.777170613236187, + "grad_norm": 0.28890183568000793, + "learning_rate": 6.0913217532399645e-06, + "loss": 0.2146, + "step": 8781 + }, + { + "epoch": 1.7773730014167173, + "grad_norm": 0.28030017018318176, + "learning_rate": 6.080395017541185e-06, + "loss": 0.1858, + "step": 8782 + }, + { + "epoch": 1.7775753895972475, + "grad_norm": 0.271921843290329, + "learning_rate": 6.069477783642563e-06, + "loss": 0.1861, + "step": 8783 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.2917061150074005, + "learning_rate": 6.05857005264856e-06, + "loss": 0.204, + "step": 8784 + }, + { + "epoch": 1.7779801659583079, + "grad_norm": 0.24218259751796722, + "learning_rate": 6.0476718256627375e-06, + "loss": 0.1711, + "step": 8785 + }, + { + "epoch": 1.7781825541388383, + "grad_norm": 0.2720330059528351, + "learning_rate": 6.036783103787635e-06, + "loss": 0.2027, + "step": 8786 + }, + { + "epoch": 1.7783849423193685, + "grad_norm": 0.2660661041736603, + "learning_rate": 6.025903888124884e-06, + "loss": 0.1892, + "step": 8787 + }, + { + "epoch": 1.778587330499899, + "grad_norm": 0.4043765962123871, + "learning_rate": 6.015034179775114e-06, + "loss": 0.1995, + "step": 8788 + }, + { + "epoch": 1.7787897186804291, + "grad_norm": 0.28639769554138184, + "learning_rate": 6.004173979838013e-06, + "loss": 0.1876, + "step": 8789 + }, + { + "epoch": 1.7789921068609593, + "grad_norm": 0.23483648896217346, + "learning_rate": 5.9933232894123e-06, + "loss": 0.1881, + "step": 8790 + }, + { + "epoch": 1.7791944950414895, + "grad_norm": 0.2661982476711273, + "learning_rate": 5.982482109595744e-06, + "loss": 0.1786, + "step": 8791 + }, + { + "epoch": 1.7793968832220197, + "grad_norm": 0.27296239137649536, + "learning_rate": 5.971650441485121e-06, + "loss": 0.1766, + "step": 8792 + }, + { + "epoch": 1.7795992714025501, + "grad_norm": 0.30432015657424927, + "learning_rate": 5.96082828617629e-06, + "loss": 0.2276, + "step": 8793 + }, + { + "epoch": 1.7798016595830803, + "grad_norm": 0.3130051791667938, + "learning_rate": 5.950015644764106e-06, + "loss": 0.2133, + "step": 8794 + }, + { + "epoch": 1.7800040477636108, + "grad_norm": 0.27052175998687744, + "learning_rate": 5.939212518342485e-06, + "loss": 0.1942, + "step": 8795 + }, + { + "epoch": 1.780206435944141, + "grad_norm": 0.255877822637558, + "learning_rate": 5.9284189080043625e-06, + "loss": 0.175, + "step": 8796 + }, + { + "epoch": 1.7804088241246712, + "grad_norm": 0.2689751088619232, + "learning_rate": 5.917634814841743e-06, + "loss": 0.1921, + "step": 8797 + }, + { + "epoch": 1.7806112123052014, + "grad_norm": 0.24834966659545898, + "learning_rate": 5.906860239945644e-06, + "loss": 0.1492, + "step": 8798 + }, + { + "epoch": 1.7808136004857316, + "grad_norm": 0.28980231285095215, + "learning_rate": 5.896095184406103e-06, + "loss": 0.1657, + "step": 8799 + }, + { + "epoch": 1.7810159886662618, + "grad_norm": 0.2986690402030945, + "learning_rate": 5.885339649312238e-06, + "loss": 0.2066, + "step": 8800 + }, + { + "epoch": 1.7810159886662618, + "eval_loss": 0.2586442232131958, + "eval_runtime": 0.7377, + "eval_samples_per_second": 6.778, + "eval_steps_per_second": 1.356, + "step": 8800 + }, + { + "epoch": 1.7812183768467922, + "grad_norm": 0.23212267458438873, + "learning_rate": 5.874593635752179e-06, + "loss": 0.1668, + "step": 8801 + }, + { + "epoch": 1.7814207650273224, + "grad_norm": 0.31673598289489746, + "learning_rate": 5.863857144813078e-06, + "loss": 0.219, + "step": 8802 + }, + { + "epoch": 1.7816231532078528, + "grad_norm": 0.3033224642276764, + "learning_rate": 5.8531301775811565e-06, + "loss": 0.1996, + "step": 8803 + }, + { + "epoch": 1.781825541388383, + "grad_norm": 0.24953462183475494, + "learning_rate": 5.8424127351416556e-06, + "loss": 0.1833, + "step": 8804 + }, + { + "epoch": 1.7820279295689132, + "grad_norm": 0.25415509939193726, + "learning_rate": 5.831704818578843e-06, + "loss": 0.2136, + "step": 8805 + }, + { + "epoch": 1.7822303177494434, + "grad_norm": 0.282206267118454, + "learning_rate": 5.821006428976061e-06, + "loss": 0.2218, + "step": 8806 + }, + { + "epoch": 1.7824327059299736, + "grad_norm": 0.25228193402290344, + "learning_rate": 5.810317567415624e-06, + "loss": 0.1884, + "step": 8807 + }, + { + "epoch": 1.7826350941105038, + "grad_norm": 0.2990221083164215, + "learning_rate": 5.799638234978933e-06, + "loss": 0.1995, + "step": 8808 + }, + { + "epoch": 1.7828374822910342, + "grad_norm": 0.2889970541000366, + "learning_rate": 5.788968432746411e-06, + "loss": 0.2128, + "step": 8809 + }, + { + "epoch": 1.7830398704715644, + "grad_norm": 0.2746729254722595, + "learning_rate": 5.7783081617975184e-06, + "loss": 0.1643, + "step": 8810 + }, + { + "epoch": 1.7832422586520948, + "grad_norm": 0.3056463897228241, + "learning_rate": 5.767657423210749e-06, + "loss": 0.2146, + "step": 8811 + }, + { + "epoch": 1.783444646832625, + "grad_norm": 0.26303210854530334, + "learning_rate": 5.757016218063638e-06, + "loss": 0.1719, + "step": 8812 + }, + { + "epoch": 1.7836470350131552, + "grad_norm": 0.23666363954544067, + "learning_rate": 5.746384547432737e-06, + "loss": 0.1649, + "step": 8813 + }, + { + "epoch": 1.7838494231936854, + "grad_norm": 0.27246472239494324, + "learning_rate": 5.7357624123936635e-06, + "loss": 0.172, + "step": 8814 + }, + { + "epoch": 1.7840518113742156, + "grad_norm": 0.30931583046913147, + "learning_rate": 5.725149814021036e-06, + "loss": 0.2071, + "step": 8815 + }, + { + "epoch": 1.7842541995547458, + "grad_norm": 0.2522447109222412, + "learning_rate": 5.714546753388539e-06, + "loss": 0.1752, + "step": 8816 + }, + { + "epoch": 1.7844565877352763, + "grad_norm": 0.31273213028907776, + "learning_rate": 5.703953231568881e-06, + "loss": 0.203, + "step": 8817 + }, + { + "epoch": 1.7846589759158065, + "grad_norm": 0.30619895458221436, + "learning_rate": 5.693369249633795e-06, + "loss": 0.1848, + "step": 8818 + }, + { + "epoch": 1.7848613640963369, + "grad_norm": 0.27686363458633423, + "learning_rate": 5.6827948086540575e-06, + "loss": 0.2046, + "step": 8819 + }, + { + "epoch": 1.785063752276867, + "grad_norm": 0.2953948974609375, + "learning_rate": 5.672229909699489e-06, + "loss": 0.1875, + "step": 8820 + }, + { + "epoch": 1.7852661404573973, + "grad_norm": 0.3067280948162079, + "learning_rate": 5.661674553838925e-06, + "loss": 0.1811, + "step": 8821 + }, + { + "epoch": 1.7854685286379275, + "grad_norm": 0.2993681728839874, + "learning_rate": 5.6511287421402435e-06, + "loss": 0.1772, + "step": 8822 + }, + { + "epoch": 1.7856709168184577, + "grad_norm": 0.28528183698654175, + "learning_rate": 5.6405924756703696e-06, + "loss": 0.1876, + "step": 8823 + }, + { + "epoch": 1.785873304998988, + "grad_norm": 0.28923770785331726, + "learning_rate": 5.63006575549524e-06, + "loss": 0.1872, + "step": 8824 + }, + { + "epoch": 1.7860756931795183, + "grad_norm": 0.32765883207321167, + "learning_rate": 5.619548582679857e-06, + "loss": 0.1737, + "step": 8825 + }, + { + "epoch": 1.7862780813600487, + "grad_norm": 0.24244408309459686, + "learning_rate": 5.6090409582882145e-06, + "loss": 0.1515, + "step": 8826 + }, + { + "epoch": 1.786480469540579, + "grad_norm": 0.26799750328063965, + "learning_rate": 5.5985428833833846e-06, + "loss": 0.1791, + "step": 8827 + }, + { + "epoch": 1.7866828577211091, + "grad_norm": 0.22886481881141663, + "learning_rate": 5.588054359027439e-06, + "loss": 0.1426, + "step": 8828 + }, + { + "epoch": 1.7868852459016393, + "grad_norm": 0.27654311060905457, + "learning_rate": 5.577575386281497e-06, + "loss": 0.1953, + "step": 8829 + }, + { + "epoch": 1.7870876340821695, + "grad_norm": 0.2745510935783386, + "learning_rate": 5.56710596620571e-06, + "loss": 0.1708, + "step": 8830 + }, + { + "epoch": 1.7872900222626997, + "grad_norm": 0.25635775923728943, + "learning_rate": 5.556646099859275e-06, + "loss": 0.184, + "step": 8831 + }, + { + "epoch": 1.7874924104432302, + "grad_norm": 0.26283639669418335, + "learning_rate": 5.546195788300401e-06, + "loss": 0.1622, + "step": 8832 + }, + { + "epoch": 1.7876947986237604, + "grad_norm": 0.2936389744281769, + "learning_rate": 5.535755032586354e-06, + "loss": 0.2106, + "step": 8833 + }, + { + "epoch": 1.7878971868042908, + "grad_norm": 0.2639520764350891, + "learning_rate": 5.525323833773399e-06, + "loss": 0.1553, + "step": 8834 + }, + { + "epoch": 1.788099574984821, + "grad_norm": 0.2605137526988983, + "learning_rate": 5.514902192916871e-06, + "loss": 0.1998, + "step": 8835 + }, + { + "epoch": 1.7883019631653512, + "grad_norm": 0.25581395626068115, + "learning_rate": 5.504490111071114e-06, + "loss": 0.184, + "step": 8836 + }, + { + "epoch": 1.7885043513458814, + "grad_norm": 0.2903321087360382, + "learning_rate": 5.494087589289531e-06, + "loss": 0.2201, + "step": 8837 + }, + { + "epoch": 1.7887067395264116, + "grad_norm": 0.31646454334259033, + "learning_rate": 5.483694628624514e-06, + "loss": 0.2071, + "step": 8838 + }, + { + "epoch": 1.7889091277069418, + "grad_norm": 0.28405052423477173, + "learning_rate": 5.473311230127531e-06, + "loss": 0.1903, + "step": 8839 + }, + { + "epoch": 1.7891115158874722, + "grad_norm": 0.31384506821632385, + "learning_rate": 5.4629373948490545e-06, + "loss": 0.1911, + "step": 8840 + }, + { + "epoch": 1.7893139040680024, + "grad_norm": 0.23068785667419434, + "learning_rate": 5.452573123838611e-06, + "loss": 0.1725, + "step": 8841 + }, + { + "epoch": 1.7895162922485328, + "grad_norm": 0.24589887261390686, + "learning_rate": 5.442218418144751e-06, + "loss": 0.1438, + "step": 8842 + }, + { + "epoch": 1.789718680429063, + "grad_norm": 0.25478124618530273, + "learning_rate": 5.4318732788150366e-06, + "loss": 0.1512, + "step": 8843 + }, + { + "epoch": 1.7899210686095932, + "grad_norm": 0.2645474970340729, + "learning_rate": 5.421537706896096e-06, + "loss": 0.1695, + "step": 8844 + }, + { + "epoch": 1.7901234567901234, + "grad_norm": 0.26867932081222534, + "learning_rate": 5.411211703433572e-06, + "loss": 0.1791, + "step": 8845 + }, + { + "epoch": 1.7903258449706536, + "grad_norm": 0.2700962722301483, + "learning_rate": 5.4008952694721395e-06, + "loss": 0.1858, + "step": 8846 + }, + { + "epoch": 1.7905282331511838, + "grad_norm": 0.2806064188480377, + "learning_rate": 5.390588406055497e-06, + "loss": 0.2028, + "step": 8847 + }, + { + "epoch": 1.7907306213317142, + "grad_norm": 0.3218914568424225, + "learning_rate": 5.38029111422641e-06, + "loss": 0.2434, + "step": 8848 + }, + { + "epoch": 1.7909330095122444, + "grad_norm": 0.2712363600730896, + "learning_rate": 5.370003395026624e-06, + "loss": 0.1776, + "step": 8849 + }, + { + "epoch": 1.7911353976927749, + "grad_norm": 0.2838458716869354, + "learning_rate": 5.359725249496972e-06, + "loss": 0.1613, + "step": 8850 + }, + { + "epoch": 1.7911353976927749, + "eval_loss": 0.2580936849117279, + "eval_runtime": 0.737, + "eval_samples_per_second": 6.785, + "eval_steps_per_second": 1.357, + "step": 8850 + }, + { + "epoch": 1.791337785873305, + "grad_norm": 0.2904452979564667, + "learning_rate": 5.349456678677245e-06, + "loss": 0.1794, + "step": 8851 + }, + { + "epoch": 1.7915401740538353, + "grad_norm": 0.3108134865760803, + "learning_rate": 5.339197683606345e-06, + "loss": 0.1996, + "step": 8852 + }, + { + "epoch": 1.7917425622343655, + "grad_norm": 0.24708275496959686, + "learning_rate": 5.328948265322154e-06, + "loss": 0.1853, + "step": 8853 + }, + { + "epoch": 1.7919449504148957, + "grad_norm": 0.30340859293937683, + "learning_rate": 5.318708424861607e-06, + "loss": 0.2239, + "step": 8854 + }, + { + "epoch": 1.792147338595426, + "grad_norm": 0.26493820548057556, + "learning_rate": 5.3084781632606665e-06, + "loss": 0.1647, + "step": 8855 + }, + { + "epoch": 1.7923497267759563, + "grad_norm": 0.29768866300582886, + "learning_rate": 5.298257481554314e-06, + "loss": 0.1698, + "step": 8856 + }, + { + "epoch": 1.7925521149564867, + "grad_norm": 0.25119927525520325, + "learning_rate": 5.2880463807765786e-06, + "loss": 0.1792, + "step": 8857 + }, + { + "epoch": 1.792754503137017, + "grad_norm": 0.3058163523674011, + "learning_rate": 5.277844861960512e-06, + "loss": 0.1858, + "step": 8858 + }, + { + "epoch": 1.792956891317547, + "grad_norm": 0.3083089292049408, + "learning_rate": 5.267652926138189e-06, + "loss": 0.1938, + "step": 8859 + }, + { + "epoch": 1.7931592794980773, + "grad_norm": 0.22727486491203308, + "learning_rate": 5.257470574340729e-06, + "loss": 0.1461, + "step": 8860 + }, + { + "epoch": 1.7933616676786075, + "grad_norm": 0.2988595962524414, + "learning_rate": 5.247297807598273e-06, + "loss": 0.1964, + "step": 8861 + }, + { + "epoch": 1.7935640558591377, + "grad_norm": 0.2653481662273407, + "learning_rate": 5.237134626939988e-06, + "loss": 0.1779, + "step": 8862 + }, + { + "epoch": 1.7937664440396681, + "grad_norm": 0.32473576068878174, + "learning_rate": 5.226981033394096e-06, + "loss": 0.2151, + "step": 8863 + }, + { + "epoch": 1.7939688322201983, + "grad_norm": 0.25815069675445557, + "learning_rate": 5.2168370279878195e-06, + "loss": 0.1758, + "step": 8864 + }, + { + "epoch": 1.7941712204007287, + "grad_norm": 0.25564804673194885, + "learning_rate": 5.206702611747427e-06, + "loss": 0.1649, + "step": 8865 + }, + { + "epoch": 1.794373608581259, + "grad_norm": 0.26895710825920105, + "learning_rate": 5.196577785698198e-06, + "loss": 0.1957, + "step": 8866 + }, + { + "epoch": 1.7945759967617891, + "grad_norm": 0.24266557395458221, + "learning_rate": 5.186462550864479e-06, + "loss": 0.1339, + "step": 8867 + }, + { + "epoch": 1.7947783849423193, + "grad_norm": 0.30665162205696106, + "learning_rate": 5.176356908269608e-06, + "loss": 0.1948, + "step": 8868 + }, + { + "epoch": 1.7949807731228495, + "grad_norm": 0.257072389125824, + "learning_rate": 5.166260858935978e-06, + "loss": 0.1676, + "step": 8869 + }, + { + "epoch": 1.7951831613033797, + "grad_norm": 0.269353449344635, + "learning_rate": 5.156174403884984e-06, + "loss": 0.1795, + "step": 8870 + }, + { + "epoch": 1.7953855494839102, + "grad_norm": 0.269663542509079, + "learning_rate": 5.146097544137085e-06, + "loss": 0.1738, + "step": 8871 + }, + { + "epoch": 1.7955879376644404, + "grad_norm": 0.2640727162361145, + "learning_rate": 5.136030280711757e-06, + "loss": 0.198, + "step": 8872 + }, + { + "epoch": 1.7957903258449708, + "grad_norm": 0.30548980832099915, + "learning_rate": 5.125972614627483e-06, + "loss": 0.2174, + "step": 8873 + }, + { + "epoch": 1.795992714025501, + "grad_norm": 0.27031007409095764, + "learning_rate": 5.115924546901796e-06, + "loss": 0.1814, + "step": 8874 + }, + { + "epoch": 1.7961951022060312, + "grad_norm": 0.27772194147109985, + "learning_rate": 5.1058860785512476e-06, + "loss": 0.2004, + "step": 8875 + }, + { + "epoch": 1.7963974903865614, + "grad_norm": 0.2998545169830322, + "learning_rate": 5.095857210591437e-06, + "loss": 0.1777, + "step": 8876 + }, + { + "epoch": 1.7965998785670916, + "grad_norm": 0.29473671317100525, + "learning_rate": 5.085837944036976e-06, + "loss": 0.1935, + "step": 8877 + }, + { + "epoch": 1.7968022667476218, + "grad_norm": 0.30179837346076965, + "learning_rate": 5.075828279901507e-06, + "loss": 0.2038, + "step": 8878 + }, + { + "epoch": 1.7970046549281522, + "grad_norm": 0.32339680194854736, + "learning_rate": 5.065828219197699e-06, + "loss": 0.2456, + "step": 8879 + }, + { + "epoch": 1.7972070431086824, + "grad_norm": 0.26312246918678284, + "learning_rate": 5.055837762937265e-06, + "loss": 0.1624, + "step": 8880 + }, + { + "epoch": 1.7974094312892128, + "grad_norm": 0.29252588748931885, + "learning_rate": 5.045856912130931e-06, + "loss": 0.1632, + "step": 8881 + }, + { + "epoch": 1.797611819469743, + "grad_norm": 0.2949320375919342, + "learning_rate": 5.035885667788454e-06, + "loss": 0.1967, + "step": 8882 + }, + { + "epoch": 1.7978142076502732, + "grad_norm": 0.2704543471336365, + "learning_rate": 5.025924030918616e-06, + "loss": 0.2029, + "step": 8883 + }, + { + "epoch": 1.7980165958308034, + "grad_norm": 0.2713168263435364, + "learning_rate": 5.0159720025292344e-06, + "loss": 0.18, + "step": 8884 + }, + { + "epoch": 1.7982189840113336, + "grad_norm": 0.30050283670425415, + "learning_rate": 5.006029583627148e-06, + "loss": 0.1939, + "step": 8885 + }, + { + "epoch": 1.798421372191864, + "grad_norm": 0.25041332840919495, + "learning_rate": 4.996096775218218e-06, + "loss": 0.1809, + "step": 8886 + }, + { + "epoch": 1.7986237603723942, + "grad_norm": 0.327347069978714, + "learning_rate": 4.986173578307362e-06, + "loss": 0.2197, + "step": 8887 + }, + { + "epoch": 1.7988261485529247, + "grad_norm": 0.2811526358127594, + "learning_rate": 4.976259993898502e-06, + "loss": 0.2059, + "step": 8888 + }, + { + "epoch": 1.7990285367334549, + "grad_norm": 0.28768783807754517, + "learning_rate": 4.966356022994567e-06, + "loss": 0.183, + "step": 8889 + }, + { + "epoch": 1.799230924913985, + "grad_norm": 0.27493974566459656, + "learning_rate": 4.956461666597567e-06, + "loss": 0.1911, + "step": 8890 + }, + { + "epoch": 1.7994333130945153, + "grad_norm": 0.2958086431026459, + "learning_rate": 4.946576925708491e-06, + "loss": 0.2226, + "step": 8891 + }, + { + "epoch": 1.7996357012750455, + "grad_norm": 0.30736634135246277, + "learning_rate": 4.93670180132737e-06, + "loss": 0.192, + "step": 8892 + }, + { + "epoch": 1.7998380894555757, + "grad_norm": 0.31193798780441284, + "learning_rate": 4.926836294453274e-06, + "loss": 0.1944, + "step": 8893 + }, + { + "epoch": 1.800040477636106, + "grad_norm": 0.29948121309280396, + "learning_rate": 4.9169804060843035e-06, + "loss": 0.2295, + "step": 8894 + }, + { + "epoch": 1.8002428658166363, + "grad_norm": 0.28736236691474915, + "learning_rate": 4.907134137217562e-06, + "loss": 0.2076, + "step": 8895 + }, + { + "epoch": 1.8004452539971667, + "grad_norm": 0.2893117666244507, + "learning_rate": 4.897297488849173e-06, + "loss": 0.1743, + "step": 8896 + }, + { + "epoch": 1.800647642177697, + "grad_norm": 0.3026507496833801, + "learning_rate": 4.887470461974331e-06, + "loss": 0.2176, + "step": 8897 + }, + { + "epoch": 1.800850030358227, + "grad_norm": 0.2544638216495514, + "learning_rate": 4.877653057587228e-06, + "loss": 0.179, + "step": 8898 + }, + { + "epoch": 1.8010524185387573, + "grad_norm": 0.24779057502746582, + "learning_rate": 4.867845276681071e-06, + "loss": 0.1852, + "step": 8899 + }, + { + "epoch": 1.8012548067192875, + "grad_norm": 0.27465617656707764, + "learning_rate": 4.858047120248121e-06, + "loss": 0.1983, + "step": 8900 + }, + { + "epoch": 1.8012548067192875, + "eval_loss": 0.25783097743988037, + "eval_runtime": 0.7366, + "eval_samples_per_second": 6.788, + "eval_steps_per_second": 1.358, + "step": 8900 + }, + { + "epoch": 1.8014571948998177, + "grad_norm": 0.2639307379722595, + "learning_rate": 4.848258589279652e-06, + "loss": 0.1657, + "step": 8901 + }, + { + "epoch": 1.8016595830803481, + "grad_norm": 0.26923590898513794, + "learning_rate": 4.838479684765962e-06, + "loss": 0.1766, + "step": 8902 + }, + { + "epoch": 1.8018619712608783, + "grad_norm": 0.2985227108001709, + "learning_rate": 4.828710407696391e-06, + "loss": 0.182, + "step": 8903 + }, + { + "epoch": 1.8020643594414087, + "grad_norm": 0.31524157524108887, + "learning_rate": 4.81895075905926e-06, + "loss": 0.2524, + "step": 8904 + }, + { + "epoch": 1.802266747621939, + "grad_norm": 0.3199750483036041, + "learning_rate": 4.809200739841979e-06, + "loss": 0.1932, + "step": 8905 + }, + { + "epoch": 1.8024691358024691, + "grad_norm": 0.2882361114025116, + "learning_rate": 4.799460351030938e-06, + "loss": 0.1756, + "step": 8906 + }, + { + "epoch": 1.8026715239829993, + "grad_norm": 0.31260427832603455, + "learning_rate": 4.789729593611569e-06, + "loss": 0.2151, + "step": 8907 + }, + { + "epoch": 1.8028739121635295, + "grad_norm": 0.2947652339935303, + "learning_rate": 4.780008468568342e-06, + "loss": 0.1879, + "step": 8908 + }, + { + "epoch": 1.8030763003440597, + "grad_norm": 0.32592445611953735, + "learning_rate": 4.770296976884714e-06, + "loss": 0.1707, + "step": 8909 + }, + { + "epoch": 1.8032786885245902, + "grad_norm": 0.28303763270378113, + "learning_rate": 4.760595119543209e-06, + "loss": 0.18, + "step": 8910 + }, + { + "epoch": 1.8034810767051204, + "grad_norm": 0.2822856605052948, + "learning_rate": 4.750902897525345e-06, + "loss": 0.184, + "step": 8911 + }, + { + "epoch": 1.8036834648856508, + "grad_norm": 0.286300927400589, + "learning_rate": 4.741220311811701e-06, + "loss": 0.1824, + "step": 8912 + }, + { + "epoch": 1.803885853066181, + "grad_norm": 0.29102855920791626, + "learning_rate": 4.7315473633818385e-06, + "loss": 0.181, + "step": 8913 + }, + { + "epoch": 1.8040882412467112, + "grad_norm": 0.28965407609939575, + "learning_rate": 4.721884053214376e-06, + "loss": 0.1647, + "step": 8914 + }, + { + "epoch": 1.8042906294272414, + "grad_norm": 0.28732210397720337, + "learning_rate": 4.7122303822869416e-06, + "loss": 0.2239, + "step": 8915 + }, + { + "epoch": 1.8044930176077716, + "grad_norm": 0.2571806311607361, + "learning_rate": 4.702586351576199e-06, + "loss": 0.1667, + "step": 8916 + }, + { + "epoch": 1.804695405788302, + "grad_norm": 0.25412270426750183, + "learning_rate": 4.692951962057834e-06, + "loss": 0.1638, + "step": 8917 + }, + { + "epoch": 1.8048977939688322, + "grad_norm": 0.2577854096889496, + "learning_rate": 4.683327214706534e-06, + "loss": 0.1646, + "step": 8918 + }, + { + "epoch": 1.8051001821493626, + "grad_norm": 0.3552990257740021, + "learning_rate": 4.673712110496031e-06, + "loss": 0.2013, + "step": 8919 + }, + { + "epoch": 1.8053025703298928, + "grad_norm": 0.31418198347091675, + "learning_rate": 4.6641066503990916e-06, + "loss": 0.2224, + "step": 8920 + }, + { + "epoch": 1.805504958510423, + "grad_norm": 0.3056572675704956, + "learning_rate": 4.6545108353875045e-06, + "loss": 0.2114, + "step": 8921 + }, + { + "epoch": 1.8057073466909532, + "grad_norm": 0.3513403534889221, + "learning_rate": 4.644924666432049e-06, + "loss": 0.1504, + "step": 8922 + }, + { + "epoch": 1.8059097348714834, + "grad_norm": 0.34498804807662964, + "learning_rate": 4.635348144502571e-06, + "loss": 0.1988, + "step": 8923 + }, + { + "epoch": 1.8061121230520136, + "grad_norm": 0.2675020694732666, + "learning_rate": 4.625781270567919e-06, + "loss": 0.1669, + "step": 8924 + }, + { + "epoch": 1.806314511232544, + "grad_norm": 0.24444063007831573, + "learning_rate": 4.616224045595974e-06, + "loss": 0.1672, + "step": 8925 + }, + { + "epoch": 1.8065168994130743, + "grad_norm": 0.2665750980377197, + "learning_rate": 4.606676470553617e-06, + "loss": 0.1883, + "step": 8926 + }, + { + "epoch": 1.8067192875936047, + "grad_norm": 0.3534557521343231, + "learning_rate": 4.597138546406798e-06, + "loss": 0.1867, + "step": 8927 + }, + { + "epoch": 1.8069216757741349, + "grad_norm": 0.3001806437969208, + "learning_rate": 4.587610274120435e-06, + "loss": 0.1955, + "step": 8928 + }, + { + "epoch": 1.807124063954665, + "grad_norm": 0.3241526186466217, + "learning_rate": 4.578091654658523e-06, + "loss": 0.171, + "step": 8929 + }, + { + "epoch": 1.8073264521351953, + "grad_norm": 0.29405054450035095, + "learning_rate": 4.568582688984047e-06, + "loss": 0.1745, + "step": 8930 + }, + { + "epoch": 1.8075288403157255, + "grad_norm": 0.2953342795372009, + "learning_rate": 4.559083378059015e-06, + "loss": 0.1972, + "step": 8931 + }, + { + "epoch": 1.8077312284962557, + "grad_norm": 0.27937665581703186, + "learning_rate": 4.549593722844492e-06, + "loss": 0.1815, + "step": 8932 + }, + { + "epoch": 1.807933616676786, + "grad_norm": 0.3475225567817688, + "learning_rate": 4.54011372430051e-06, + "loss": 0.2308, + "step": 8933 + }, + { + "epoch": 1.8081360048573163, + "grad_norm": 0.33293816447257996, + "learning_rate": 4.530643383386179e-06, + "loss": 0.2115, + "step": 8934 + }, + { + "epoch": 1.8083383930378467, + "grad_norm": 0.2608540654182434, + "learning_rate": 4.5211827010596005e-06, + "loss": 0.1838, + "step": 8935 + }, + { + "epoch": 1.808540781218377, + "grad_norm": 0.29804497957229614, + "learning_rate": 4.511731678277919e-06, + "loss": 0.1847, + "step": 8936 + }, + { + "epoch": 1.8087431693989071, + "grad_norm": 0.2685197591781616, + "learning_rate": 4.502290315997271e-06, + "loss": 0.2106, + "step": 8937 + }, + { + "epoch": 1.8089455575794373, + "grad_norm": 0.274152010679245, + "learning_rate": 4.492858615172824e-06, + "loss": 0.1865, + "step": 8938 + }, + { + "epoch": 1.8091479457599675, + "grad_norm": 0.27463477849960327, + "learning_rate": 4.483436576758826e-06, + "loss": 0.1886, + "step": 8939 + }, + { + "epoch": 1.8093503339404977, + "grad_norm": 0.27460166811943054, + "learning_rate": 4.47402420170846e-06, + "loss": 0.1805, + "step": 8940 + }, + { + "epoch": 1.8095527221210281, + "grad_norm": 0.26897096633911133, + "learning_rate": 4.464621490973986e-06, + "loss": 0.1926, + "step": 8941 + }, + { + "epoch": 1.8097551103015583, + "grad_norm": 0.28815120458602905, + "learning_rate": 4.455228445506665e-06, + "loss": 0.1845, + "step": 8942 + }, + { + "epoch": 1.8099574984820888, + "grad_norm": 0.27394571900367737, + "learning_rate": 4.445845066256793e-06, + "loss": 0.1834, + "step": 8943 + }, + { + "epoch": 1.810159886662619, + "grad_norm": 0.29721203446388245, + "learning_rate": 4.436471354173677e-06, + "loss": 0.1913, + "step": 8944 + }, + { + "epoch": 1.8103622748431492, + "grad_norm": 0.30943432450294495, + "learning_rate": 4.427107310205647e-06, + "loss": 0.208, + "step": 8945 + }, + { + "epoch": 1.8105646630236794, + "grad_norm": 0.29275450110435486, + "learning_rate": 4.417752935300079e-06, + "loss": 0.1942, + "step": 8946 + }, + { + "epoch": 1.8107670512042096, + "grad_norm": 0.2976188659667969, + "learning_rate": 4.408408230403327e-06, + "loss": 0.1846, + "step": 8947 + }, + { + "epoch": 1.81096943938474, + "grad_norm": 0.2598245143890381, + "learning_rate": 4.39907319646079e-06, + "loss": 0.1852, + "step": 8948 + }, + { + "epoch": 1.8111718275652702, + "grad_norm": 0.25931134819984436, + "learning_rate": 4.389747834416913e-06, + "loss": 0.1495, + "step": 8949 + }, + { + "epoch": 1.8113742157458006, + "grad_norm": 0.26877543330192566, + "learning_rate": 4.380432145215119e-06, + "loss": 0.1652, + "step": 8950 + }, + { + "epoch": 1.8113742157458006, + "eval_loss": 0.25779491662979126, + "eval_runtime": 0.7398, + "eval_samples_per_second": 6.759, + "eval_steps_per_second": 1.352, + "step": 8950 + }, + { + "epoch": 1.8115766039263308, + "grad_norm": 0.30282893776893616, + "learning_rate": 4.371126129797864e-06, + "loss": 0.2027, + "step": 8951 + }, + { + "epoch": 1.811778992106861, + "grad_norm": 0.3111761510372162, + "learning_rate": 4.361829789106653e-06, + "loss": 0.1984, + "step": 8952 + }, + { + "epoch": 1.8119813802873912, + "grad_norm": 0.2611580193042755, + "learning_rate": 4.352543124081987e-06, + "loss": 0.1644, + "step": 8953 + }, + { + "epoch": 1.8121837684679214, + "grad_norm": 0.2492230385541916, + "learning_rate": 4.343266135663393e-06, + "loss": 0.1598, + "step": 8954 + }, + { + "epoch": 1.8123861566484516, + "grad_norm": 0.3022874891757965, + "learning_rate": 4.33399882478941e-06, + "loss": 0.1791, + "step": 8955 + }, + { + "epoch": 1.812588544828982, + "grad_norm": 0.2649358808994293, + "learning_rate": 4.324741192397619e-06, + "loss": 0.1878, + "step": 8956 + }, + { + "epoch": 1.8127909330095122, + "grad_norm": 0.23399528861045837, + "learning_rate": 4.315493239424606e-06, + "loss": 0.1813, + "step": 8957 + }, + { + "epoch": 1.8129933211900426, + "grad_norm": 0.2612743079662323, + "learning_rate": 4.306254966805967e-06, + "loss": 0.2021, + "step": 8958 + }, + { + "epoch": 1.8131957093705728, + "grad_norm": 0.35977280139923096, + "learning_rate": 4.297026375476365e-06, + "loss": 0.2231, + "step": 8959 + }, + { + "epoch": 1.813398097551103, + "grad_norm": 0.28002238273620605, + "learning_rate": 4.28780746636942e-06, + "loss": 0.1699, + "step": 8960 + }, + { + "epoch": 1.8136004857316332, + "grad_norm": 0.2709221839904785, + "learning_rate": 4.278598240417842e-06, + "loss": 0.1805, + "step": 8961 + }, + { + "epoch": 1.8138028739121634, + "grad_norm": 0.2538873553276062, + "learning_rate": 4.269398698553284e-06, + "loss": 0.1682, + "step": 8962 + }, + { + "epoch": 1.8140052620926936, + "grad_norm": 0.26883426308631897, + "learning_rate": 4.260208841706481e-06, + "loss": 0.1853, + "step": 8963 + }, + { + "epoch": 1.814207650273224, + "grad_norm": 0.24153414368629456, + "learning_rate": 4.251028670807156e-06, + "loss": 0.1588, + "step": 8964 + }, + { + "epoch": 1.8144100384537543, + "grad_norm": 0.2933860719203949, + "learning_rate": 4.241858186784064e-06, + "loss": 0.2111, + "step": 8965 + }, + { + "epoch": 1.8146124266342847, + "grad_norm": 0.27167898416519165, + "learning_rate": 4.232697390564988e-06, + "loss": 0.2109, + "step": 8966 + }, + { + "epoch": 1.8148148148148149, + "grad_norm": 0.2836179733276367, + "learning_rate": 4.223546283076718e-06, + "loss": 0.1851, + "step": 8967 + }, + { + "epoch": 1.815017202995345, + "grad_norm": 0.25831085443496704, + "learning_rate": 4.2144048652450585e-06, + "loss": 0.183, + "step": 8968 + }, + { + "epoch": 1.8152195911758753, + "grad_norm": 0.2679770886898041, + "learning_rate": 4.2052731379948475e-06, + "loss": 0.1804, + "step": 8969 + }, + { + "epoch": 1.8154219793564055, + "grad_norm": 0.2566496431827545, + "learning_rate": 4.1961511022499345e-06, + "loss": 0.1776, + "step": 8970 + }, + { + "epoch": 1.8156243675369357, + "grad_norm": 0.28995993733406067, + "learning_rate": 4.187038758933204e-06, + "loss": 0.2231, + "step": 8971 + }, + { + "epoch": 1.815826755717466, + "grad_norm": 0.2585195302963257, + "learning_rate": 4.177936108966529e-06, + "loss": 0.1795, + "step": 8972 + }, + { + "epoch": 1.8160291438979965, + "grad_norm": 0.28282400965690613, + "learning_rate": 4.1688431532708404e-06, + "loss": 0.1899, + "step": 8973 + }, + { + "epoch": 1.8162315320785267, + "grad_norm": 0.2926957905292511, + "learning_rate": 4.159759892766047e-06, + "loss": 0.1828, + "step": 8974 + }, + { + "epoch": 1.816433920259057, + "grad_norm": 0.3157510459423065, + "learning_rate": 4.150686328371112e-06, + "loss": 0.1972, + "step": 8975 + }, + { + "epoch": 1.8166363084395871, + "grad_norm": 0.2653707265853882, + "learning_rate": 4.141622461003991e-06, + "loss": 0.1639, + "step": 8976 + }, + { + "epoch": 1.8168386966201173, + "grad_norm": 0.28877976536750793, + "learning_rate": 4.132568291581684e-06, + "loss": 0.1908, + "step": 8977 + }, + { + "epoch": 1.8170410848006475, + "grad_norm": 0.30027449131011963, + "learning_rate": 4.123523821020192e-06, + "loss": 0.1952, + "step": 8978 + }, + { + "epoch": 1.817243472981178, + "grad_norm": 0.3052767515182495, + "learning_rate": 4.1144890502345375e-06, + "loss": 0.2201, + "step": 8979 + }, + { + "epoch": 1.8174458611617081, + "grad_norm": 0.314216285943985, + "learning_rate": 4.105463980138769e-06, + "loss": 0.1787, + "step": 8980 + }, + { + "epoch": 1.8176482493422386, + "grad_norm": 0.30276334285736084, + "learning_rate": 4.096448611645942e-06, + "loss": 0.1971, + "step": 8981 + }, + { + "epoch": 1.8178506375227688, + "grad_norm": 0.24939769506454468, + "learning_rate": 4.087442945668152e-06, + "loss": 0.1796, + "step": 8982 + }, + { + "epoch": 1.818053025703299, + "grad_norm": 0.28983643651008606, + "learning_rate": 4.078446983116468e-06, + "loss": 0.1915, + "step": 8983 + }, + { + "epoch": 1.8182554138838292, + "grad_norm": 0.25924208760261536, + "learning_rate": 4.0694607249010304e-06, + "loss": 0.1679, + "step": 8984 + }, + { + "epoch": 1.8184578020643594, + "grad_norm": 0.24220941960811615, + "learning_rate": 4.060484171930978e-06, + "loss": 0.155, + "step": 8985 + }, + { + "epoch": 1.8186601902448896, + "grad_norm": 0.3046566843986511, + "learning_rate": 4.051517325114451e-06, + "loss": 0.2165, + "step": 8986 + }, + { + "epoch": 1.81886257842542, + "grad_norm": 0.26422369480133057, + "learning_rate": 4.0425601853586125e-06, + "loss": 0.1877, + "step": 8987 + }, + { + "epoch": 1.8190649666059502, + "grad_norm": 0.3298000991344452, + "learning_rate": 4.033612753569682e-06, + "loss": 0.2056, + "step": 8988 + }, + { + "epoch": 1.8192673547864806, + "grad_norm": 0.3131459951400757, + "learning_rate": 4.0246750306528354e-06, + "loss": 0.2212, + "step": 8989 + }, + { + "epoch": 1.8194697429670108, + "grad_norm": 0.2667044401168823, + "learning_rate": 4.015747017512317e-06, + "loss": 0.1741, + "step": 8990 + }, + { + "epoch": 1.819672131147541, + "grad_norm": 0.3156639039516449, + "learning_rate": 4.0068287150513696e-06, + "loss": 0.2316, + "step": 8991 + }, + { + "epoch": 1.8198745193280712, + "grad_norm": 0.2932990491390228, + "learning_rate": 3.997920124172238e-06, + "loss": 0.1958, + "step": 8992 + }, + { + "epoch": 1.8200769075086014, + "grad_norm": 0.2587626278400421, + "learning_rate": 3.989021245776214e-06, + "loss": 0.1702, + "step": 8993 + }, + { + "epoch": 1.8202792956891316, + "grad_norm": 0.29920312762260437, + "learning_rate": 3.980132080763588e-06, + "loss": 0.2167, + "step": 8994 + }, + { + "epoch": 1.820481683869662, + "grad_norm": 0.3583579957485199, + "learning_rate": 3.971252630033684e-06, + "loss": 0.2383, + "step": 8995 + }, + { + "epoch": 1.8206840720501922, + "grad_norm": 0.336424320936203, + "learning_rate": 3.9623828944848065e-06, + "loss": 0.2016, + "step": 8996 + }, + { + "epoch": 1.8208864602307226, + "grad_norm": 0.24210865795612335, + "learning_rate": 3.953522875014326e-06, + "loss": 0.1853, + "step": 8997 + }, + { + "epoch": 1.8210888484112528, + "grad_norm": 0.3300243616104126, + "learning_rate": 3.944672572518582e-06, + "loss": 0.1933, + "step": 8998 + }, + { + "epoch": 1.821291236591783, + "grad_norm": 0.2903205156326294, + "learning_rate": 3.935831987892979e-06, + "loss": 0.1797, + "step": 8999 + }, + { + "epoch": 1.8214936247723132, + "grad_norm": 0.3007589876651764, + "learning_rate": 3.927001122031915e-06, + "loss": 0.2058, + "step": 9000 + }, + { + "epoch": 1.8214936247723132, + "eval_loss": 0.25774043798446655, + "eval_runtime": 0.7398, + "eval_samples_per_second": 6.759, + "eval_steps_per_second": 1.352, + "step": 9000 + }, + { + "epoch": 1.8216960129528434, + "grad_norm": 0.30178171396255493, + "learning_rate": 3.918179975828784e-06, + "loss": 0.1917, + "step": 9001 + }, + { + "epoch": 1.8218984011333736, + "grad_norm": 0.29727664589881897, + "learning_rate": 3.909368550176029e-06, + "loss": 0.183, + "step": 9002 + }, + { + "epoch": 1.822100789313904, + "grad_norm": 0.26325443387031555, + "learning_rate": 3.900566845965104e-06, + "loss": 0.1653, + "step": 9003 + }, + { + "epoch": 1.8223031774944345, + "grad_norm": 0.25132912397384644, + "learning_rate": 3.891774864086451e-06, + "loss": 0.1707, + "step": 9004 + }, + { + "epoch": 1.8225055656749647, + "grad_norm": 0.27996230125427246, + "learning_rate": 3.8829926054295805e-06, + "loss": 0.2224, + "step": 9005 + }, + { + "epoch": 1.822707953855495, + "grad_norm": 0.25456923246383667, + "learning_rate": 3.874220070882972e-06, + "loss": 0.1832, + "step": 9006 + }, + { + "epoch": 1.822910342036025, + "grad_norm": 0.3332825005054474, + "learning_rate": 3.865457261334138e-06, + "loss": 0.1774, + "step": 9007 + }, + { + "epoch": 1.8231127302165553, + "grad_norm": 0.28720322251319885, + "learning_rate": 3.856704177669612e-06, + "loss": 0.1957, + "step": 9008 + }, + { + "epoch": 1.8233151183970855, + "grad_norm": 0.37506118416786194, + "learning_rate": 3.847960820774932e-06, + "loss": 0.1914, + "step": 9009 + }, + { + "epoch": 1.823517506577616, + "grad_norm": 0.25758567452430725, + "learning_rate": 3.839227191534666e-06, + "loss": 0.1718, + "step": 9010 + }, + { + "epoch": 1.823719894758146, + "grad_norm": 0.28181517124176025, + "learning_rate": 3.830503290832388e-06, + "loss": 0.1774, + "step": 9011 + }, + { + "epoch": 1.8239222829386765, + "grad_norm": 0.3112618327140808, + "learning_rate": 3.82178911955069e-06, + "loss": 0.2347, + "step": 9012 + }, + { + "epoch": 1.8241246711192067, + "grad_norm": 0.3159734904766083, + "learning_rate": 3.8130846785711773e-06, + "loss": 0.2079, + "step": 9013 + }, + { + "epoch": 1.824327059299737, + "grad_norm": 0.287130206823349, + "learning_rate": 3.804389968774491e-06, + "loss": 0.2008, + "step": 9014 + }, + { + "epoch": 1.8245294474802671, + "grad_norm": 0.27288955450057983, + "learning_rate": 3.7957049910402497e-06, + "loss": 0.1735, + "step": 9015 + }, + { + "epoch": 1.8247318356607973, + "grad_norm": 0.2727169990539551, + "learning_rate": 3.7870297462471282e-06, + "loss": 0.2139, + "step": 9016 + }, + { + "epoch": 1.8249342238413275, + "grad_norm": 0.29491594433784485, + "learning_rate": 3.77836423527278e-06, + "loss": 0.1806, + "step": 9017 + }, + { + "epoch": 1.825136612021858, + "grad_norm": 0.3176809847354889, + "learning_rate": 3.7697084589938924e-06, + "loss": 0.2092, + "step": 9018 + }, + { + "epoch": 1.8253390002023882, + "grad_norm": 0.33551716804504395, + "learning_rate": 3.7610624182861655e-06, + "loss": 0.186, + "step": 9019 + }, + { + "epoch": 1.8255413883829186, + "grad_norm": 0.29789623618125916, + "learning_rate": 3.7524261140243322e-06, + "loss": 0.1917, + "step": 9020 + }, + { + "epoch": 1.8257437765634488, + "grad_norm": 0.29452013969421387, + "learning_rate": 3.7437995470821052e-06, + "loss": 0.1992, + "step": 9021 + }, + { + "epoch": 1.825946164743979, + "grad_norm": 0.27145907282829285, + "learning_rate": 3.735182718332231e-06, + "loss": 0.1697, + "step": 9022 + }, + { + "epoch": 1.8261485529245092, + "grad_norm": 0.2831342816352844, + "learning_rate": 3.726575628646478e-06, + "loss": 0.201, + "step": 9023 + }, + { + "epoch": 1.8263509411050394, + "grad_norm": 0.2772304117679596, + "learning_rate": 3.7179782788956175e-06, + "loss": 0.1909, + "step": 9024 + }, + { + "epoch": 1.8265533292855696, + "grad_norm": 0.26404857635498047, + "learning_rate": 3.7093906699494417e-06, + "loss": 0.1797, + "step": 9025 + }, + { + "epoch": 1.8267557174661, + "grad_norm": 0.2647900879383087, + "learning_rate": 3.7008128026767453e-06, + "loss": 0.1886, + "step": 9026 + }, + { + "epoch": 1.8269581056466302, + "grad_norm": 0.3391728103160858, + "learning_rate": 3.692244677945356e-06, + "loss": 0.1792, + "step": 9027 + }, + { + "epoch": 1.8271604938271606, + "grad_norm": 0.29189401865005493, + "learning_rate": 3.6836862966221243e-06, + "loss": 0.1718, + "step": 9028 + }, + { + "epoch": 1.8273628820076908, + "grad_norm": 0.31552425026893616, + "learning_rate": 3.6751376595728582e-06, + "loss": 0.2288, + "step": 9029 + }, + { + "epoch": 1.827565270188221, + "grad_norm": 0.3222467601299286, + "learning_rate": 3.6665987676624323e-06, + "loss": 0.2479, + "step": 9030 + }, + { + "epoch": 1.8277676583687512, + "grad_norm": 0.3233198821544647, + "learning_rate": 3.658069621754734e-06, + "loss": 0.214, + "step": 9031 + }, + { + "epoch": 1.8279700465492814, + "grad_norm": 0.2774644196033478, + "learning_rate": 3.6495502227126387e-06, + "loss": 0.1885, + "step": 9032 + }, + { + "epoch": 1.8281724347298116, + "grad_norm": 0.28501439094543457, + "learning_rate": 3.641040571398069e-06, + "loss": 0.2028, + "step": 9033 + }, + { + "epoch": 1.828374822910342, + "grad_norm": 0.2721464931964874, + "learning_rate": 3.6325406686719352e-06, + "loss": 0.199, + "step": 9034 + }, + { + "epoch": 1.8285772110908725, + "grad_norm": 0.28608623147010803, + "learning_rate": 3.6240505153941506e-06, + "loss": 0.188, + "step": 9035 + }, + { + "epoch": 1.8287795992714027, + "grad_norm": 0.2906154692173004, + "learning_rate": 3.615570112423683e-06, + "loss": 0.1663, + "step": 9036 + }, + { + "epoch": 1.8289819874519329, + "grad_norm": 0.2678082287311554, + "learning_rate": 3.607099460618479e-06, + "loss": 0.1792, + "step": 9037 + }, + { + "epoch": 1.829184375632463, + "grad_norm": 0.28244009613990784, + "learning_rate": 3.59863856083551e-06, + "loss": 0.207, + "step": 9038 + }, + { + "epoch": 1.8293867638129933, + "grad_norm": 0.30563104152679443, + "learning_rate": 3.590187413930768e-06, + "loss": 0.2205, + "step": 9039 + }, + { + "epoch": 1.8295891519935235, + "grad_norm": 0.2915782928466797, + "learning_rate": 3.581746020759247e-06, + "loss": 0.208, + "step": 9040 + }, + { + "epoch": 1.8297915401740539, + "grad_norm": 0.3015349507331848, + "learning_rate": 3.5733143821749636e-06, + "loss": 0.1832, + "step": 9041 + }, + { + "epoch": 1.829993928354584, + "grad_norm": 0.3021952509880066, + "learning_rate": 3.564892499030925e-06, + "loss": 0.1899, + "step": 9042 + }, + { + "epoch": 1.8301963165351145, + "grad_norm": 0.31338658928871155, + "learning_rate": 3.5564803721791827e-06, + "loss": 0.1869, + "step": 9043 + }, + { + "epoch": 1.8303987047156447, + "grad_norm": 0.3176608085632324, + "learning_rate": 3.5480780024707894e-06, + "loss": 0.2072, + "step": 9044 + }, + { + "epoch": 1.830601092896175, + "grad_norm": 0.28486374020576477, + "learning_rate": 3.539685390755809e-06, + "loss": 0.1892, + "step": 9045 + }, + { + "epoch": 1.830803481076705, + "grad_norm": 0.26669222116470337, + "learning_rate": 3.5313025378833077e-06, + "loss": 0.1965, + "step": 9046 + }, + { + "epoch": 1.8310058692572353, + "grad_norm": 0.28664642572402954, + "learning_rate": 3.5229294447013838e-06, + "loss": 0.1861, + "step": 9047 + }, + { + "epoch": 1.8312082574377655, + "grad_norm": 0.2668553292751312, + "learning_rate": 3.5145661120571384e-06, + "loss": 0.1649, + "step": 9048 + }, + { + "epoch": 1.831410645618296, + "grad_norm": 0.3332071304321289, + "learning_rate": 3.506212540796683e-06, + "loss": 0.2077, + "step": 9049 + }, + { + "epoch": 1.8316130337988261, + "grad_norm": 0.2512947618961334, + "learning_rate": 3.4978687317651526e-06, + "loss": 0.1923, + "step": 9050 + }, + { + "epoch": 1.8316130337988261, + "eval_loss": 0.2575944662094116, + "eval_runtime": 0.7391, + "eval_samples_per_second": 6.765, + "eval_steps_per_second": 1.353, + "step": 9050 + }, + { + "epoch": 1.8318154219793565, + "grad_norm": 0.2591105103492737, + "learning_rate": 3.4895346858066724e-06, + "loss": 0.1553, + "step": 9051 + }, + { + "epoch": 1.8320178101598867, + "grad_norm": 0.26791900396347046, + "learning_rate": 3.481210403764401e-06, + "loss": 0.1818, + "step": 9052 + }, + { + "epoch": 1.832220198340417, + "grad_norm": 0.32195326685905457, + "learning_rate": 3.4728958864804984e-06, + "loss": 0.1723, + "step": 9053 + }, + { + "epoch": 1.8324225865209471, + "grad_norm": 0.2889711260795593, + "learning_rate": 3.4645911347961357e-06, + "loss": 0.1628, + "step": 9054 + }, + { + "epoch": 1.8326249747014773, + "grad_norm": 0.3247166574001312, + "learning_rate": 3.456296149551519e-06, + "loss": 0.2015, + "step": 9055 + }, + { + "epoch": 1.8328273628820075, + "grad_norm": 0.2901359498500824, + "learning_rate": 3.4480109315858324e-06, + "loss": 0.1748, + "step": 9056 + }, + { + "epoch": 1.833029751062538, + "grad_norm": 0.2965649664402008, + "learning_rate": 3.439735481737283e-06, + "loss": 0.22, + "step": 9057 + }, + { + "epoch": 1.8332321392430682, + "grad_norm": 0.303994357585907, + "learning_rate": 3.4314698008431123e-06, + "loss": 0.2208, + "step": 9058 + }, + { + "epoch": 1.8334345274235986, + "grad_norm": 0.2678375840187073, + "learning_rate": 3.4232138897395406e-06, + "loss": 0.1935, + "step": 9059 + }, + { + "epoch": 1.8336369156041288, + "grad_norm": 0.3268527388572693, + "learning_rate": 3.4149677492618214e-06, + "loss": 0.2103, + "step": 9060 + }, + { + "epoch": 1.833839303784659, + "grad_norm": 0.2670746445655823, + "learning_rate": 3.406731380244199e-06, + "loss": 0.1804, + "step": 9061 + }, + { + "epoch": 1.8340416919651892, + "grad_norm": 0.27483952045440674, + "learning_rate": 3.3985047835199624e-06, + "loss": 0.1703, + "step": 9062 + }, + { + "epoch": 1.8342440801457194, + "grad_norm": 0.3195999562740326, + "learning_rate": 3.3902879599213897e-06, + "loss": 0.2326, + "step": 9063 + }, + { + "epoch": 1.8344464683262498, + "grad_norm": 0.2567335069179535, + "learning_rate": 3.382080910279761e-06, + "loss": 0.1817, + "step": 9064 + }, + { + "epoch": 1.83464885650678, + "grad_norm": 0.2286807745695114, + "learning_rate": 3.37388363542539e-06, + "loss": 0.1377, + "step": 9065 + }, + { + "epoch": 1.8348512446873104, + "grad_norm": 0.2931678593158722, + "learning_rate": 3.3656961361875795e-06, + "loss": 0.2191, + "step": 9066 + }, + { + "epoch": 1.8350536328678406, + "grad_norm": 0.25825655460357666, + "learning_rate": 3.3575184133946668e-06, + "loss": 0.178, + "step": 9067 + }, + { + "epoch": 1.8352560210483708, + "grad_norm": 0.2457646280527115, + "learning_rate": 3.3493504678739797e-06, + "loss": 0.1439, + "step": 9068 + }, + { + "epoch": 1.835458409228901, + "grad_norm": 0.30730950832366943, + "learning_rate": 3.3411923004518674e-06, + "loss": 0.2097, + "step": 9069 + }, + { + "epoch": 1.8356607974094312, + "grad_norm": 0.31206050515174866, + "learning_rate": 3.333043911953693e-06, + "loss": 0.1768, + "step": 9070 + }, + { + "epoch": 1.8358631855899614, + "grad_norm": 0.3024093210697174, + "learning_rate": 3.324905303203818e-06, + "loss": 0.2128, + "step": 9071 + }, + { + "epoch": 1.8360655737704918, + "grad_norm": 0.2619602084159851, + "learning_rate": 3.316776475025629e-06, + "loss": 0.1696, + "step": 9072 + }, + { + "epoch": 1.836267961951022, + "grad_norm": 0.2904742658138275, + "learning_rate": 3.3086574282415127e-06, + "loss": 0.1849, + "step": 9073 + }, + { + "epoch": 1.8364703501315525, + "grad_norm": 0.29145240783691406, + "learning_rate": 3.3005481636728676e-06, + "loss": 0.1838, + "step": 9074 + }, + { + "epoch": 1.8366727383120827, + "grad_norm": 0.29646483063697815, + "learning_rate": 3.2924486821400923e-06, + "loss": 0.2048, + "step": 9075 + }, + { + "epoch": 1.8368751264926129, + "grad_norm": 0.28350889682769775, + "learning_rate": 3.284358984462621e-06, + "loss": 0.1887, + "step": 9076 + }, + { + "epoch": 1.837077514673143, + "grad_norm": 0.3916099965572357, + "learning_rate": 3.2762790714588876e-06, + "loss": 0.1887, + "step": 9077 + }, + { + "epoch": 1.8372799028536733, + "grad_norm": 0.28508061170578003, + "learning_rate": 3.268208943946327e-06, + "loss": 0.2175, + "step": 9078 + }, + { + "epoch": 1.8374822910342035, + "grad_norm": 0.26994675397872925, + "learning_rate": 3.260148602741386e-06, + "loss": 0.1734, + "step": 9079 + }, + { + "epoch": 1.8376846792147339, + "grad_norm": 0.2726169526576996, + "learning_rate": 3.2520980486595353e-06, + "loss": 0.1934, + "step": 9080 + }, + { + "epoch": 1.837887067395264, + "grad_norm": 0.25494077801704407, + "learning_rate": 3.244057282515234e-06, + "loss": 0.182, + "step": 9081 + }, + { + "epoch": 1.8380894555757945, + "grad_norm": 0.21710167825222015, + "learning_rate": 3.2360263051219643e-06, + "loss": 0.1331, + "step": 9082 + }, + { + "epoch": 1.8382918437563247, + "grad_norm": 0.2714509963989258, + "learning_rate": 3.228005117292232e-06, + "loss": 0.1859, + "step": 9083 + }, + { + "epoch": 1.838494231936855, + "grad_norm": 0.286760151386261, + "learning_rate": 3.219993719837511e-06, + "loss": 0.167, + "step": 9084 + }, + { + "epoch": 1.838696620117385, + "grad_norm": 0.2535327076911926, + "learning_rate": 3.2119921135683405e-06, + "loss": 0.1753, + "step": 9085 + }, + { + "epoch": 1.8388990082979153, + "grad_norm": 0.24537889659404755, + "learning_rate": 3.2040002992942077e-06, + "loss": 0.1687, + "step": 9086 + }, + { + "epoch": 1.8391013964784455, + "grad_norm": 0.29615381360054016, + "learning_rate": 3.1960182778236647e-06, + "loss": 0.2077, + "step": 9087 + }, + { + "epoch": 1.839303784658976, + "grad_norm": 0.3481968641281128, + "learning_rate": 3.188046049964233e-06, + "loss": 0.2454, + "step": 9088 + }, + { + "epoch": 1.8395061728395061, + "grad_norm": 0.32502785325050354, + "learning_rate": 3.180083616522478e-06, + "loss": 0.1733, + "step": 9089 + }, + { + "epoch": 1.8397085610200365, + "grad_norm": 0.22581607103347778, + "learning_rate": 3.1721309783039332e-06, + "loss": 0.1675, + "step": 9090 + }, + { + "epoch": 1.8399109492005667, + "grad_norm": 0.2769971191883087, + "learning_rate": 3.1641881361131664e-06, + "loss": 0.1906, + "step": 9091 + }, + { + "epoch": 1.840113337381097, + "grad_norm": 0.30919790267944336, + "learning_rate": 3.156255090753757e-06, + "loss": 0.2084, + "step": 9092 + }, + { + "epoch": 1.8403157255616271, + "grad_norm": 0.2568013668060303, + "learning_rate": 3.148331843028296e-06, + "loss": 0.1692, + "step": 9093 + }, + { + "epoch": 1.8405181137421573, + "grad_norm": 0.2942313551902771, + "learning_rate": 3.1404183937383647e-06, + "loss": 0.1977, + "step": 9094 + }, + { + "epoch": 1.8407205019226878, + "grad_norm": 0.2699166536331177, + "learning_rate": 3.1325147436845783e-06, + "loss": 0.1425, + "step": 9095 + }, + { + "epoch": 1.840922890103218, + "grad_norm": 0.30012571811676025, + "learning_rate": 3.1246208936665188e-06, + "loss": 0.2031, + "step": 9096 + }, + { + "epoch": 1.8411252782837484, + "grad_norm": 0.3417207896709442, + "learning_rate": 3.1167368444828147e-06, + "loss": 0.1909, + "step": 9097 + }, + { + "epoch": 1.8413276664642786, + "grad_norm": 0.29664674401283264, + "learning_rate": 3.108862596931095e-06, + "loss": 0.1856, + "step": 9098 + }, + { + "epoch": 1.8415300546448088, + "grad_norm": 0.37633007764816284, + "learning_rate": 3.100998151807988e-06, + "loss": 0.1895, + "step": 9099 + }, + { + "epoch": 1.841732442825339, + "grad_norm": 0.2773500978946686, + "learning_rate": 3.0931435099091466e-06, + "loss": 0.1801, + "step": 9100 + }, + { + "epoch": 1.841732442825339, + "eval_loss": 0.2573607563972473, + "eval_runtime": 0.7401, + "eval_samples_per_second": 6.756, + "eval_steps_per_second": 1.351, + "step": 9100 + }, + { + "epoch": 1.8419348310058692, + "grad_norm": 0.28869545459747314, + "learning_rate": 3.085298672029202e-06, + "loss": 0.2102, + "step": 9101 + }, + { + "epoch": 1.8421372191863994, + "grad_norm": 0.28606685996055603, + "learning_rate": 3.0774636389618192e-06, + "loss": 0.2018, + "step": 9102 + }, + { + "epoch": 1.8423396073669298, + "grad_norm": 0.3356720507144928, + "learning_rate": 3.0696384114996757e-06, + "loss": 0.2135, + "step": 9103 + }, + { + "epoch": 1.84254199554746, + "grad_norm": 0.3090665936470032, + "learning_rate": 3.0618229904344485e-06, + "loss": 0.1787, + "step": 9104 + }, + { + "epoch": 1.8427443837279904, + "grad_norm": 0.2992224097251892, + "learning_rate": 3.054017376556795e-06, + "loss": 0.2351, + "step": 9105 + }, + { + "epoch": 1.8429467719085206, + "grad_norm": 0.268134742975235, + "learning_rate": 3.0462215706564267e-06, + "loss": 0.1458, + "step": 9106 + }, + { + "epoch": 1.8431491600890508, + "grad_norm": 0.31659549474716187, + "learning_rate": 3.038435573522036e-06, + "loss": 0.175, + "step": 9107 + }, + { + "epoch": 1.843351548269581, + "grad_norm": 0.282866507768631, + "learning_rate": 3.0306593859413255e-06, + "loss": 0.1694, + "step": 9108 + }, + { + "epoch": 1.8435539364501112, + "grad_norm": 0.27301478385925293, + "learning_rate": 3.022893008701011e-06, + "loss": 0.1973, + "step": 9109 + }, + { + "epoch": 1.8437563246306414, + "grad_norm": 0.23165518045425415, + "learning_rate": 3.015136442586819e-06, + "loss": 0.1689, + "step": 9110 + }, + { + "epoch": 1.8439587128111719, + "grad_norm": 0.31681936979293823, + "learning_rate": 3.0073896883834663e-06, + "loss": 0.2378, + "step": 9111 + }, + { + "epoch": 1.844161100991702, + "grad_norm": 0.3050534129142761, + "learning_rate": 2.9996527468746925e-06, + "loss": 0.1851, + "step": 9112 + }, + { + "epoch": 1.8443634891722325, + "grad_norm": 0.33265596628189087, + "learning_rate": 2.9919256188432387e-06, + "loss": 0.2079, + "step": 9113 + }, + { + "epoch": 1.8445658773527627, + "grad_norm": 0.29722699522972107, + "learning_rate": 2.984208305070857e-06, + "loss": 0.2271, + "step": 9114 + }, + { + "epoch": 1.8447682655332929, + "grad_norm": 0.2777019441127777, + "learning_rate": 2.9765008063383117e-06, + "loss": 0.1677, + "step": 9115 + }, + { + "epoch": 1.844970653713823, + "grad_norm": 0.2719971239566803, + "learning_rate": 2.9688031234253565e-06, + "loss": 0.187, + "step": 9116 + }, + { + "epoch": 1.8451730418943533, + "grad_norm": 0.279680073261261, + "learning_rate": 2.961115257110769e-06, + "loss": 0.1779, + "step": 9117 + }, + { + "epoch": 1.8453754300748835, + "grad_norm": 0.2744757831096649, + "learning_rate": 2.953437208172316e-06, + "loss": 0.1692, + "step": 9118 + }, + { + "epoch": 1.845577818255414, + "grad_norm": 0.2708338499069214, + "learning_rate": 2.945768977386787e-06, + "loss": 0.1935, + "step": 9119 + }, + { + "epoch": 1.845780206435944, + "grad_norm": 0.2639225125312805, + "learning_rate": 2.9381105655299724e-06, + "loss": 0.1543, + "step": 9120 + }, + { + "epoch": 1.8459825946164745, + "grad_norm": 0.24041591584682465, + "learning_rate": 2.930461973376675e-06, + "loss": 0.1803, + "step": 9121 + }, + { + "epoch": 1.8461849827970047, + "grad_norm": 0.23384606838226318, + "learning_rate": 2.9228232017006864e-06, + "loss": 0.1645, + "step": 9122 + }, + { + "epoch": 1.846387370977535, + "grad_norm": 0.34278208017349243, + "learning_rate": 2.915194251274844e-06, + "loss": 0.1817, + "step": 9123 + }, + { + "epoch": 1.8465897591580651, + "grad_norm": 0.3258562684059143, + "learning_rate": 2.9075751228709312e-06, + "loss": 0.2113, + "step": 9124 + }, + { + "epoch": 1.8467921473385953, + "grad_norm": 0.3236311078071594, + "learning_rate": 2.8999658172597975e-06, + "loss": 0.1951, + "step": 9125 + }, + { + "epoch": 1.8469945355191257, + "grad_norm": 0.2938937246799469, + "learning_rate": 2.8923663352112606e-06, + "loss": 0.1901, + "step": 9126 + }, + { + "epoch": 1.847196923699656, + "grad_norm": 0.3299955725669861, + "learning_rate": 2.884776677494161e-06, + "loss": 0.1755, + "step": 9127 + }, + { + "epoch": 1.8473993118801864, + "grad_norm": 0.26704564690589905, + "learning_rate": 2.8771968448763396e-06, + "loss": 0.197, + "step": 9128 + }, + { + "epoch": 1.8476017000607166, + "grad_norm": 0.297905832529068, + "learning_rate": 2.8696268381246394e-06, + "loss": 0.1863, + "step": 9129 + }, + { + "epoch": 1.8478040882412468, + "grad_norm": 0.38043296337127686, + "learning_rate": 2.8620666580049247e-06, + "loss": 0.216, + "step": 9130 + }, + { + "epoch": 1.848006476421777, + "grad_norm": 0.2848310172557831, + "learning_rate": 2.85451630528204e-06, + "loss": 0.1913, + "step": 9131 + }, + { + "epoch": 1.8482088646023072, + "grad_norm": 0.3236400783061981, + "learning_rate": 2.8469757807198736e-06, + "loss": 0.2242, + "step": 9132 + }, + { + "epoch": 1.8484112527828374, + "grad_norm": 0.3123624920845032, + "learning_rate": 2.8394450850812714e-06, + "loss": 0.2231, + "step": 9133 + }, + { + "epoch": 1.8486136409633678, + "grad_norm": 0.3278132975101471, + "learning_rate": 2.8319242191281237e-06, + "loss": 0.2337, + "step": 9134 + }, + { + "epoch": 1.848816029143898, + "grad_norm": 0.2576705515384674, + "learning_rate": 2.8244131836213106e-06, + "loss": 0.1761, + "step": 9135 + }, + { + "epoch": 1.8490184173244284, + "grad_norm": 0.292070209980011, + "learning_rate": 2.8169119793207134e-06, + "loss": 0.1743, + "step": 9136 + }, + { + "epoch": 1.8492208055049586, + "grad_norm": 0.31883808970451355, + "learning_rate": 2.809420606985236e-06, + "loss": 0.1821, + "step": 9137 + }, + { + "epoch": 1.8494231936854888, + "grad_norm": 0.28848689794540405, + "learning_rate": 2.801939067372783e-06, + "loss": 0.1736, + "step": 9138 + }, + { + "epoch": 1.849625581866019, + "grad_norm": 0.24862170219421387, + "learning_rate": 2.794467361240238e-06, + "loss": 0.1928, + "step": 9139 + }, + { + "epoch": 1.8498279700465492, + "grad_norm": 0.3228076100349426, + "learning_rate": 2.7870054893435194e-06, + "loss": 0.173, + "step": 9140 + }, + { + "epoch": 1.8500303582270794, + "grad_norm": 0.26641377806663513, + "learning_rate": 2.7795534524375333e-06, + "loss": 0.1851, + "step": 9141 + }, + { + "epoch": 1.8502327464076098, + "grad_norm": 0.2720263600349426, + "learning_rate": 2.7721112512762216e-06, + "loss": 0.1717, + "step": 9142 + }, + { + "epoch": 1.85043513458814, + "grad_norm": 0.29709115624427795, + "learning_rate": 2.7646788866124817e-06, + "loss": 0.1788, + "step": 9143 + }, + { + "epoch": 1.8506375227686704, + "grad_norm": 0.27155938744544983, + "learning_rate": 2.757256359198257e-06, + "loss": 0.1862, + "step": 9144 + }, + { + "epoch": 1.8508399109492006, + "grad_norm": 0.27506211400032043, + "learning_rate": 2.74984366978448e-06, + "loss": 0.2015, + "step": 9145 + }, + { + "epoch": 1.8510422991297308, + "grad_norm": 0.30489999055862427, + "learning_rate": 2.742440819121084e-06, + "loss": 0.2147, + "step": 9146 + }, + { + "epoch": 1.851244687310261, + "grad_norm": 0.3416946530342102, + "learning_rate": 2.735047807957014e-06, + "loss": 0.199, + "step": 9147 + }, + { + "epoch": 1.8514470754907912, + "grad_norm": 0.3143892288208008, + "learning_rate": 2.7276646370402172e-06, + "loss": 0.1983, + "step": 9148 + }, + { + "epoch": 1.8516494636713214, + "grad_norm": 0.27518364787101746, + "learning_rate": 2.7202913071176507e-06, + "loss": 0.1964, + "step": 9149 + }, + { + "epoch": 1.8518518518518519, + "grad_norm": 0.312863290309906, + "learning_rate": 2.7129278189352512e-06, + "loss": 0.179, + "step": 9150 + }, + { + "epoch": 1.8518518518518519, + "eval_loss": 0.2572021782398224, + "eval_runtime": 0.7386, + "eval_samples_per_second": 6.77, + "eval_steps_per_second": 1.354, + "step": 9150 + }, + { + "epoch": 1.852054240032382, + "grad_norm": 0.2847476005554199, + "learning_rate": 2.7055741732380012e-06, + "loss": 0.1939, + "step": 9151 + }, + { + "epoch": 1.8522566282129125, + "grad_norm": 0.3035951852798462, + "learning_rate": 2.6982303707698607e-06, + "loss": 0.2055, + "step": 9152 + }, + { + "epoch": 1.8524590163934427, + "grad_norm": 0.24721133708953857, + "learning_rate": 2.690896412273791e-06, + "loss": 0.1724, + "step": 9153 + }, + { + "epoch": 1.8526614045739729, + "grad_norm": 0.30687880516052246, + "learning_rate": 2.6835722984917764e-06, + "loss": 0.2138, + "step": 9154 + }, + { + "epoch": 1.852863792754503, + "grad_norm": 0.26503756642341614, + "learning_rate": 2.67625803016478e-06, + "loss": 0.1726, + "step": 9155 + }, + { + "epoch": 1.8530661809350333, + "grad_norm": 0.2724968492984772, + "learning_rate": 2.668953608032798e-06, + "loss": 0.1842, + "step": 9156 + }, + { + "epoch": 1.8532685691155637, + "grad_norm": 0.3005676865577698, + "learning_rate": 2.6616590328347958e-06, + "loss": 0.1891, + "step": 9157 + }, + { + "epoch": 1.853470957296094, + "grad_norm": 0.2763330638408661, + "learning_rate": 2.6543743053087823e-06, + "loss": 0.1864, + "step": 9158 + }, + { + "epoch": 1.8536733454766243, + "grad_norm": 0.40457960963249207, + "learning_rate": 2.6470994261917347e-06, + "loss": 0.1893, + "step": 9159 + }, + { + "epoch": 1.8538757336571545, + "grad_norm": 0.33288365602493286, + "learning_rate": 2.639834396219654e-06, + "loss": 0.2259, + "step": 9160 + }, + { + "epoch": 1.8540781218376847, + "grad_norm": 0.28434813022613525, + "learning_rate": 2.63257921612754e-06, + "loss": 0.1996, + "step": 9161 + }, + { + "epoch": 1.854280510018215, + "grad_norm": 0.3212646245956421, + "learning_rate": 2.625333886649417e-06, + "loss": 0.178, + "step": 9162 + }, + { + "epoch": 1.8544828981987451, + "grad_norm": 0.2832084894180298, + "learning_rate": 2.6180984085182547e-06, + "loss": 0.1968, + "step": 9163 + }, + { + "epoch": 1.8546852863792753, + "grad_norm": 0.26873666048049927, + "learning_rate": 2.610872782466067e-06, + "loss": 0.175, + "step": 9164 + }, + { + "epoch": 1.8548876745598057, + "grad_norm": 0.28573164343833923, + "learning_rate": 2.603657009223892e-06, + "loss": 0.172, + "step": 9165 + }, + { + "epoch": 1.855090062740336, + "grad_norm": 0.29353541135787964, + "learning_rate": 2.596451089521734e-06, + "loss": 0.1608, + "step": 9166 + }, + { + "epoch": 1.8552924509208664, + "grad_norm": 0.2839539051055908, + "learning_rate": 2.5892550240885995e-06, + "loss": 0.1626, + "step": 9167 + }, + { + "epoch": 1.8554948391013966, + "grad_norm": 0.41530534625053406, + "learning_rate": 2.582068813652527e-06, + "loss": 0.1724, + "step": 9168 + }, + { + "epoch": 1.8556972272819268, + "grad_norm": 0.2706339359283447, + "learning_rate": 2.5748924589405476e-06, + "loss": 0.19, + "step": 9169 + }, + { + "epoch": 1.855899615462457, + "grad_norm": 0.29047369956970215, + "learning_rate": 2.5677259606786684e-06, + "loss": 0.1829, + "step": 9170 + }, + { + "epoch": 1.8561020036429872, + "grad_norm": 0.24929776787757874, + "learning_rate": 2.5605693195919323e-06, + "loss": 0.1668, + "step": 9171 + }, + { + "epoch": 1.8563043918235174, + "grad_norm": 0.26415640115737915, + "learning_rate": 2.5534225364043706e-06, + "loss": 0.1611, + "step": 9172 + }, + { + "epoch": 1.8565067800040478, + "grad_norm": 0.28007614612579346, + "learning_rate": 2.5462856118390277e-06, + "loss": 0.1789, + "step": 9173 + }, + { + "epoch": 1.856709168184578, + "grad_norm": 0.2696763873100281, + "learning_rate": 2.5391585466179257e-06, + "loss": 0.2041, + "step": 9174 + }, + { + "epoch": 1.8569115563651084, + "grad_norm": 0.25559699535369873, + "learning_rate": 2.5320413414621323e-06, + "loss": 0.17, + "step": 9175 + }, + { + "epoch": 1.8571139445456386, + "grad_norm": 0.28708815574645996, + "learning_rate": 2.524933997091661e-06, + "loss": 0.1763, + "step": 9176 + }, + { + "epoch": 1.8573163327261688, + "grad_norm": 0.25946274399757385, + "learning_rate": 2.51783651422558e-06, + "loss": 0.1492, + "step": 9177 + }, + { + "epoch": 1.857518720906699, + "grad_norm": 0.2698550224304199, + "learning_rate": 2.510748893581938e-06, + "loss": 0.1746, + "step": 9178 + }, + { + "epoch": 1.8577211090872292, + "grad_norm": 0.27411776781082153, + "learning_rate": 2.503671135877772e-06, + "loss": 0.1906, + "step": 9179 + }, + { + "epoch": 1.8579234972677594, + "grad_norm": 0.2979721426963806, + "learning_rate": 2.496603241829143e-06, + "loss": 0.2126, + "step": 9180 + }, + { + "epoch": 1.8581258854482898, + "grad_norm": 0.24646952748298645, + "learning_rate": 2.489545212151112e-06, + "loss": 0.1719, + "step": 9181 + }, + { + "epoch": 1.85832827362882, + "grad_norm": 0.25245559215545654, + "learning_rate": 2.4824970475577305e-06, + "loss": 0.1815, + "step": 9182 + }, + { + "epoch": 1.8585306618093504, + "grad_norm": 0.23951876163482666, + "learning_rate": 2.4754587487620494e-06, + "loss": 0.1746, + "step": 9183 + }, + { + "epoch": 1.8587330499898806, + "grad_norm": 0.33775365352630615, + "learning_rate": 2.4684303164761556e-06, + "loss": 0.2248, + "step": 9184 + }, + { + "epoch": 1.8589354381704108, + "grad_norm": 0.34358781576156616, + "learning_rate": 2.46141175141108e-06, + "loss": 0.1786, + "step": 9185 + }, + { + "epoch": 1.859137826350941, + "grad_norm": 0.3016970753669739, + "learning_rate": 2.4544030542768993e-06, + "loss": 0.2214, + "step": 9186 + }, + { + "epoch": 1.8593402145314712, + "grad_norm": 0.2781483829021454, + "learning_rate": 2.4474042257826902e-06, + "loss": 0.1786, + "step": 9187 + }, + { + "epoch": 1.8595426027120017, + "grad_norm": 0.2879065275192261, + "learning_rate": 2.44041526663652e-06, + "loss": 0.1992, + "step": 9188 + }, + { + "epoch": 1.8597449908925319, + "grad_norm": 0.28307580947875977, + "learning_rate": 2.433436177545445e-06, + "loss": 0.1868, + "step": 9189 + }, + { + "epoch": 1.8599473790730623, + "grad_norm": 0.29029884934425354, + "learning_rate": 2.4264669592155454e-06, + "loss": 0.2209, + "step": 9190 + }, + { + "epoch": 1.8601497672535925, + "grad_norm": 0.2731107771396637, + "learning_rate": 2.41950761235189e-06, + "loss": 0.1885, + "step": 9191 + }, + { + "epoch": 1.8603521554341227, + "grad_norm": 0.29840999841690063, + "learning_rate": 2.412558137658549e-06, + "loss": 0.1925, + "step": 9192 + }, + { + "epoch": 1.860554543614653, + "grad_norm": 0.2856263220310211, + "learning_rate": 2.4056185358386163e-06, + "loss": 0.1725, + "step": 9193 + }, + { + "epoch": 1.860756931795183, + "grad_norm": 0.264995276927948, + "learning_rate": 2.3986888075941404e-06, + "loss": 0.1689, + "step": 9194 + }, + { + "epoch": 1.8609593199757133, + "grad_norm": 0.3229961395263672, + "learning_rate": 2.3917689536262166e-06, + "loss": 0.2027, + "step": 9195 + }, + { + "epoch": 1.8611617081562437, + "grad_norm": 0.2580181658267975, + "learning_rate": 2.384858974634918e-06, + "loss": 0.1565, + "step": 9196 + }, + { + "epoch": 1.861364096336774, + "grad_norm": 0.3221249282360077, + "learning_rate": 2.37795887131933e-06, + "loss": 0.2124, + "step": 9197 + }, + { + "epoch": 1.8615664845173043, + "grad_norm": 0.22325314581394196, + "learning_rate": 2.3710686443775165e-06, + "loss": 0.1501, + "step": 9198 + }, + { + "epoch": 1.8617688726978345, + "grad_norm": 0.292501300573349, + "learning_rate": 2.364188294506575e-06, + "loss": 0.2013, + "step": 9199 + }, + { + "epoch": 1.8619712608783647, + "grad_norm": 0.35212448239326477, + "learning_rate": 2.357317822402583e-06, + "loss": 0.2097, + "step": 9200 + }, + { + "epoch": 1.8619712608783647, + "eval_loss": 0.25725609064102173, + "eval_runtime": 0.7405, + "eval_samples_per_second": 6.752, + "eval_steps_per_second": 1.35, + "step": 9200 + }, + { + "epoch": 1.862173649058895, + "grad_norm": 0.2880713641643524, + "learning_rate": 2.350457228760616e-06, + "loss": 0.1991, + "step": 9201 + }, + { + "epoch": 1.8623760372394251, + "grad_norm": 0.2760334014892578, + "learning_rate": 2.3436065142747652e-06, + "loss": 0.1861, + "step": 9202 + }, + { + "epoch": 1.8625784254199553, + "grad_norm": 0.2789352834224701, + "learning_rate": 2.336765679638109e-06, + "loss": 0.1808, + "step": 9203 + }, + { + "epoch": 1.8627808136004858, + "grad_norm": 0.27005985379219055, + "learning_rate": 2.329934725542737e-06, + "loss": 0.2044, + "step": 9204 + }, + { + "epoch": 1.862983201781016, + "grad_norm": 0.2802645266056061, + "learning_rate": 2.3231136526797204e-06, + "loss": 0.1483, + "step": 9205 + }, + { + "epoch": 1.8631855899615464, + "grad_norm": 0.2909417450428009, + "learning_rate": 2.3163024617391727e-06, + "loss": 0.1796, + "step": 9206 + }, + { + "epoch": 1.8633879781420766, + "grad_norm": 0.33910495042800903, + "learning_rate": 2.309501153410143e-06, + "loss": 0.183, + "step": 9207 + }, + { + "epoch": 1.8635903663226068, + "grad_norm": 0.2519986629486084, + "learning_rate": 2.3027097283807253e-06, + "loss": 0.1665, + "step": 9208 + }, + { + "epoch": 1.863792754503137, + "grad_norm": 0.2841980755329132, + "learning_rate": 2.2959281873380146e-06, + "loss": 0.2132, + "step": 9209 + }, + { + "epoch": 1.8639951426836672, + "grad_norm": 0.29311954975128174, + "learning_rate": 2.2891565309680952e-06, + "loss": 0.2043, + "step": 9210 + }, + { + "epoch": 1.8641975308641974, + "grad_norm": 0.3197910189628601, + "learning_rate": 2.2823947599560525e-06, + "loss": 0.1826, + "step": 9211 + }, + { + "epoch": 1.8643999190447278, + "grad_norm": 0.35433757305145264, + "learning_rate": 2.2756428749859728e-06, + "loss": 0.1838, + "step": 9212 + }, + { + "epoch": 1.864602307225258, + "grad_norm": 0.26283395290374756, + "learning_rate": 2.2689008767409313e-06, + "loss": 0.1839, + "step": 9213 + }, + { + "epoch": 1.8648046954057884, + "grad_norm": 0.30381646752357483, + "learning_rate": 2.2621687659030165e-06, + "loss": 0.2045, + "step": 9214 + }, + { + "epoch": 1.8650070835863186, + "grad_norm": 0.27693313360214233, + "learning_rate": 2.2554465431533168e-06, + "loss": 0.2015, + "step": 9215 + }, + { + "epoch": 1.8652094717668488, + "grad_norm": 0.450054407119751, + "learning_rate": 2.24873420917191e-06, + "loss": 0.1661, + "step": 9216 + }, + { + "epoch": 1.865411859947379, + "grad_norm": 0.2783445417881012, + "learning_rate": 2.2420317646378864e-06, + "loss": 0.1715, + "step": 9217 + }, + { + "epoch": 1.8656142481279092, + "grad_norm": 0.27848151326179504, + "learning_rate": 2.235339210229326e-06, + "loss": 0.1761, + "step": 9218 + }, + { + "epoch": 1.8658166363084396, + "grad_norm": 0.29417315125465393, + "learning_rate": 2.2286565466233087e-06, + "loss": 0.2073, + "step": 9219 + }, + { + "epoch": 1.8660190244889698, + "grad_norm": 0.3033013939857483, + "learning_rate": 2.2219837744959283e-06, + "loss": 0.2171, + "step": 9220 + }, + { + "epoch": 1.8662214126695003, + "grad_norm": 0.27484026551246643, + "learning_rate": 2.2153208945222436e-06, + "loss": 0.1637, + "step": 9221 + }, + { + "epoch": 1.8664238008500305, + "grad_norm": 0.3170156478881836, + "learning_rate": 2.2086679073763607e-06, + "loss": 0.2094, + "step": 9222 + }, + { + "epoch": 1.8666261890305607, + "grad_norm": 0.2751418650150299, + "learning_rate": 2.2020248137313405e-06, + "loss": 0.21, + "step": 9223 + }, + { + "epoch": 1.8668285772110909, + "grad_norm": 0.28255924582481384, + "learning_rate": 2.195391614259257e-06, + "loss": 0.2093, + "step": 9224 + }, + { + "epoch": 1.867030965391621, + "grad_norm": 0.25517529249191284, + "learning_rate": 2.1887683096312062e-06, + "loss": 0.1716, + "step": 9225 + }, + { + "epoch": 1.8672333535721513, + "grad_norm": 0.2793150842189789, + "learning_rate": 2.182154900517264e-06, + "loss": 0.1935, + "step": 9226 + }, + { + "epoch": 1.8674357417526817, + "grad_norm": 0.30712810158729553, + "learning_rate": 2.175551387586494e-06, + "loss": 0.1864, + "step": 9227 + }, + { + "epoch": 1.8676381299332119, + "grad_norm": 0.3432822823524475, + "learning_rate": 2.1689577715069743e-06, + "loss": 0.2281, + "step": 9228 + }, + { + "epoch": 1.8678405181137423, + "grad_norm": 0.24391323328018188, + "learning_rate": 2.1623740529457815e-06, + "loss": 0.1517, + "step": 9229 + }, + { + "epoch": 1.8680429062942725, + "grad_norm": 0.24822361767292023, + "learning_rate": 2.155800232568972e-06, + "loss": 0.1536, + "step": 9230 + }, + { + "epoch": 1.8682452944748027, + "grad_norm": 0.28816094994544983, + "learning_rate": 2.1492363110416357e-06, + "loss": 0.1978, + "step": 9231 + }, + { + "epoch": 1.868447682655333, + "grad_norm": 0.29633474349975586, + "learning_rate": 2.142682289027831e-06, + "loss": 0.1907, + "step": 9232 + }, + { + "epoch": 1.868650070835863, + "grad_norm": 0.4840158522129059, + "learning_rate": 2.1361381671906267e-06, + "loss": 0.1782, + "step": 9233 + }, + { + "epoch": 1.8688524590163933, + "grad_norm": 0.2775403559207916, + "learning_rate": 2.1296039461920825e-06, + "loss": 0.1873, + "step": 9234 + }, + { + "epoch": 1.8690548471969237, + "grad_norm": 0.2747441530227661, + "learning_rate": 2.1230796266932694e-06, + "loss": 0.1781, + "step": 9235 + }, + { + "epoch": 1.869257235377454, + "grad_norm": 0.2542593777179718, + "learning_rate": 2.1165652093542598e-06, + "loss": 0.1633, + "step": 9236 + }, + { + "epoch": 1.8694596235579843, + "grad_norm": 0.2847427725791931, + "learning_rate": 2.1100606948340927e-06, + "loss": 0.1835, + "step": 9237 + }, + { + "epoch": 1.8696620117385145, + "grad_norm": 0.2818686366081238, + "learning_rate": 2.103566083790842e-06, + "loss": 0.1953, + "step": 9238 + }, + { + "epoch": 1.8698643999190447, + "grad_norm": 0.23137100040912628, + "learning_rate": 2.09708137688156e-06, + "loss": 0.1449, + "step": 9239 + }, + { + "epoch": 1.870066788099575, + "grad_norm": 0.2877149283885956, + "learning_rate": 2.0906065747622994e-06, + "loss": 0.2078, + "step": 9240 + }, + { + "epoch": 1.8702691762801051, + "grad_norm": 0.301807165145874, + "learning_rate": 2.084141678088114e-06, + "loss": 0.1845, + "step": 9241 + }, + { + "epoch": 1.8704715644606353, + "grad_norm": 0.28998205065727234, + "learning_rate": 2.0776866875130586e-06, + "loss": 0.184, + "step": 9242 + }, + { + "epoch": 1.8706739526411658, + "grad_norm": 0.24933961033821106, + "learning_rate": 2.0712416036901663e-06, + "loss": 0.1612, + "step": 9243 + }, + { + "epoch": 1.870876340821696, + "grad_norm": 0.3098524212837219, + "learning_rate": 2.064806427271493e-06, + "loss": 0.1954, + "step": 9244 + }, + { + "epoch": 1.8710787290022264, + "grad_norm": 0.2629394829273224, + "learning_rate": 2.0583811589080847e-06, + "loss": 0.1805, + "step": 9245 + }, + { + "epoch": 1.8712811171827566, + "grad_norm": 0.27097082138061523, + "learning_rate": 2.0519657992499884e-06, + "loss": 0.16, + "step": 9246 + }, + { + "epoch": 1.8714835053632868, + "grad_norm": 0.28503546118736267, + "learning_rate": 2.0455603489462405e-06, + "loss": 0.188, + "step": 9247 + }, + { + "epoch": 1.871685893543817, + "grad_norm": 0.30967095494270325, + "learning_rate": 2.0391648086448556e-06, + "loss": 0.2317, + "step": 9248 + }, + { + "epoch": 1.8718882817243472, + "grad_norm": 0.30626732110977173, + "learning_rate": 2.032779178992894e-06, + "loss": 0.2115, + "step": 9249 + }, + { + "epoch": 1.8720906699048776, + "grad_norm": 0.2859762907028198, + "learning_rate": 2.0264034606363835e-06, + "loss": 0.1828, + "step": 9250 + }, + { + "epoch": 1.8720906699048776, + "eval_loss": 0.2571257948875427, + "eval_runtime": 0.7387, + "eval_samples_per_second": 6.768, + "eval_steps_per_second": 1.354, + "step": 9250 + }, + { + "epoch": 1.8722930580854078, + "grad_norm": 0.2790232300758362, + "learning_rate": 2.0200376542203414e-06, + "loss": 0.1982, + "step": 9251 + }, + { + "epoch": 1.8724954462659382, + "grad_norm": 0.28410059213638306, + "learning_rate": 2.013681760388797e-06, + "loss": 0.1754, + "step": 9252 + }, + { + "epoch": 1.8726978344464684, + "grad_norm": 0.2875385284423828, + "learning_rate": 2.0073357797847694e-06, + "loss": 0.1871, + "step": 9253 + }, + { + "epoch": 1.8729002226269986, + "grad_norm": 0.28860172629356384, + "learning_rate": 2.000999713050289e-06, + "loss": 0.1987, + "step": 9254 + }, + { + "epoch": 1.8731026108075288, + "grad_norm": 0.23642873764038086, + "learning_rate": 1.9946735608263543e-06, + "loss": 0.1455, + "step": 9255 + }, + { + "epoch": 1.873304998988059, + "grad_norm": 0.3174664378166199, + "learning_rate": 1.9883573237529985e-06, + "loss": 0.1946, + "step": 9256 + }, + { + "epoch": 1.8735073871685892, + "grad_norm": 0.29666104912757874, + "learning_rate": 1.9820510024692206e-06, + "loss": 0.1958, + "step": 9257 + }, + { + "epoch": 1.8737097753491196, + "grad_norm": 0.32560938596725464, + "learning_rate": 1.975754597613033e-06, + "loss": 0.2262, + "step": 9258 + }, + { + "epoch": 1.8739121635296498, + "grad_norm": 0.25239941477775574, + "learning_rate": 1.9694681098214375e-06, + "loss": 0.1736, + "step": 9259 + }, + { + "epoch": 1.8741145517101803, + "grad_norm": 0.31177282333374023, + "learning_rate": 1.9631915397304355e-06, + "loss": 0.1845, + "step": 9260 + }, + { + "epoch": 1.8743169398907105, + "grad_norm": 0.24597904086112976, + "learning_rate": 1.956924887975031e-06, + "loss": 0.1554, + "step": 9261 + }, + { + "epoch": 1.8745193280712407, + "grad_norm": 0.2527812123298645, + "learning_rate": 1.950668155189206e-06, + "loss": 0.2053, + "step": 9262 + }, + { + "epoch": 1.8747217162517709, + "grad_norm": 0.30786600708961487, + "learning_rate": 1.944421342005964e-06, + "loss": 0.1854, + "step": 9263 + }, + { + "epoch": 1.874924104432301, + "grad_norm": 0.2886776626110077, + "learning_rate": 1.9381844490572786e-06, + "loss": 0.2074, + "step": 9264 + }, + { + "epoch": 1.8751264926128313, + "grad_norm": 0.26758891344070435, + "learning_rate": 1.9319574769741334e-06, + "loss": 0.218, + "step": 9265 + }, + { + "epoch": 1.8753288807933617, + "grad_norm": 0.26394400000572205, + "learning_rate": 1.9257404263865244e-06, + "loss": 0.167, + "step": 9266 + }, + { + "epoch": 1.8755312689738919, + "grad_norm": 0.2897722125053406, + "learning_rate": 1.919533297923415e-06, + "loss": 0.1638, + "step": 9267 + }, + { + "epoch": 1.8757336571544223, + "grad_norm": 0.266205370426178, + "learning_rate": 1.9133360922127806e-06, + "loss": 0.1758, + "step": 9268 + }, + { + "epoch": 1.8759360453349525, + "grad_norm": 0.2297278642654419, + "learning_rate": 1.907148809881587e-06, + "loss": 0.1547, + "step": 9269 + }, + { + "epoch": 1.8761384335154827, + "grad_norm": 0.24826155602931976, + "learning_rate": 1.9009714515557997e-06, + "loss": 0.1586, + "step": 9270 + }, + { + "epoch": 1.876340821696013, + "grad_norm": 0.36072930693626404, + "learning_rate": 1.8948040178603855e-06, + "loss": 0.2092, + "step": 9271 + }, + { + "epoch": 1.876543209876543, + "grad_norm": 0.3244743049144745, + "learning_rate": 1.8886465094192896e-06, + "loss": 0.2082, + "step": 9272 + }, + { + "epoch": 1.8767455980570733, + "grad_norm": 0.27832773327827454, + "learning_rate": 1.8824989268554805e-06, + "loss": 0.1951, + "step": 9273 + }, + { + "epoch": 1.8769479862376037, + "grad_norm": 0.2717227041721344, + "learning_rate": 1.8763612707908828e-06, + "loss": 0.1733, + "step": 9274 + }, + { + "epoch": 1.877150374418134, + "grad_norm": 0.3183591365814209, + "learning_rate": 1.8702335418464556e-06, + "loss": 0.1655, + "step": 9275 + }, + { + "epoch": 1.8773527625986643, + "grad_norm": 0.2324318289756775, + "learning_rate": 1.864115740642125e-06, + "loss": 0.1964, + "step": 9276 + }, + { + "epoch": 1.8775551507791945, + "grad_norm": 0.3259795308113098, + "learning_rate": 1.8580078677968516e-06, + "loss": 0.2207, + "step": 9277 + }, + { + "epoch": 1.8777575389597247, + "grad_norm": 0.253817081451416, + "learning_rate": 1.851909923928541e-06, + "loss": 0.1631, + "step": 9278 + }, + { + "epoch": 1.877959927140255, + "grad_norm": 0.28769010305404663, + "learning_rate": 1.8458219096541218e-06, + "loss": 0.1983, + "step": 9279 + }, + { + "epoch": 1.8781623153207851, + "grad_norm": 0.2674994468688965, + "learning_rate": 1.839743825589535e-06, + "loss": 0.2164, + "step": 9280 + }, + { + "epoch": 1.8783647035013156, + "grad_norm": 0.2848077416419983, + "learning_rate": 1.8336756723496774e-06, + "loss": 0.1822, + "step": 9281 + }, + { + "epoch": 1.8785670916818458, + "grad_norm": 0.3198871910572052, + "learning_rate": 1.8276174505484577e-06, + "loss": 0.1854, + "step": 9282 + }, + { + "epoch": 1.8787694798623762, + "grad_norm": 0.30895861983299255, + "learning_rate": 1.8215691607988084e-06, + "loss": 0.1675, + "step": 9283 + }, + { + "epoch": 1.8789718680429064, + "grad_norm": 0.3190915882587433, + "learning_rate": 1.8155308037126061e-06, + "loss": 0.1914, + "step": 9284 + }, + { + "epoch": 1.8791742562234366, + "grad_norm": 0.28208211064338684, + "learning_rate": 1.8095023799007739e-06, + "loss": 0.1668, + "step": 9285 + }, + { + "epoch": 1.8793766444039668, + "grad_norm": 0.26353633403778076, + "learning_rate": 1.8034838899731787e-06, + "loss": 0.167, + "step": 9286 + }, + { + "epoch": 1.879579032584497, + "grad_norm": 0.24951602518558502, + "learning_rate": 1.797475334538723e-06, + "loss": 0.1764, + "step": 9287 + }, + { + "epoch": 1.8797814207650272, + "grad_norm": 0.3103049397468567, + "learning_rate": 1.7914767142052758e-06, + "loss": 0.209, + "step": 9288 + }, + { + "epoch": 1.8799838089455576, + "grad_norm": 0.25802189111709595, + "learning_rate": 1.7854880295797405e-06, + "loss": 0.1699, + "step": 9289 + }, + { + "epoch": 1.8801861971260878, + "grad_norm": 0.25156068801879883, + "learning_rate": 1.7795092812679548e-06, + "loss": 0.2003, + "step": 9290 + }, + { + "epoch": 1.8803885853066182, + "grad_norm": 0.2825656235218048, + "learning_rate": 1.7735404698748237e-06, + "loss": 0.1887, + "step": 9291 + }, + { + "epoch": 1.8805909734871484, + "grad_norm": 0.2530002295970917, + "learning_rate": 1.7675815960041752e-06, + "loss": 0.1759, + "step": 9292 + }, + { + "epoch": 1.8807933616676786, + "grad_norm": 0.24998198449611664, + "learning_rate": 1.7616326602588828e-06, + "loss": 0.175, + "step": 9293 + }, + { + "epoch": 1.8809957498482088, + "grad_norm": 0.3051765263080597, + "learning_rate": 1.7556936632407983e-06, + "loss": 0.1972, + "step": 9294 + }, + { + "epoch": 1.881198138028739, + "grad_norm": 0.25916385650634766, + "learning_rate": 1.7497646055507633e-06, + "loss": 0.1511, + "step": 9295 + }, + { + "epoch": 1.8814005262092692, + "grad_norm": 0.3032218813896179, + "learning_rate": 1.743845487788609e-06, + "loss": 0.1982, + "step": 9296 + }, + { + "epoch": 1.8816029143897997, + "grad_norm": 0.3098996579647064, + "learning_rate": 1.73793631055319e-06, + "loss": 0.2386, + "step": 9297 + }, + { + "epoch": 1.8818053025703299, + "grad_norm": 0.3222119212150574, + "learning_rate": 1.7320370744423165e-06, + "loss": 0.1864, + "step": 9298 + }, + { + "epoch": 1.8820076907508603, + "grad_norm": 0.31922316551208496, + "learning_rate": 1.7261477800528114e-06, + "loss": 0.204, + "step": 9299 + }, + { + "epoch": 1.8822100789313905, + "grad_norm": 0.29419052600860596, + "learning_rate": 1.7202684279805092e-06, + "loss": 0.1628, + "step": 9300 + }, + { + "epoch": 1.8822100789313905, + "eval_loss": 0.25728121399879456, + "eval_runtime": 0.7402, + "eval_samples_per_second": 6.755, + "eval_steps_per_second": 1.351, + "step": 9300 + }, + { + "epoch": 1.8824124671119207, + "grad_norm": 0.2934373915195465, + "learning_rate": 1.7143990188202007e-06, + "loss": 0.2056, + "step": 9301 + }, + { + "epoch": 1.8826148552924509, + "grad_norm": 0.29484328627586365, + "learning_rate": 1.708539553165711e-06, + "loss": 0.1865, + "step": 9302 + }, + { + "epoch": 1.882817243472981, + "grad_norm": 0.27377191185951233, + "learning_rate": 1.7026900316098215e-06, + "loss": 0.1692, + "step": 9303 + }, + { + "epoch": 1.8830196316535113, + "grad_norm": 0.2880317270755768, + "learning_rate": 1.6968504547443364e-06, + "loss": 0.2183, + "step": 9304 + }, + { + "epoch": 1.8832220198340417, + "grad_norm": 0.2645496428012848, + "learning_rate": 1.6910208231600389e-06, + "loss": 0.1895, + "step": 9305 + }, + { + "epoch": 1.883424408014572, + "grad_norm": 0.2487277090549469, + "learning_rate": 1.6852011374467014e-06, + "loss": 0.1905, + "step": 9306 + }, + { + "epoch": 1.8836267961951023, + "grad_norm": 0.2735969126224518, + "learning_rate": 1.6793913981931198e-06, + "loss": 0.1663, + "step": 9307 + }, + { + "epoch": 1.8838291843756325, + "grad_norm": 0.2630087733268738, + "learning_rate": 1.6735916059870461e-06, + "loss": 0.1747, + "step": 9308 + }, + { + "epoch": 1.8840315725561627, + "grad_norm": 0.2995377779006958, + "learning_rate": 1.6678017614152442e-06, + "loss": 0.181, + "step": 9309 + }, + { + "epoch": 1.884233960736693, + "grad_norm": 0.29509326815605164, + "learning_rate": 1.6620218650634677e-06, + "loss": 0.175, + "step": 9310 + }, + { + "epoch": 1.8844363489172231, + "grad_norm": 0.25882208347320557, + "learning_rate": 1.6562519175164827e-06, + "loss": 0.1562, + "step": 9311 + }, + { + "epoch": 1.8846387370977535, + "grad_norm": 0.2785455286502838, + "learning_rate": 1.6504919193580105e-06, + "loss": 0.176, + "step": 9312 + }, + { + "epoch": 1.8848411252782837, + "grad_norm": 0.31672048568725586, + "learning_rate": 1.6447418711707962e-06, + "loss": 0.2095, + "step": 9313 + }, + { + "epoch": 1.8850435134588142, + "grad_norm": 0.2423751801252365, + "learning_rate": 1.6390017735365637e-06, + "loss": 0.1446, + "step": 9314 + }, + { + "epoch": 1.8852459016393444, + "grad_norm": 0.2900162935256958, + "learning_rate": 1.6332716270360482e-06, + "loss": 0.2023, + "step": 9315 + }, + { + "epoch": 1.8854482898198746, + "grad_norm": 0.25335296988487244, + "learning_rate": 1.6275514322489638e-06, + "loss": 0.167, + "step": 9316 + }, + { + "epoch": 1.8856506780004048, + "grad_norm": 0.26420995593070984, + "learning_rate": 1.6218411897540252e-06, + "loss": 0.2064, + "step": 9317 + }, + { + "epoch": 1.885853066180935, + "grad_norm": 0.27242782711982727, + "learning_rate": 1.616140900128904e-06, + "loss": 0.1954, + "step": 9318 + }, + { + "epoch": 1.8860554543614652, + "grad_norm": 0.24767664074897766, + "learning_rate": 1.6104505639503276e-06, + "loss": 0.1601, + "step": 9319 + }, + { + "epoch": 1.8862578425419956, + "grad_norm": 0.38079896569252014, + "learning_rate": 1.6047701817939687e-06, + "loss": 0.2118, + "step": 9320 + }, + { + "epoch": 1.8864602307225258, + "grad_norm": 0.24454255402088165, + "learning_rate": 1.5990997542345121e-06, + "loss": 0.1811, + "step": 9321 + }, + { + "epoch": 1.8866626189030562, + "grad_norm": 0.2145971655845642, + "learning_rate": 1.5934392818456323e-06, + "loss": 0.1308, + "step": 9322 + }, + { + "epoch": 1.8868650070835864, + "grad_norm": 0.3013664484024048, + "learning_rate": 1.5877887652000045e-06, + "loss": 0.1892, + "step": 9323 + }, + { + "epoch": 1.8870673952641166, + "grad_norm": 0.25066500902175903, + "learning_rate": 1.5821482048692716e-06, + "loss": 0.1711, + "step": 9324 + }, + { + "epoch": 1.8872697834446468, + "grad_norm": 0.31969258189201355, + "learning_rate": 1.576517601424099e-06, + "loss": 0.1989, + "step": 9325 + }, + { + "epoch": 1.887472171625177, + "grad_norm": 0.2970082759857178, + "learning_rate": 1.5708969554341424e-06, + "loss": 0.1757, + "step": 9326 + }, + { + "epoch": 1.8876745598057072, + "grad_norm": 0.25590500235557556, + "learning_rate": 1.5652862674680136e-06, + "loss": 0.1633, + "step": 9327 + }, + { + "epoch": 1.8878769479862376, + "grad_norm": 0.27178964018821716, + "learning_rate": 1.5596855380933584e-06, + "loss": 0.1844, + "step": 9328 + }, + { + "epoch": 1.8880793361667678, + "grad_norm": 0.27887216210365295, + "learning_rate": 1.5540947678768013e-06, + "loss": 0.1773, + "step": 9329 + }, + { + "epoch": 1.8882817243472982, + "grad_norm": 0.2817661464214325, + "learning_rate": 1.5485139573839569e-06, + "loss": 0.1944, + "step": 9330 + }, + { + "epoch": 1.8884841125278284, + "grad_norm": 0.30042463541030884, + "learning_rate": 1.5429431071794175e-06, + "loss": 0.1771, + "step": 9331 + }, + { + "epoch": 1.8886865007083586, + "grad_norm": 0.30515265464782715, + "learning_rate": 1.5373822178268105e-06, + "loss": 0.1774, + "step": 9332 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.2691422402858734, + "learning_rate": 1.5318312898887078e-06, + "loss": 0.1824, + "step": 9333 + }, + { + "epoch": 1.889091277069419, + "grad_norm": 0.28377440571784973, + "learning_rate": 1.5262903239267045e-06, + "loss": 0.1677, + "step": 9334 + }, + { + "epoch": 1.8892936652499492, + "grad_norm": 0.2840723693370819, + "learning_rate": 1.5207593205013748e-06, + "loss": 0.186, + "step": 9335 + }, + { + "epoch": 1.8894960534304797, + "grad_norm": 0.30614086985588074, + "learning_rate": 1.515238280172282e-06, + "loss": 0.2031, + "step": 9336 + }, + { + "epoch": 1.88969844161101, + "grad_norm": 0.27492690086364746, + "learning_rate": 1.5097272034979904e-06, + "loss": 0.1852, + "step": 9337 + }, + { + "epoch": 1.8899008297915403, + "grad_norm": 0.34836092591285706, + "learning_rate": 1.504226091036054e-06, + "loss": 0.1754, + "step": 9338 + }, + { + "epoch": 1.8901032179720705, + "grad_norm": 0.27250251173973083, + "learning_rate": 1.4987349433430165e-06, + "loss": 0.1595, + "step": 9339 + }, + { + "epoch": 1.8903056061526007, + "grad_norm": 0.276806503534317, + "learning_rate": 1.4932537609744112e-06, + "loss": 0.1792, + "step": 9340 + }, + { + "epoch": 1.8905079943331309, + "grad_norm": 0.26508834958076477, + "learning_rate": 1.4877825444847838e-06, + "loss": 0.1708, + "step": 9341 + }, + { + "epoch": 1.890710382513661, + "grad_norm": 0.28936338424682617, + "learning_rate": 1.4823212944276243e-06, + "loss": 0.191, + "step": 9342 + }, + { + "epoch": 1.8909127706941915, + "grad_norm": 0.31687653064727783, + "learning_rate": 1.47687001135548e-06, + "loss": 0.1835, + "step": 9343 + }, + { + "epoch": 1.8911151588747217, + "grad_norm": 0.2686740458011627, + "learning_rate": 1.471428695819821e-06, + "loss": 0.1719, + "step": 9344 + }, + { + "epoch": 1.8913175470552521, + "grad_norm": 0.28184860944747925, + "learning_rate": 1.4659973483711732e-06, + "loss": 0.1813, + "step": 9345 + }, + { + "epoch": 1.8915199352357823, + "grad_norm": 0.2862120568752289, + "learning_rate": 1.460575969558997e-06, + "loss": 0.1912, + "step": 9346 + }, + { + "epoch": 1.8917223234163125, + "grad_norm": 0.2824765145778656, + "learning_rate": 1.4551645599317876e-06, + "loss": 0.1711, + "step": 9347 + }, + { + "epoch": 1.8919247115968427, + "grad_norm": 0.2883681356906891, + "learning_rate": 1.4497631200370066e-06, + "loss": 0.1805, + "step": 9348 + }, + { + "epoch": 1.892127099777373, + "grad_norm": 0.26486730575561523, + "learning_rate": 1.4443716504211168e-06, + "loss": 0.1736, + "step": 9349 + }, + { + "epoch": 1.8923294879579031, + "grad_norm": 0.20992836356163025, + "learning_rate": 1.4389901516295713e-06, + "loss": 0.0989, + "step": 9350 + }, + { + "epoch": 1.8923294879579031, + "eval_loss": 0.25722378492355347, + "eval_runtime": 0.7396, + "eval_samples_per_second": 6.76, + "eval_steps_per_second": 1.352, + "step": 9350 + }, + { + "epoch": 1.8925318761384335, + "grad_norm": 0.27398180961608887, + "learning_rate": 1.4336186242068117e-06, + "loss": 0.1426, + "step": 9351 + }, + { + "epoch": 1.8927342643189637, + "grad_norm": 0.2692059874534607, + "learning_rate": 1.4282570686962705e-06, + "loss": 0.2077, + "step": 9352 + }, + { + "epoch": 1.8929366524994942, + "grad_norm": 0.3125195801258087, + "learning_rate": 1.422905485640391e-06, + "loss": 0.1772, + "step": 9353 + }, + { + "epoch": 1.8931390406800244, + "grad_norm": 0.3143872916698456, + "learning_rate": 1.417563875580563e-06, + "loss": 0.1803, + "step": 9354 + }, + { + "epoch": 1.8933414288605546, + "grad_norm": 0.28542307019233704, + "learning_rate": 1.4122322390572096e-06, + "loss": 0.1928, + "step": 9355 + }, + { + "epoch": 1.8935438170410848, + "grad_norm": 0.30504897236824036, + "learning_rate": 1.4069105766097323e-06, + "loss": 0.2005, + "step": 9356 + }, + { + "epoch": 1.893746205221615, + "grad_norm": 0.2561946511268616, + "learning_rate": 1.401598888776523e-06, + "loss": 0.1427, + "step": 9357 + }, + { + "epoch": 1.8939485934021452, + "grad_norm": 0.29000937938690186, + "learning_rate": 1.3962971760949518e-06, + "loss": 0.2019, + "step": 9358 + }, + { + "epoch": 1.8941509815826756, + "grad_norm": 0.29577577114105225, + "learning_rate": 1.3910054391014005e-06, + "loss": 0.1778, + "step": 9359 + }, + { + "epoch": 1.8943533697632058, + "grad_norm": 0.26925763487815857, + "learning_rate": 1.385723678331219e-06, + "loss": 0.1734, + "step": 9360 + }, + { + "epoch": 1.8945557579437362, + "grad_norm": 0.29018691182136536, + "learning_rate": 1.3804518943187683e-06, + "loss": 0.2129, + "step": 9361 + }, + { + "epoch": 1.8947581461242664, + "grad_norm": 0.28104546666145325, + "learning_rate": 1.3751900875974e-06, + "loss": 0.1839, + "step": 9362 + }, + { + "epoch": 1.8949605343047966, + "grad_norm": 0.25722119212150574, + "learning_rate": 1.3699382586994325e-06, + "loss": 0.183, + "step": 9363 + }, + { + "epoch": 1.8951629224853268, + "grad_norm": 0.2599545419216156, + "learning_rate": 1.3646964081561964e-06, + "loss": 0.1856, + "step": 9364 + }, + { + "epoch": 1.895365310665857, + "grad_norm": 0.2868998944759369, + "learning_rate": 1.359464536498023e-06, + "loss": 0.2232, + "step": 9365 + }, + { + "epoch": 1.8955676988463872, + "grad_norm": 0.31910818815231323, + "learning_rate": 1.3542426442541889e-06, + "loss": 0.1922, + "step": 9366 + }, + { + "epoch": 1.8957700870269176, + "grad_norm": 0.3114806115627289, + "learning_rate": 1.349030731953016e-06, + "loss": 0.2096, + "step": 9367 + }, + { + "epoch": 1.895972475207448, + "grad_norm": 0.28310254216194153, + "learning_rate": 1.3438288001217714e-06, + "loss": 0.1975, + "step": 9368 + }, + { + "epoch": 1.8961748633879782, + "grad_norm": 0.2802981436252594, + "learning_rate": 1.3386368492867451e-06, + "loss": 0.2046, + "step": 9369 + }, + { + "epoch": 1.8963772515685084, + "grad_norm": 0.3002050220966339, + "learning_rate": 1.333454879973206e-06, + "loss": 0.1872, + "step": 9370 + }, + { + "epoch": 1.8965796397490386, + "grad_norm": 0.2511701285839081, + "learning_rate": 1.3282828927054015e-06, + "loss": 0.1808, + "step": 9371 + }, + { + "epoch": 1.8967820279295688, + "grad_norm": 0.29675614833831787, + "learning_rate": 1.3231208880065794e-06, + "loss": 0.1823, + "step": 9372 + }, + { + "epoch": 1.896984416110099, + "grad_norm": 0.2509481608867645, + "learning_rate": 1.3179688663989886e-06, + "loss": 0.1539, + "step": 9373 + }, + { + "epoch": 1.8971868042906295, + "grad_norm": 0.27059900760650635, + "learning_rate": 1.3128268284038347e-06, + "loss": 0.1443, + "step": 9374 + }, + { + "epoch": 1.8973891924711597, + "grad_norm": 0.2761165499687195, + "learning_rate": 1.307694774541368e-06, + "loss": 0.1681, + "step": 9375 + }, + { + "epoch": 1.89759158065169, + "grad_norm": 0.33413439989089966, + "learning_rate": 1.3025727053307624e-06, + "loss": 0.2027, + "step": 9376 + }, + { + "epoch": 1.8977939688322203, + "grad_norm": 0.2996768355369568, + "learning_rate": 1.2974606212902473e-06, + "loss": 0.2213, + "step": 9377 + }, + { + "epoch": 1.8979963570127505, + "grad_norm": 0.3011522591114044, + "learning_rate": 1.2923585229369762e-06, + "loss": 0.1947, + "step": 9378 + }, + { + "epoch": 1.8981987451932807, + "grad_norm": 0.34616440534591675, + "learning_rate": 1.2872664107871467e-06, + "loss": 0.2339, + "step": 9379 + }, + { + "epoch": 1.898401133373811, + "grad_norm": 0.25703170895576477, + "learning_rate": 1.2821842853559252e-06, + "loss": 0.1944, + "step": 9380 + }, + { + "epoch": 1.898603521554341, + "grad_norm": 0.2764713168144226, + "learning_rate": 1.2771121471574555e-06, + "loss": 0.1975, + "step": 9381 + }, + { + "epoch": 1.8988059097348715, + "grad_norm": 0.2664838135242462, + "learning_rate": 1.2720499967049048e-06, + "loss": 0.1742, + "step": 9382 + }, + { + "epoch": 1.8990082979154017, + "grad_norm": 0.30014511942863464, + "learning_rate": 1.2669978345103861e-06, + "loss": 0.1966, + "step": 9383 + }, + { + "epoch": 1.8992106860959321, + "grad_norm": 0.2880495488643646, + "learning_rate": 1.2619556610850346e-06, + "loss": 0.2124, + "step": 9384 + }, + { + "epoch": 1.8994130742764623, + "grad_norm": 0.24450325965881348, + "learning_rate": 1.2569234769389648e-06, + "loss": 0.1728, + "step": 9385 + }, + { + "epoch": 1.8996154624569925, + "grad_norm": 0.28422245383262634, + "learning_rate": 1.2519012825812804e-06, + "loss": 0.1947, + "step": 9386 + }, + { + "epoch": 1.8998178506375227, + "grad_norm": 0.27359914779663086, + "learning_rate": 1.2468890785200637e-06, + "loss": 0.1601, + "step": 9387 + }, + { + "epoch": 1.900020238818053, + "grad_norm": 0.28732553124427795, + "learning_rate": 1.2418868652624093e-06, + "loss": 0.2172, + "step": 9388 + }, + { + "epoch": 1.9002226269985831, + "grad_norm": 0.3102622628211975, + "learning_rate": 1.2368946433143792e-06, + "loss": 0.1719, + "step": 9389 + }, + { + "epoch": 1.9004250151791136, + "grad_norm": 0.2415602207183838, + "learning_rate": 1.2319124131810468e-06, + "loss": 0.1882, + "step": 9390 + }, + { + "epoch": 1.9006274033596438, + "grad_norm": 0.29216253757476807, + "learning_rate": 1.2269401753664533e-06, + "loss": 0.1517, + "step": 9391 + }, + { + "epoch": 1.9008297915401742, + "grad_norm": 0.2574816346168518, + "learning_rate": 1.2219779303736412e-06, + "loss": 0.173, + "step": 9392 + }, + { + "epoch": 1.9010321797207044, + "grad_norm": 0.2932865023612976, + "learning_rate": 1.2170256787046308e-06, + "loss": 0.1946, + "step": 9393 + }, + { + "epoch": 1.9012345679012346, + "grad_norm": 0.24139507114887238, + "learning_rate": 1.2120834208604436e-06, + "loss": 0.1737, + "step": 9394 + }, + { + "epoch": 1.9014369560817648, + "grad_norm": 0.28557559847831726, + "learning_rate": 1.2071511573410909e-06, + "loss": 0.1958, + "step": 9395 + }, + { + "epoch": 1.901639344262295, + "grad_norm": 0.4012734293937683, + "learning_rate": 1.2022288886455512e-06, + "loss": 0.1906, + "step": 9396 + }, + { + "epoch": 1.9018417324428254, + "grad_norm": 0.27204346656799316, + "learning_rate": 1.1973166152718262e-06, + "loss": 0.1798, + "step": 9397 + }, + { + "epoch": 1.9020441206233556, + "grad_norm": 0.29930251836776733, + "learning_rate": 1.192414337716885e-06, + "loss": 0.1721, + "step": 9398 + }, + { + "epoch": 1.902246508803886, + "grad_norm": 0.2984878122806549, + "learning_rate": 1.1875220564766865e-06, + "loss": 0.1977, + "step": 9399 + }, + { + "epoch": 1.9024488969844162, + "grad_norm": 0.29395338892936707, + "learning_rate": 1.182639772046179e-06, + "loss": 0.2079, + "step": 9400 + }, + { + "epoch": 1.9024488969844162, + "eval_loss": 0.2572198510169983, + "eval_runtime": 0.7401, + "eval_samples_per_second": 6.756, + "eval_steps_per_second": 1.351, + "step": 9400 + }, + { + "epoch": 1.9026512851649464, + "grad_norm": 0.27311617136001587, + "learning_rate": 1.17776748491929e-06, + "loss": 0.1829, + "step": 9401 + }, + { + "epoch": 1.9028536733454766, + "grad_norm": 0.26347020268440247, + "learning_rate": 1.1729051955889692e-06, + "loss": 0.1704, + "step": 9402 + }, + { + "epoch": 1.9030560615260068, + "grad_norm": 0.296898752450943, + "learning_rate": 1.1680529045471123e-06, + "loss": 0.2021, + "step": 9403 + }, + { + "epoch": 1.903258449706537, + "grad_norm": 0.311983197927475, + "learning_rate": 1.1632106122846375e-06, + "loss": 0.1891, + "step": 9404 + }, + { + "epoch": 1.9034608378870674, + "grad_norm": 0.29881778359413147, + "learning_rate": 1.15837831929142e-06, + "loss": 0.193, + "step": 9405 + }, + { + "epoch": 1.9036632260675976, + "grad_norm": 0.2944910228252411, + "learning_rate": 1.1535560260563683e-06, + "loss": 0.1827, + "step": 9406 + }, + { + "epoch": 1.903865614248128, + "grad_norm": 0.3343166708946228, + "learning_rate": 1.1487437330673146e-06, + "loss": 0.2165, + "step": 9407 + }, + { + "epoch": 1.9040680024286583, + "grad_norm": 0.28629180788993835, + "learning_rate": 1.143941440811147e-06, + "loss": 0.181, + "step": 9408 + }, + { + "epoch": 1.9042703906091885, + "grad_norm": 0.24283328652381897, + "learning_rate": 1.1391491497736995e-06, + "loss": 0.1493, + "step": 9409 + }, + { + "epoch": 1.9044727787897187, + "grad_norm": 0.25752711296081543, + "learning_rate": 1.134366860439795e-06, + "loss": 0.197, + "step": 9410 + }, + { + "epoch": 1.9046751669702489, + "grad_norm": 0.30904826521873474, + "learning_rate": 1.12959457329328e-06, + "loss": 0.1888, + "step": 9411 + }, + { + "epoch": 1.904877555150779, + "grad_norm": 0.2787197232246399, + "learning_rate": 1.124832288816946e-06, + "loss": 0.1935, + "step": 9412 + }, + { + "epoch": 1.9050799433313095, + "grad_norm": 0.37014058232307434, + "learning_rate": 1.1200800074925855e-06, + "loss": 0.2135, + "step": 9413 + }, + { + "epoch": 1.9052823315118397, + "grad_norm": 0.31408366560935974, + "learning_rate": 1.1153377298010138e-06, + "loss": 0.172, + "step": 9414 + }, + { + "epoch": 1.90548471969237, + "grad_norm": 0.3022351861000061, + "learning_rate": 1.1106054562219691e-06, + "loss": 0.199, + "step": 9415 + }, + { + "epoch": 1.9056871078729003, + "grad_norm": 0.2721509635448456, + "learning_rate": 1.1058831872342357e-06, + "loss": 0.1842, + "step": 9416 + }, + { + "epoch": 1.9058894960534305, + "grad_norm": 0.2608672082424164, + "learning_rate": 1.1011709233155532e-06, + "loss": 0.1817, + "step": 9417 + }, + { + "epoch": 1.9060918842339607, + "grad_norm": 0.2617418169975281, + "learning_rate": 1.0964686649426736e-06, + "loss": 0.2253, + "step": 9418 + }, + { + "epoch": 1.906294272414491, + "grad_norm": 0.2891494631767273, + "learning_rate": 1.0917764125913055e-06, + "loss": 0.2057, + "step": 9419 + }, + { + "epoch": 1.906496660595021, + "grad_norm": 0.22889705002307892, + "learning_rate": 1.087094166736169e-06, + "loss": 0.1462, + "step": 9420 + }, + { + "epoch": 1.9066990487755515, + "grad_norm": 0.240357905626297, + "learning_rate": 1.0824219278509518e-06, + "loss": 0.1588, + "step": 9421 + }, + { + "epoch": 1.9069014369560817, + "grad_norm": 0.23243705928325653, + "learning_rate": 1.0777596964083647e-06, + "loss": 0.1482, + "step": 9422 + }, + { + "epoch": 1.9071038251366121, + "grad_norm": 0.2566569447517395, + "learning_rate": 1.073107472880075e-06, + "loss": 0.1832, + "step": 9423 + }, + { + "epoch": 1.9073062133171423, + "grad_norm": 0.26307451725006104, + "learning_rate": 1.0684652577367394e-06, + "loss": 0.1853, + "step": 9424 + }, + { + "epoch": 1.9075086014976725, + "grad_norm": 0.2929825484752655, + "learning_rate": 1.0638330514480154e-06, + "loss": 0.2002, + "step": 9425 + }, + { + "epoch": 1.9077109896782027, + "grad_norm": 0.23552408814430237, + "learning_rate": 1.0592108544825286e-06, + "loss": 0.1549, + "step": 9426 + }, + { + "epoch": 1.907913377858733, + "grad_norm": 0.30096322298049927, + "learning_rate": 1.0545986673079155e-06, + "loss": 0.2031, + "step": 9427 + }, + { + "epoch": 1.9081157660392634, + "grad_norm": 0.2977411150932312, + "learning_rate": 1.0499964903908033e-06, + "loss": 0.2103, + "step": 9428 + }, + { + "epoch": 1.9083181542197936, + "grad_norm": 0.25064200162887573, + "learning_rate": 1.0454043241967636e-06, + "loss": 0.152, + "step": 9429 + }, + { + "epoch": 1.908520542400324, + "grad_norm": 0.33590954542160034, + "learning_rate": 1.040822169190392e-06, + "loss": 0.2018, + "step": 9430 + }, + { + "epoch": 1.9087229305808542, + "grad_norm": 0.2594203054904938, + "learning_rate": 1.0362500258352725e-06, + "loss": 0.179, + "step": 9431 + }, + { + "epoch": 1.9089253187613844, + "grad_norm": 0.2659510672092438, + "learning_rate": 1.0316878945939579e-06, + "loss": 0.1853, + "step": 9432 + }, + { + "epoch": 1.9091277069419146, + "grad_norm": 0.3166869282722473, + "learning_rate": 1.027135775928001e-06, + "loss": 0.2172, + "step": 9433 + }, + { + "epoch": 1.9093300951224448, + "grad_norm": 0.2751993238925934, + "learning_rate": 1.0225936702979333e-06, + "loss": 0.1769, + "step": 9434 + }, + { + "epoch": 1.909532483302975, + "grad_norm": 0.3165026903152466, + "learning_rate": 1.0180615781632762e-06, + "loss": 0.2001, + "step": 9435 + }, + { + "epoch": 1.9097348714835054, + "grad_norm": 0.296625018119812, + "learning_rate": 1.013539499982552e-06, + "loss": 0.2236, + "step": 9436 + }, + { + "epoch": 1.9099372596640356, + "grad_norm": 0.256110817193985, + "learning_rate": 1.009027436213239e-06, + "loss": 0.1433, + "step": 9437 + }, + { + "epoch": 1.910139647844566, + "grad_norm": 0.22374269366264343, + "learning_rate": 1.0045253873118387e-06, + "loss": 0.1515, + "step": 9438 + }, + { + "epoch": 1.9103420360250962, + "grad_norm": 0.2797078788280487, + "learning_rate": 1.0000333537337981e-06, + "loss": 0.1829, + "step": 9439 + }, + { + "epoch": 1.9105444242056264, + "grad_norm": 0.2668952941894531, + "learning_rate": 9.955513359335978e-07, + "loss": 0.213, + "step": 9440 + }, + { + "epoch": 1.9107468123861566, + "grad_norm": 0.30301371216773987, + "learning_rate": 9.910793343646751e-07, + "loss": 0.204, + "step": 9441 + }, + { + "epoch": 1.9109492005666868, + "grad_norm": 0.26662030816078186, + "learning_rate": 9.866173494794462e-07, + "loss": 0.1984, + "step": 9442 + }, + { + "epoch": 1.911151588747217, + "grad_norm": 0.33214861154556274, + "learning_rate": 9.821653817293498e-07, + "loss": 0.1967, + "step": 9443 + }, + { + "epoch": 1.9113539769277474, + "grad_norm": 0.29728424549102783, + "learning_rate": 9.7772343156477e-07, + "loss": 0.202, + "step": 9444 + }, + { + "epoch": 1.9115563651082776, + "grad_norm": 0.26122555136680603, + "learning_rate": 9.732914994351029e-07, + "loss": 0.1525, + "step": 9445 + }, + { + "epoch": 1.911758753288808, + "grad_norm": 0.26041847467422485, + "learning_rate": 9.688695857887343e-07, + "loss": 0.1637, + "step": 9446 + }, + { + "epoch": 1.9119611414693383, + "grad_norm": 0.3184111714363098, + "learning_rate": 9.644576910730174e-07, + "loss": 0.1806, + "step": 9447 + }, + { + "epoch": 1.9121635296498685, + "grad_norm": 0.37006789445877075, + "learning_rate": 9.600558157342955e-07, + "loss": 0.1855, + "step": 9448 + }, + { + "epoch": 1.9123659178303987, + "grad_norm": 0.24486856162548065, + "learning_rate": 9.556639602179229e-07, + "loss": 0.1956, + "step": 9449 + }, + { + "epoch": 1.9125683060109289, + "grad_norm": 0.2659735679626465, + "learning_rate": 9.512821249682002e-07, + "loss": 0.1842, + "step": 9450 + }, + { + "epoch": 1.9125683060109289, + "eval_loss": 0.2572857439517975, + "eval_runtime": 0.7408, + "eval_samples_per_second": 6.75, + "eval_steps_per_second": 1.35, + "step": 9450 + }, + { + "epoch": 1.912770694191459, + "grad_norm": 0.2926306426525116, + "learning_rate": 9.469103104284505e-07, + "loss": 0.1947, + "step": 9451 + }, + { + "epoch": 1.9129730823719895, + "grad_norm": 0.31365111470222473, + "learning_rate": 9.425485170409642e-07, + "loss": 0.2003, + "step": 9452 + }, + { + "epoch": 1.9131754705525197, + "grad_norm": 0.27288708090782166, + "learning_rate": 9.381967452470219e-07, + "loss": 0.1724, + "step": 9453 + }, + { + "epoch": 1.91337785873305, + "grad_norm": 0.3712238371372223, + "learning_rate": 9.338549954868825e-07, + "loss": 0.1556, + "step": 9454 + }, + { + "epoch": 1.9135802469135803, + "grad_norm": 0.32234710454940796, + "learning_rate": 9.295232681998167e-07, + "loss": 0.2452, + "step": 9455 + }, + { + "epoch": 1.9137826350941105, + "grad_norm": 0.3090566396713257, + "learning_rate": 9.252015638240408e-07, + "loss": 0.2055, + "step": 9456 + }, + { + "epoch": 1.9139850232746407, + "grad_norm": 0.26599106192588806, + "learning_rate": 9.208898827967938e-07, + "loss": 0.1852, + "step": 9457 + }, + { + "epoch": 1.914187411455171, + "grad_norm": 0.27205586433410645, + "learning_rate": 9.165882255542824e-07, + "loss": 0.1567, + "step": 9458 + }, + { + "epoch": 1.9143897996357013, + "grad_norm": 0.23347721993923187, + "learning_rate": 9.122965925317029e-07, + "loss": 0.1577, + "step": 9459 + }, + { + "epoch": 1.9145921878162315, + "grad_norm": 0.24848300218582153, + "learning_rate": 9.080149841632523e-07, + "loss": 0.1838, + "step": 9460 + }, + { + "epoch": 1.914794575996762, + "grad_norm": 0.30119454860687256, + "learning_rate": 9.037434008820733e-07, + "loss": 0.2033, + "step": 9461 + }, + { + "epoch": 1.9149969641772921, + "grad_norm": 0.30918315052986145, + "learning_rate": 8.994818431203311e-07, + "loss": 0.1813, + "step": 9462 + }, + { + "epoch": 1.9151993523578223, + "grad_norm": 0.2923755347728729, + "learning_rate": 8.952303113091697e-07, + "loss": 0.2069, + "step": 9463 + }, + { + "epoch": 1.9154017405383525, + "grad_norm": 0.29802224040031433, + "learning_rate": 8.909888058787008e-07, + "loss": 0.1863, + "step": 9464 + }, + { + "epoch": 1.9156041287188827, + "grad_norm": 0.2790274918079376, + "learning_rate": 8.867573272580587e-07, + "loss": 0.2113, + "step": 9465 + }, + { + "epoch": 1.915806516899413, + "grad_norm": 0.2592965066432953, + "learning_rate": 8.825358758753232e-07, + "loss": 0.1744, + "step": 9466 + }, + { + "epoch": 1.9160089050799434, + "grad_norm": 0.2811659276485443, + "learning_rate": 8.783244521575751e-07, + "loss": 0.1633, + "step": 9467 + }, + { + "epoch": 1.9162112932604736, + "grad_norm": 0.30291372537612915, + "learning_rate": 8.741230565308956e-07, + "loss": 0.187, + "step": 9468 + }, + { + "epoch": 1.916413681441004, + "grad_norm": 0.2720467150211334, + "learning_rate": 8.699316894203224e-07, + "loss": 0.1707, + "step": 9469 + }, + { + "epoch": 1.9166160696215342, + "grad_norm": 0.36524778604507446, + "learning_rate": 8.657503512499055e-07, + "loss": 0.2233, + "step": 9470 + }, + { + "epoch": 1.9168184578020644, + "grad_norm": 0.3040582537651062, + "learning_rate": 8.615790424426618e-07, + "loss": 0.2155, + "step": 9471 + }, + { + "epoch": 1.9170208459825946, + "grad_norm": 0.25449779629707336, + "learning_rate": 8.574177634205982e-07, + "loss": 0.185, + "step": 9472 + }, + { + "epoch": 1.9172232341631248, + "grad_norm": 0.2561799883842468, + "learning_rate": 8.532665146047225e-07, + "loss": 0.1641, + "step": 9473 + }, + { + "epoch": 1.917425622343655, + "grad_norm": 0.28597721457481384, + "learning_rate": 8.491252964149987e-07, + "loss": 0.1991, + "step": 9474 + }, + { + "epoch": 1.9176280105241854, + "grad_norm": 0.2718763053417206, + "learning_rate": 8.449941092704027e-07, + "loss": 0.1786, + "step": 9475 + }, + { + "epoch": 1.9178303987047156, + "grad_norm": 0.29240909218788147, + "learning_rate": 8.408729535888893e-07, + "loss": 0.1701, + "step": 9476 + }, + { + "epoch": 1.918032786885246, + "grad_norm": 0.2624104619026184, + "learning_rate": 8.367618297873692e-07, + "loss": 0.1887, + "step": 9477 + }, + { + "epoch": 1.9182351750657762, + "grad_norm": 0.26697927713394165, + "learning_rate": 8.326607382817875e-07, + "loss": 0.1584, + "step": 9478 + }, + { + "epoch": 1.9184375632463064, + "grad_norm": 0.2614873945713043, + "learning_rate": 8.285696794870457e-07, + "loss": 0.1583, + "step": 9479 + }, + { + "epoch": 1.9186399514268366, + "grad_norm": 0.2983810305595398, + "learning_rate": 8.244886538170238e-07, + "loss": 0.1853, + "step": 9480 + }, + { + "epoch": 1.9188423396073668, + "grad_norm": 0.276559978723526, + "learning_rate": 8.204176616846026e-07, + "loss": 0.1827, + "step": 9481 + }, + { + "epoch": 1.919044727787897, + "grad_norm": 0.2633993923664093, + "learning_rate": 8.163567035016417e-07, + "loss": 0.148, + "step": 9482 + }, + { + "epoch": 1.9192471159684275, + "grad_norm": 0.24610120058059692, + "learning_rate": 8.123057796789901e-07, + "loss": 0.1881, + "step": 9483 + }, + { + "epoch": 1.9194495041489577, + "grad_norm": 0.2540454566478729, + "learning_rate": 8.082648906264756e-07, + "loss": 0.1712, + "step": 9484 + }, + { + "epoch": 1.919651892329488, + "grad_norm": 0.27683067321777344, + "learning_rate": 8.042340367529155e-07, + "loss": 0.2107, + "step": 9485 + }, + { + "epoch": 1.9198542805100183, + "grad_norm": 0.2888847589492798, + "learning_rate": 8.002132184660949e-07, + "loss": 0.1725, + "step": 9486 + }, + { + "epoch": 1.9200566686905485, + "grad_norm": 0.2787870764732361, + "learning_rate": 7.962024361728216e-07, + "loss": 0.1869, + "step": 9487 + }, + { + "epoch": 1.9202590568710787, + "grad_norm": 0.3224725127220154, + "learning_rate": 7.922016902788488e-07, + "loss": 0.2191, + "step": 9488 + }, + { + "epoch": 1.9204614450516089, + "grad_norm": 0.2715289294719696, + "learning_rate": 7.882109811889304e-07, + "loss": 0.1768, + "step": 9489 + }, + { + "epoch": 1.9206638332321393, + "grad_norm": 0.26549339294433594, + "learning_rate": 7.842303093068105e-07, + "loss": 0.1786, + "step": 9490 + }, + { + "epoch": 1.9208662214126695, + "grad_norm": 0.3416059911251068, + "learning_rate": 7.802596750351998e-07, + "loss": 0.2178, + "step": 9491 + }, + { + "epoch": 1.9210686095932, + "grad_norm": 0.2986985743045807, + "learning_rate": 7.762990787758217e-07, + "loss": 0.1809, + "step": 9492 + }, + { + "epoch": 1.9212709977737301, + "grad_norm": 0.3280656039714813, + "learning_rate": 7.723485209293668e-07, + "loss": 0.215, + "step": 9493 + }, + { + "epoch": 1.9214733859542603, + "grad_norm": 0.3719618320465088, + "learning_rate": 7.684080018954931e-07, + "loss": 0.2052, + "step": 9494 + }, + { + "epoch": 1.9216757741347905, + "grad_norm": 0.2605191469192505, + "learning_rate": 7.644775220728817e-07, + "loss": 0.1855, + "step": 9495 + }, + { + "epoch": 1.9218781623153207, + "grad_norm": 0.3293147385120392, + "learning_rate": 7.60557081859159e-07, + "loss": 0.2265, + "step": 9496 + }, + { + "epoch": 1.922080550495851, + "grad_norm": 0.29550546407699585, + "learning_rate": 7.566466816509743e-07, + "loss": 0.2152, + "step": 9497 + }, + { + "epoch": 1.9222829386763813, + "grad_norm": 0.28164321184158325, + "learning_rate": 7.527463218439223e-07, + "loss": 0.1796, + "step": 9498 + }, + { + "epoch": 1.9224853268569115, + "grad_norm": 0.27279436588287354, + "learning_rate": 7.488560028326097e-07, + "loss": 0.1865, + "step": 9499 + }, + { + "epoch": 1.922687715037442, + "grad_norm": 0.2678142189979553, + "learning_rate": 7.449757250106105e-07, + "loss": 0.1624, + "step": 9500 + }, + { + "epoch": 1.922687715037442, + "eval_loss": 0.25728070735931396, + "eval_runtime": 0.7406, + "eval_samples_per_second": 6.751, + "eval_steps_per_second": 1.35, + "step": 9500 + } + ], + "logging_steps": 1, + "max_steps": 9882, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.1164472942121943e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}