{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 200, "global_step": 460, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004347826086956522, "grad_norm": 20.069940351259884, "learning_rate": 9.999883393595949e-06, "loss": 1.0922, "step": 1 }, { "epoch": 0.008695652173913044, "grad_norm": 7.542838269922722, "learning_rate": 9.999533579822611e-06, "loss": 0.9273, "step": 2 }, { "epoch": 0.013043478260869565, "grad_norm": 10.605262141069346, "learning_rate": 9.9989505749962e-06, "loss": 0.799, "step": 3 }, { "epoch": 0.017391304347826087, "grad_norm": 7.717415499184863, "learning_rate": 9.998134406309555e-06, "loss": 0.7944, "step": 4 }, { "epoch": 0.021739130434782608, "grad_norm": 6.667883368554055, "learning_rate": 9.99708511183087e-06, "loss": 0.6172, "step": 5 }, { "epoch": 0.02608695652173913, "grad_norm": 6.631085068879644, "learning_rate": 9.995802740501933e-06, "loss": 0.7764, "step": 6 }, { "epoch": 0.030434782608695653, "grad_norm": 6.400521762953455, "learning_rate": 9.994287352135826e-06, "loss": 0.7722, "step": 7 }, { "epoch": 0.034782608695652174, "grad_norm": 7.272975450725693, "learning_rate": 9.99253901741414e-06, "loss": 0.7411, "step": 8 }, { "epoch": 0.0391304347826087, "grad_norm": 6.864310215078276, "learning_rate": 9.99055781788369e-06, "loss": 0.5343, "step": 9 }, { "epoch": 0.043478260869565216, "grad_norm": 7.712777689064331, "learning_rate": 9.988343845952697e-06, "loss": 0.8379, "step": 10 }, { "epoch": 0.04782608695652174, "grad_norm": 5.0645786594764735, "learning_rate": 9.985897204886481e-06, "loss": 0.6745, "step": 11 }, { "epoch": 0.05217391304347826, "grad_norm": 6.72945128964307, "learning_rate": 9.983218008802648e-06, "loss": 0.7465, "step": 12 }, { "epoch": 0.05652173913043478, "grad_norm": 6.6367606607720795, "learning_rate": 9.98030638266577e-06, "loss": 0.799, "step": 13 }, { "epoch": 0.06086956521739131, "grad_norm": 5.528359723258759, "learning_rate": 9.977162462281544e-06, "loss": 0.7108, "step": 14 }, { "epoch": 0.06521739130434782, "grad_norm": 5.264702955307018, "learning_rate": 9.973786394290475e-06, "loss": 0.7228, "step": 15 }, { "epoch": 0.06956521739130435, "grad_norm": 6.771882163796353, "learning_rate": 9.970178336161018e-06, "loss": 0.6881, "step": 16 }, { "epoch": 0.07391304347826087, "grad_norm": 5.879249880250146, "learning_rate": 9.96633845618225e-06, "loss": 0.8758, "step": 17 }, { "epoch": 0.0782608695652174, "grad_norm": 5.307640811008731, "learning_rate": 9.962266933456008e-06, "loss": 0.5422, "step": 18 }, { "epoch": 0.08260869565217391, "grad_norm": 6.966122541129152, "learning_rate": 9.957963957888542e-06, "loss": 0.8481, "step": 19 }, { "epoch": 0.08695652173913043, "grad_norm": 7.249802700623231, "learning_rate": 9.953429730181653e-06, "loss": 0.7001, "step": 20 }, { "epoch": 0.09130434782608696, "grad_norm": 7.0428833711498084, "learning_rate": 9.94866446182334e-06, "loss": 0.8144, "step": 21 }, { "epoch": 0.09565217391304348, "grad_norm": 10.996358543863554, "learning_rate": 9.943668375077926e-06, "loss": 0.8485, "step": 22 }, { "epoch": 0.1, "grad_norm": 7.0517113465910395, "learning_rate": 9.938441702975689e-06, "loss": 0.9405, "step": 23 }, { "epoch": 0.10434782608695652, "grad_norm": 6.695372202707919, "learning_rate": 9.932984689302012e-06, "loss": 0.9361, "step": 24 }, { "epoch": 0.10869565217391304, "grad_norm": 7.686125990917134, "learning_rate": 9.927297588585984e-06, "loss": 0.7243, "step": 25 }, { "epoch": 0.11304347826086956, "grad_norm": 7.875281863853482, "learning_rate": 9.921380666088558e-06, "loss": 0.6751, "step": 26 }, { "epoch": 0.11739130434782609, "grad_norm": 7.1870240888829535, "learning_rate": 9.915234197790153e-06, "loss": 0.8586, "step": 27 }, { "epoch": 0.12173913043478261, "grad_norm": 5.805469924139845, "learning_rate": 9.908858470377793e-06, "loss": 0.9345, "step": 28 }, { "epoch": 0.12608695652173912, "grad_norm": 5.934847009962585, "learning_rate": 9.902253781231741e-06, "loss": 0.6176, "step": 29 }, { "epoch": 0.13043478260869565, "grad_norm": 5.669652079107864, "learning_rate": 9.895420438411616e-06, "loss": 0.8198, "step": 30 }, { "epoch": 0.13478260869565217, "grad_norm": 6.9068296669610865, "learning_rate": 9.88835876064203e-06, "loss": 0.7229, "step": 31 }, { "epoch": 0.1391304347826087, "grad_norm": 5.589461426898666, "learning_rate": 9.881069077297724e-06, "loss": 0.7851, "step": 32 }, { "epoch": 0.14347826086956522, "grad_norm": 5.9925311921594675, "learning_rate": 9.873551728388203e-06, "loss": 0.7627, "step": 33 }, { "epoch": 0.14782608695652175, "grad_norm": 7.235153096975517, "learning_rate": 9.865807064541878e-06, "loss": 0.7946, "step": 34 }, { "epoch": 0.15217391304347827, "grad_norm": 5.736002072478754, "learning_rate": 9.857835446989708e-06, "loss": 1.0581, "step": 35 }, { "epoch": 0.1565217391304348, "grad_norm": 5.761567311753849, "learning_rate": 9.849637247548356e-06, "loss": 0.506, "step": 36 }, { "epoch": 0.1608695652173913, "grad_norm": 6.72869577475054, "learning_rate": 9.841212848602848e-06, "loss": 0.8714, "step": 37 }, { "epoch": 0.16521739130434782, "grad_norm": 5.813625848755491, "learning_rate": 9.832562643088724e-06, "loss": 0.7487, "step": 38 }, { "epoch": 0.16956521739130434, "grad_norm": 5.842266886816961, "learning_rate": 9.823687034473734e-06, "loss": 0.7917, "step": 39 }, { "epoch": 0.17391304347826086, "grad_norm": 6.1778401108169145, "learning_rate": 9.814586436738998e-06, "loss": 1.0145, "step": 40 }, { "epoch": 0.1782608695652174, "grad_norm": 7.862015045061562, "learning_rate": 9.805261274359705e-06, "loss": 0.7646, "step": 41 }, { "epoch": 0.1826086956521739, "grad_norm": 5.602026315245972, "learning_rate": 9.795711982285317e-06, "loss": 0.8201, "step": 42 }, { "epoch": 0.18695652173913044, "grad_norm": 4.929349987457967, "learning_rate": 9.785939005919279e-06, "loss": 0.7702, "step": 43 }, { "epoch": 0.19130434782608696, "grad_norm": 4.871372648031096, "learning_rate": 9.775942801098241e-06, "loss": 0.6926, "step": 44 }, { "epoch": 0.1956521739130435, "grad_norm": 6.719859832428872, "learning_rate": 9.765723834070805e-06, "loss": 0.8486, "step": 45 }, { "epoch": 0.2, "grad_norm": 4.3370504785567165, "learning_rate": 9.755282581475769e-06, "loss": 0.6513, "step": 46 }, { "epoch": 0.20434782608695654, "grad_norm": 7.762057444144393, "learning_rate": 9.7446195303199e-06, "loss": 0.9406, "step": 47 }, { "epoch": 0.20869565217391303, "grad_norm": 4.551538491515239, "learning_rate": 9.733735177955219e-06, "loss": 0.525, "step": 48 }, { "epoch": 0.21304347826086956, "grad_norm": 6.075760774188708, "learning_rate": 9.722630032055804e-06, "loss": 0.7813, "step": 49 }, { "epoch": 0.21739130434782608, "grad_norm": 4.008956324178376, "learning_rate": 9.711304610594104e-06, "loss": 0.7368, "step": 50 }, { "epoch": 0.2217391304347826, "grad_norm": 4.839599477628889, "learning_rate": 9.699759441816788e-06, "loss": 0.6262, "step": 51 }, { "epoch": 0.22608695652173913, "grad_norm": 6.38115691482247, "learning_rate": 9.687995064220102e-06, "loss": 0.877, "step": 52 }, { "epoch": 0.23043478260869565, "grad_norm": 5.197270085744578, "learning_rate": 9.676012026524755e-06, "loss": 0.6567, "step": 53 }, { "epoch": 0.23478260869565218, "grad_norm": 6.133664705996566, "learning_rate": 9.66381088765032e-06, "loss": 0.8683, "step": 54 }, { "epoch": 0.2391304347826087, "grad_norm": 5.767075885504545, "learning_rate": 9.651392216689167e-06, "loss": 1.0394, "step": 55 }, { "epoch": 0.24347826086956523, "grad_norm": 4.1300106739423255, "learning_rate": 9.638756592879923e-06, "loss": 0.6307, "step": 56 }, { "epoch": 0.24782608695652175, "grad_norm": 5.031145143621685, "learning_rate": 9.625904605580452e-06, "loss": 0.6713, "step": 57 }, { "epoch": 0.25217391304347825, "grad_norm": 4.919887444745777, "learning_rate": 9.61283685424036e-06, "loss": 0.7606, "step": 58 }, { "epoch": 0.2565217391304348, "grad_norm": 6.6285800296846125, "learning_rate": 9.599553948373047e-06, "loss": 0.6534, "step": 59 }, { "epoch": 0.2608695652173913, "grad_norm": 5.774858135760501, "learning_rate": 9.586056507527266e-06, "loss": 0.7715, "step": 60 }, { "epoch": 0.26521739130434785, "grad_norm": 6.723446220459966, "learning_rate": 9.572345161258235e-06, "loss": 0.762, "step": 61 }, { "epoch": 0.26956521739130435, "grad_norm": 5.1899164656824865, "learning_rate": 9.558420549098269e-06, "loss": 0.7894, "step": 62 }, { "epoch": 0.27391304347826084, "grad_norm": 4.6931553923683484, "learning_rate": 9.544283320526943e-06, "loss": 0.6347, "step": 63 }, { "epoch": 0.2782608695652174, "grad_norm": 4.373053450493686, "learning_rate": 9.529934134940819e-06, "loss": 0.6341, "step": 64 }, { "epoch": 0.2826086956521739, "grad_norm": 4.999427694976061, "learning_rate": 9.515373661622665e-06, "loss": 0.7572, "step": 65 }, { "epoch": 0.28695652173913044, "grad_norm": 6.758587468030614, "learning_rate": 9.500602579710256e-06, "loss": 0.665, "step": 66 }, { "epoch": 0.29130434782608694, "grad_norm": 4.891152841404051, "learning_rate": 9.48562157816469e-06, "loss": 0.6409, "step": 67 }, { "epoch": 0.2956521739130435, "grad_norm": 5.097742356761814, "learning_rate": 9.470431355738257e-06, "loss": 0.7526, "step": 68 }, { "epoch": 0.3, "grad_norm": 5.867883989296531, "learning_rate": 9.45503262094184e-06, "loss": 0.9745, "step": 69 }, { "epoch": 0.30434782608695654, "grad_norm": 6.0206884019070115, "learning_rate": 9.439426092011877e-06, "loss": 0.7403, "step": 70 }, { "epoch": 0.30869565217391304, "grad_norm": 7.276309489479921, "learning_rate": 9.423612496876856e-06, "loss": 0.8695, "step": 71 }, { "epoch": 0.3130434782608696, "grad_norm": 5.239327528351938, "learning_rate": 9.407592573123359e-06, "loss": 0.6404, "step": 72 }, { "epoch": 0.3173913043478261, "grad_norm": 5.793898990560427, "learning_rate": 9.39136706796167e-06, "loss": 0.8614, "step": 73 }, { "epoch": 0.3217391304347826, "grad_norm": 6.692327672103591, "learning_rate": 9.374936738190913e-06, "loss": 1.0608, "step": 74 }, { "epoch": 0.32608695652173914, "grad_norm": 5.4432252422315, "learning_rate": 9.358302350163758e-06, "loss": 0.9542, "step": 75 }, { "epoch": 0.33043478260869563, "grad_norm": 6.189724837073554, "learning_rate": 9.341464679750669e-06, "loss": 0.785, "step": 76 }, { "epoch": 0.3347826086956522, "grad_norm": 5.968778873070777, "learning_rate": 9.32442451230373e-06, "loss": 0.704, "step": 77 }, { "epoch": 0.3391304347826087, "grad_norm": 5.739575659080165, "learning_rate": 9.307182642620001e-06, "loss": 0.8448, "step": 78 }, { "epoch": 0.34347826086956523, "grad_norm": 7.1558802535598955, "learning_rate": 9.289739874904448e-06, "loss": 0.7278, "step": 79 }, { "epoch": 0.34782608695652173, "grad_norm": 5.8994399624654505, "learning_rate": 9.272097022732444e-06, "loss": 0.6515, "step": 80 }, { "epoch": 0.3521739130434783, "grad_norm": 5.19589456035667, "learning_rate": 9.254254909011805e-06, "loss": 0.6578, "step": 81 }, { "epoch": 0.3565217391304348, "grad_norm": 6.310044462837421, "learning_rate": 9.236214365944418e-06, "loss": 0.7226, "step": 82 }, { "epoch": 0.36086956521739133, "grad_norm": 7.094343209942033, "learning_rate": 9.217976234987429e-06, "loss": 0.7561, "step": 83 }, { "epoch": 0.3652173913043478, "grad_norm": 5.395238484535205, "learning_rate": 9.199541366813984e-06, "loss": 0.8546, "step": 84 }, { "epoch": 0.3695652173913043, "grad_norm": 6.294601516599205, "learning_rate": 9.180910621273555e-06, "loss": 0.8843, "step": 85 }, { "epoch": 0.3739130434782609, "grad_norm": 6.077039811904942, "learning_rate": 9.16208486735184e-06, "loss": 0.7714, "step": 86 }, { "epoch": 0.3782608695652174, "grad_norm": 6.820954588122634, "learning_rate": 9.14306498313023e-06, "loss": 0.875, "step": 87 }, { "epoch": 0.3826086956521739, "grad_norm": 6.019619698580998, "learning_rate": 9.123851855744842e-06, "loss": 0.8366, "step": 88 }, { "epoch": 0.3869565217391304, "grad_norm": 6.114796321611535, "learning_rate": 9.10444638134516e-06, "loss": 0.7968, "step": 89 }, { "epoch": 0.391304347826087, "grad_norm": 4.636889958853358, "learning_rate": 9.08484946505221e-06, "loss": 0.427, "step": 90 }, { "epoch": 0.39565217391304347, "grad_norm": 5.954435872454598, "learning_rate": 9.065062020916376e-06, "loss": 1.0072, "step": 91 }, { "epoch": 0.4, "grad_norm": 5.724801876279828, "learning_rate": 9.045084971874738e-06, "loss": 0.7092, "step": 92 }, { "epoch": 0.4043478260869565, "grad_norm": 6.8009420865289165, "learning_rate": 9.024919249708034e-06, "loss": 0.6406, "step": 93 }, { "epoch": 0.40869565217391307, "grad_norm": 7.158183866840323, "learning_rate": 9.004565794997209e-06, "loss": 1.1627, "step": 94 }, { "epoch": 0.41304347826086957, "grad_norm": 5.970996901339507, "learning_rate": 8.984025557079523e-06, "loss": 0.7724, "step": 95 }, { "epoch": 0.41739130434782606, "grad_norm": 5.616656204864542, "learning_rate": 8.963299494004292e-06, "loss": 0.8157, "step": 96 }, { "epoch": 0.4217391304347826, "grad_norm": 3.742930995375726, "learning_rate": 8.942388572488188e-06, "loss": 0.8402, "step": 97 }, { "epoch": 0.4260869565217391, "grad_norm": 5.800107420771233, "learning_rate": 8.921293767870157e-06, "loss": 0.9098, "step": 98 }, { "epoch": 0.43043478260869567, "grad_norm": 6.543219958920574, "learning_rate": 8.900016064065923e-06, "loss": 0.9196, "step": 99 }, { "epoch": 0.43478260869565216, "grad_norm": 4.63775639810159, "learning_rate": 8.8785564535221e-06, "loss": 0.6786, "step": 100 }, { "epoch": 0.4391304347826087, "grad_norm": 9.129310067931197, "learning_rate": 8.85691593716989e-06, "loss": 1.0006, "step": 101 }, { "epoch": 0.4434782608695652, "grad_norm": 5.79169101402568, "learning_rate": 8.835095524378413e-06, "loss": 0.7848, "step": 102 }, { "epoch": 0.44782608695652176, "grad_norm": 8.32981156706014, "learning_rate": 8.81309623290762e-06, "loss": 1.0868, "step": 103 }, { "epoch": 0.45217391304347826, "grad_norm": 3.7350886126503804, "learning_rate": 8.790919088860815e-06, "loss": 0.5445, "step": 104 }, { "epoch": 0.45652173913043476, "grad_norm": 6.277328479888874, "learning_rate": 8.768565126636806e-06, "loss": 1.0354, "step": 105 }, { "epoch": 0.4608695652173913, "grad_norm": 5.781414676947912, "learning_rate": 8.746035388881655e-06, "loss": 0.7979, "step": 106 }, { "epoch": 0.4652173913043478, "grad_norm": 5.96663186885216, "learning_rate": 8.723330926440045e-06, "loss": 0.8951, "step": 107 }, { "epoch": 0.46956521739130436, "grad_norm": 4.835300942906349, "learning_rate": 8.70045279830626e-06, "loss": 0.7859, "step": 108 }, { "epoch": 0.47391304347826085, "grad_norm": 4.985942536490541, "learning_rate": 8.677402071574806e-06, "loss": 0.5022, "step": 109 }, { "epoch": 0.4782608695652174, "grad_norm": 3.5391993265706643, "learning_rate": 8.65417982139062e-06, "loss": 0.5089, "step": 110 }, { "epoch": 0.4826086956521739, "grad_norm": 5.27727991244884, "learning_rate": 8.630787130898943e-06, "loss": 0.611, "step": 111 }, { "epoch": 0.48695652173913045, "grad_norm": 4.410902784418352, "learning_rate": 8.60722509119478e-06, "loss": 0.6555, "step": 112 }, { "epoch": 0.49130434782608695, "grad_norm": 5.311875445651678, "learning_rate": 8.583494801272018e-06, "loss": 0.7914, "step": 113 }, { "epoch": 0.4956521739130435, "grad_norm": 5.341460292078737, "learning_rate": 8.559597367972168e-06, "loss": 0.6386, "step": 114 }, { "epoch": 0.5, "grad_norm": 3.8467630962416237, "learning_rate": 8.535533905932739e-06, "loss": 0.706, "step": 115 }, { "epoch": 0.5043478260869565, "grad_norm": 6.272649123388026, "learning_rate": 8.511305537535238e-06, "loss": 0.8418, "step": 116 }, { "epoch": 0.508695652173913, "grad_norm": 4.449361932301055, "learning_rate": 8.48691339285283e-06, "loss": 0.5468, "step": 117 }, { "epoch": 0.5130434782608696, "grad_norm": 7.990771812448774, "learning_rate": 8.462358609597629e-06, "loss": 0.7669, "step": 118 }, { "epoch": 0.5173913043478261, "grad_norm": 5.264147376966948, "learning_rate": 8.437642333067626e-06, "loss": 0.87, "step": 119 }, { "epoch": 0.5217391304347826, "grad_norm": 5.41751513200529, "learning_rate": 8.412765716093273e-06, "loss": 0.9239, "step": 120 }, { "epoch": 0.5260869565217391, "grad_norm": 4.3710732873831635, "learning_rate": 8.387729918983706e-06, "loss": 0.7447, "step": 121 }, { "epoch": 0.5304347826086957, "grad_norm": 4.8604888967385325, "learning_rate": 8.362536109472637e-06, "loss": 0.8853, "step": 122 }, { "epoch": 0.5347826086956522, "grad_norm": 5.885943638280187, "learning_rate": 8.33718546266388e-06, "loss": 1.0368, "step": 123 }, { "epoch": 0.5391304347826087, "grad_norm": 5.215972523152178, "learning_rate": 8.31167916097654e-06, "loss": 0.6797, "step": 124 }, { "epoch": 0.5434782608695652, "grad_norm": 4.5386673376706135, "learning_rate": 8.286018394089864e-06, "loss": 0.819, "step": 125 }, { "epoch": 0.5478260869565217, "grad_norm": 7.394643265606397, "learning_rate": 8.260204358887753e-06, "loss": 0.8769, "step": 126 }, { "epoch": 0.5521739130434783, "grad_norm": 4.309426896828944, "learning_rate": 8.234238259402936e-06, "loss": 0.6467, "step": 127 }, { "epoch": 0.5565217391304348, "grad_norm": 7.0783793651308935, "learning_rate": 8.208121306760806e-06, "loss": 0.9724, "step": 128 }, { "epoch": 0.5608695652173913, "grad_norm": 7.352028331306423, "learning_rate": 8.181854719122938e-06, "loss": 0.8856, "step": 129 }, { "epoch": 0.5652173913043478, "grad_norm": 8.329174594301895, "learning_rate": 8.155439721630265e-06, "loss": 1.1432, "step": 130 }, { "epoch": 0.5695652173913044, "grad_norm": 6.313137339192889, "learning_rate": 8.128877546345932e-06, "loss": 0.7633, "step": 131 }, { "epoch": 0.5739130434782609, "grad_norm": 5.2417272675388675, "learning_rate": 8.102169432197842e-06, "loss": 0.6172, "step": 132 }, { "epoch": 0.5782608695652174, "grad_norm": 6.311371524751384, "learning_rate": 8.075316624920848e-06, "loss": 0.6881, "step": 133 }, { "epoch": 0.5826086956521739, "grad_norm": 4.59066197947503, "learning_rate": 8.048320376998675e-06, "loss": 0.8151, "step": 134 }, { "epoch": 0.5869565217391305, "grad_norm": 5.182510168555989, "learning_rate": 8.021181947605474e-06, "loss": 0.6908, "step": 135 }, { "epoch": 0.591304347826087, "grad_norm": 4.929873250505049, "learning_rate": 7.993902602547113e-06, "loss": 0.7495, "step": 136 }, { "epoch": 0.5956521739130435, "grad_norm": 6.65667604966722, "learning_rate": 7.966483614202127e-06, "loss": 0.8716, "step": 137 }, { "epoch": 0.6, "grad_norm": 4.1724711277921465, "learning_rate": 7.938926261462366e-06, "loss": 0.5766, "step": 138 }, { "epoch": 0.6043478260869565, "grad_norm": 5.2659580658654015, "learning_rate": 7.911231829673356e-06, "loss": 0.7273, "step": 139 }, { "epoch": 0.6086956521739131, "grad_norm": 6.757332780769119, "learning_rate": 7.883401610574338e-06, "loss": 0.6014, "step": 140 }, { "epoch": 0.6130434782608696, "grad_norm": 5.017197202764835, "learning_rate": 7.855436902238018e-06, "loss": 0.8168, "step": 141 }, { "epoch": 0.6173913043478261, "grad_norm": 3.800051418474176, "learning_rate": 7.82733900901003e-06, "loss": 0.7956, "step": 142 }, { "epoch": 0.6217391304347826, "grad_norm": 5.166291160524886, "learning_rate": 7.799109241448091e-06, "loss": 0.9329, "step": 143 }, { "epoch": 0.6260869565217392, "grad_norm": 6.435240811992826, "learning_rate": 7.770748916260875e-06, "loss": 1.0732, "step": 144 }, { "epoch": 0.6304347826086957, "grad_norm": 6.327169651045853, "learning_rate": 7.742259356246594e-06, "loss": 0.7525, "step": 145 }, { "epoch": 0.6347826086956522, "grad_norm": 7.828014215982498, "learning_rate": 7.71364189023131e-06, "loss": 0.9682, "step": 146 }, { "epoch": 0.6391304347826087, "grad_norm": 4.67990423869777, "learning_rate": 7.68489785300694e-06, "loss": 0.7471, "step": 147 }, { "epoch": 0.6434782608695652, "grad_norm": 6.966974153413055, "learning_rate": 7.656028585269017e-06, "loss": 0.8207, "step": 148 }, { "epoch": 0.6478260869565218, "grad_norm": 5.267291157936162, "learning_rate": 7.627035433554138e-06, "loss": 0.7868, "step": 149 }, { "epoch": 0.6521739130434783, "grad_norm": 3.9897462084848985, "learning_rate": 7.597919750177168e-06, "loss": 0.6855, "step": 150 }, { "epoch": 0.6565217391304348, "grad_norm": 6.0251822846734315, "learning_rate": 7.5686828931681646e-06, "loss": 0.708, "step": 151 }, { "epoch": 0.6608695652173913, "grad_norm": 4.6594453655758254, "learning_rate": 7.539326226209032e-06, "loss": 0.7041, "step": 152 }, { "epoch": 0.6652173913043479, "grad_norm": 4.378552460943295, "learning_rate": 7.509851118569915e-06, "loss": 0.7772, "step": 153 }, { "epoch": 0.6695652173913044, "grad_norm": 5.235514230052308, "learning_rate": 7.4802589450453415e-06, "loss": 0.8908, "step": 154 }, { "epoch": 0.6739130434782609, "grad_norm": 6.003166232352742, "learning_rate": 7.450551085890087e-06, "loss": 0.6599, "step": 155 }, { "epoch": 0.6782608695652174, "grad_norm": 6.96693463687554, "learning_rate": 7.420728926754803e-06, "loss": 0.6983, "step": 156 }, { "epoch": 0.6826086956521739, "grad_norm": 6.169751798135285, "learning_rate": 7.390793858621386e-06, "loss": 0.5701, "step": 157 }, { "epoch": 0.6869565217391305, "grad_norm": 4.644138430432932, "learning_rate": 7.360747277738094e-06, "loss": 0.8904, "step": 158 }, { "epoch": 0.691304347826087, "grad_norm": 4.818846895585623, "learning_rate": 7.330590585554428e-06, "loss": 0.7802, "step": 159 }, { "epoch": 0.6956521739130435, "grad_norm": 5.8506930704235485, "learning_rate": 7.300325188655762e-06, "loss": 0.7679, "step": 160 }, { "epoch": 0.7, "grad_norm": 5.1004665200015, "learning_rate": 7.269952498697734e-06, "loss": 0.9647, "step": 161 }, { "epoch": 0.7043478260869566, "grad_norm": 5.183216444935818, "learning_rate": 7.2394739323404105e-06, "loss": 1.0273, "step": 162 }, { "epoch": 0.7086956521739131, "grad_norm": 4.675829262788197, "learning_rate": 7.208890911182198e-06, "loss": 1.0811, "step": 163 }, { "epoch": 0.7130434782608696, "grad_norm": 5.254697834539126, "learning_rate": 7.178204861693546e-06, "loss": 0.7775, "step": 164 }, { "epoch": 0.717391304347826, "grad_norm": 6.4426513627853295, "learning_rate": 7.147417215150411e-06, "loss": 0.9937, "step": 165 }, { "epoch": 0.7217391304347827, "grad_norm": 4.1964116195751116, "learning_rate": 7.116529407567489e-06, "loss": 0.5633, "step": 166 }, { "epoch": 0.7260869565217392, "grad_norm": 9.567832200885338, "learning_rate": 7.085542879631253e-06, "loss": 0.7036, "step": 167 }, { "epoch": 0.7304347826086957, "grad_norm": 5.675320156588058, "learning_rate": 7.054459076632742e-06, "loss": 0.9462, "step": 168 }, { "epoch": 0.7347826086956522, "grad_norm": 6.094382398275439, "learning_rate": 7.0232794484001495e-06, "loss": 0.6914, "step": 169 }, { "epoch": 0.7391304347826086, "grad_norm": 5.727403297838351, "learning_rate": 6.9920054492312086e-06, "loss": 0.7357, "step": 170 }, { "epoch": 0.7434782608695653, "grad_norm": 9.8890170594532, "learning_rate": 6.960638537825352e-06, "loss": 1.1028, "step": 171 }, { "epoch": 0.7478260869565218, "grad_norm": 5.106590647280527, "learning_rate": 6.9291801772156775e-06, "loss": 0.7062, "step": 172 }, { "epoch": 0.7521739130434782, "grad_norm": 4.689025437172321, "learning_rate": 6.89763183470071e-06, "loss": 0.7428, "step": 173 }, { "epoch": 0.7565217391304347, "grad_norm": 4.773319191168081, "learning_rate": 6.865994981775958e-06, "loss": 0.8531, "step": 174 }, { "epoch": 0.7608695652173914, "grad_norm": 5.6351958720310655, "learning_rate": 6.834271094065284e-06, "loss": 0.8763, "step": 175 }, { "epoch": 0.7652173913043478, "grad_norm": 5.450635954315396, "learning_rate": 6.802461651252073e-06, "loss": 0.7972, "step": 176 }, { "epoch": 0.7695652173913043, "grad_norm": 4.476148487838498, "learning_rate": 6.770568137010226e-06, "loss": 0.8685, "step": 177 }, { "epoch": 0.7739130434782608, "grad_norm": 5.862791509115436, "learning_rate": 6.738592038934946e-06, "loss": 0.805, "step": 178 }, { "epoch": 0.7782608695652173, "grad_norm": 7.184841819252107, "learning_rate": 6.706534848473353e-06, "loss": 0.815, "step": 179 }, { "epoch": 0.782608695652174, "grad_norm": 5.671729894985735, "learning_rate": 6.674398060854931e-06, "loss": 0.7554, "step": 180 }, { "epoch": 0.7869565217391304, "grad_norm": 5.64912938658032, "learning_rate": 6.642183175021779e-06, "loss": 0.6776, "step": 181 }, { "epoch": 0.7913043478260869, "grad_norm": 4.264778264903819, "learning_rate": 6.609891693558692e-06, "loss": 0.6693, "step": 182 }, { "epoch": 0.7956521739130434, "grad_norm": 4.313430011704358, "learning_rate": 6.5775251226230855e-06, "loss": 0.7671, "step": 183 }, { "epoch": 0.8, "grad_norm": 4.945139192368261, "learning_rate": 6.545084971874738e-06, "loss": 0.795, "step": 184 }, { "epoch": 0.8043478260869565, "grad_norm": 6.727158152542215, "learning_rate": 6.51257275440538e-06, "loss": 1.1242, "step": 185 }, { "epoch": 0.808695652173913, "grad_norm": 4.78999021690085, "learning_rate": 6.479989986668118e-06, "loss": 0.6795, "step": 186 }, { "epoch": 0.8130434782608695, "grad_norm": 5.14510842920708, "learning_rate": 6.447338188406705e-06, "loss": 0.8969, "step": 187 }, { "epoch": 0.8173913043478261, "grad_norm": 3.749029424520381, "learning_rate": 6.41461888258465e-06, "loss": 0.6552, "step": 188 }, { "epoch": 0.8217391304347826, "grad_norm": 5.15904621321683, "learning_rate": 6.3818335953141955e-06, "loss": 0.8136, "step": 189 }, { "epoch": 0.8260869565217391, "grad_norm": 4.803773818163898, "learning_rate": 6.348983855785122e-06, "loss": 0.8237, "step": 190 }, { "epoch": 0.8304347826086956, "grad_norm": 5.182891820840243, "learning_rate": 6.31607119619343e-06, "loss": 0.5909, "step": 191 }, { "epoch": 0.8347826086956521, "grad_norm": 5.2883602858763235, "learning_rate": 6.283097151669869e-06, "loss": 0.8036, "step": 192 }, { "epoch": 0.8391304347826087, "grad_norm": 5.575481752264669, "learning_rate": 6.250063260208345e-06, "loss": 0.7226, "step": 193 }, { "epoch": 0.8434782608695652, "grad_norm": 4.959328867352831, "learning_rate": 6.216971062594179e-06, "loss": 0.7519, "step": 194 }, { "epoch": 0.8478260869565217, "grad_norm": 7.33419045327728, "learning_rate": 6.183822102332234e-06, "loss": 0.7705, "step": 195 }, { "epoch": 0.8521739130434782, "grad_norm": 4.819663888433419, "learning_rate": 6.1506179255749335e-06, "loss": 0.875, "step": 196 }, { "epoch": 0.8565217391304348, "grad_norm": 4.427401546783955, "learning_rate": 6.1173600810501355e-06, "loss": 0.6443, "step": 197 }, { "epoch": 0.8608695652173913, "grad_norm": 4.106798354300552, "learning_rate": 6.084050119988905e-06, "loss": 0.8034, "step": 198 }, { "epoch": 0.8652173913043478, "grad_norm": 5.960150253865078, "learning_rate": 6.050689596053151e-06, "loss": 0.8404, "step": 199 }, { "epoch": 0.8695652173913043, "grad_norm": 3.929860508285719, "learning_rate": 6.0172800652631706e-06, "loss": 0.6939, "step": 200 }, { "epoch": 0.8695652173913043, "eval_loss": 0.798964262008667, "eval_runtime": 2.9749, "eval_samples_per_second": 6.387, "eval_steps_per_second": 1.681, "step": 200 }, { "epoch": 0.8739130434782608, "grad_norm": 4.59188756563693, "learning_rate": 5.983823085925059e-06, "loss": 0.6078, "step": 201 }, { "epoch": 0.8782608695652174, "grad_norm": 5.005739757058599, "learning_rate": 5.950320218558037e-06, "loss": 0.5929, "step": 202 }, { "epoch": 0.8826086956521739, "grad_norm": 5.345651445814748, "learning_rate": 5.916773025821662e-06, "loss": 0.7753, "step": 203 }, { "epoch": 0.8869565217391304, "grad_norm": 4.336577036436432, "learning_rate": 5.883183072442938e-06, "loss": 0.6188, "step": 204 }, { "epoch": 0.8913043478260869, "grad_norm": 6.421225042043299, "learning_rate": 5.849551925143334e-06, "loss": 0.8011, "step": 205 }, { "epoch": 0.8956521739130435, "grad_norm": 4.789728877148738, "learning_rate": 5.815881152565712e-06, "loss": 0.7802, "step": 206 }, { "epoch": 0.9, "grad_norm": 4.587059004368843, "learning_rate": 5.782172325201155e-06, "loss": 0.953, "step": 207 }, { "epoch": 0.9043478260869565, "grad_norm": 5.009838863672234, "learning_rate": 5.7484270153157215e-06, "loss": 0.6242, "step": 208 }, { "epoch": 0.908695652173913, "grad_norm": 6.424271963786697, "learning_rate": 5.714646796877108e-06, "loss": 0.9939, "step": 209 }, { "epoch": 0.9130434782608695, "grad_norm": 3.971263429130473, "learning_rate": 5.680833245481234e-06, "loss": 0.7929, "step": 210 }, { "epoch": 0.9173913043478261, "grad_norm": 6.726704529221084, "learning_rate": 5.646987938278753e-06, "loss": 0.7622, "step": 211 }, { "epoch": 0.9217391304347826, "grad_norm": 5.6341731609562204, "learning_rate": 5.613112453901493e-06, "loss": 0.815, "step": 212 }, { "epoch": 0.9260869565217391, "grad_norm": 5.064362660354317, "learning_rate": 5.579208372388822e-06, "loss": 0.7597, "step": 213 }, { "epoch": 0.9304347826086956, "grad_norm": 5.760921891007834, "learning_rate": 5.5452772751139496e-06, "loss": 0.7194, "step": 214 }, { "epoch": 0.9347826086956522, "grad_norm": 4.666891611911306, "learning_rate": 5.511320744710171e-06, "loss": 0.651, "step": 215 }, { "epoch": 0.9391304347826087, "grad_norm": 6.267747734622622, "learning_rate": 5.477340364997051e-06, "loss": 0.9107, "step": 216 }, { "epoch": 0.9434782608695652, "grad_norm": 5.863468082482575, "learning_rate": 5.443337720906542e-06, "loss": 0.8784, "step": 217 }, { "epoch": 0.9478260869565217, "grad_norm": 4.161066615088509, "learning_rate": 5.409314398409067e-06, "loss": 0.6682, "step": 218 }, { "epoch": 0.9521739130434783, "grad_norm": 5.171402681222928, "learning_rate": 5.375271984439541e-06, "loss": 0.6462, "step": 219 }, { "epoch": 0.9565217391304348, "grad_norm": 5.096714508227432, "learning_rate": 5.341212066823356e-06, "loss": 0.7665, "step": 220 }, { "epoch": 0.9608695652173913, "grad_norm": 4.7801513361154, "learning_rate": 5.307136234202318e-06, "loss": 0.6241, "step": 221 }, { "epoch": 0.9652173913043478, "grad_norm": 5.916836433268943, "learning_rate": 5.27304607596055e-06, "loss": 1.0114, "step": 222 }, { "epoch": 0.9695652173913043, "grad_norm": 5.0695466084844165, "learning_rate": 5.238943182150361e-06, "loss": 0.6205, "step": 223 }, { "epoch": 0.9739130434782609, "grad_norm": 7.607343695393423, "learning_rate": 5.204829143418072e-06, "loss": 0.7219, "step": 224 }, { "epoch": 0.9782608695652174, "grad_norm": 5.071732375655891, "learning_rate": 5.17070555092984e-06, "loss": 0.6891, "step": 225 }, { "epoch": 0.9826086956521739, "grad_norm": 6.457774215325638, "learning_rate": 5.136573996297431e-06, "loss": 0.9542, "step": 226 }, { "epoch": 0.9869565217391304, "grad_norm": 6.582844550791858, "learning_rate": 5.102436071503983e-06, "loss": 0.8563, "step": 227 }, { "epoch": 0.991304347826087, "grad_norm": 2.936968503934489, "learning_rate": 5.068293368829755e-06, "loss": 0.7973, "step": 228 }, { "epoch": 0.9956521739130435, "grad_norm": 6.489247940312402, "learning_rate": 5.034147480777867e-06, "loss": 0.7773, "step": 229 }, { "epoch": 1.0, "grad_norm": 4.047478627971296, "learning_rate": 5e-06, "loss": 0.8434, "step": 230 }, { "epoch": 1.0043478260869565, "grad_norm": 4.837822183603084, "learning_rate": 4.965852519222135e-06, "loss": 0.5609, "step": 231 }, { "epoch": 1.008695652173913, "grad_norm": 4.082899268222064, "learning_rate": 4.931706631170246e-06, "loss": 0.5027, "step": 232 }, { "epoch": 1.0130434782608695, "grad_norm": 3.843325733590251, "learning_rate": 4.89756392849602e-06, "loss": 0.3658, "step": 233 }, { "epoch": 1.017391304347826, "grad_norm": 4.836952879734927, "learning_rate": 4.863426003702572e-06, "loss": 0.5756, "step": 234 }, { "epoch": 1.0217391304347827, "grad_norm": 4.103173107463818, "learning_rate": 4.829294449070161e-06, "loss": 0.4994, "step": 235 }, { "epoch": 1.0260869565217392, "grad_norm": 3.8903384128887106, "learning_rate": 4.795170856581929e-06, "loss": 0.4887, "step": 236 }, { "epoch": 1.0304347826086957, "grad_norm": 3.08596959753649, "learning_rate": 4.7610568178496405e-06, "loss": 0.4403, "step": 237 }, { "epoch": 1.0347826086956522, "grad_norm": 3.527272426977584, "learning_rate": 4.7269539240394505e-06, "loss": 0.3424, "step": 238 }, { "epoch": 1.0391304347826087, "grad_norm": 3.2374239776762246, "learning_rate": 4.692863765797683e-06, "loss": 0.4145, "step": 239 }, { "epoch": 1.0434782608695652, "grad_norm": 3.4800072353942224, "learning_rate": 4.6587879331766465e-06, "loss": 0.5048, "step": 240 }, { "epoch": 1.0478260869565217, "grad_norm": 3.192418206458333, "learning_rate": 4.624728015560461e-06, "loss": 0.4787, "step": 241 }, { "epoch": 1.0521739130434782, "grad_norm": 5.143086646430212, "learning_rate": 4.5906856015909365e-06, "loss": 0.3719, "step": 242 }, { "epoch": 1.0565217391304347, "grad_norm": 3.8074108245867997, "learning_rate": 4.556662279093461e-06, "loss": 0.3785, "step": 243 }, { "epoch": 1.0608695652173914, "grad_norm": 4.498314324006594, "learning_rate": 4.52265963500295e-06, "loss": 0.552, "step": 244 }, { "epoch": 1.065217391304348, "grad_norm": 4.163738307816856, "learning_rate": 4.488679255289829e-06, "loss": 0.4597, "step": 245 }, { "epoch": 1.0695652173913044, "grad_norm": 4.350615353682191, "learning_rate": 4.454722724886051e-06, "loss": 0.3357, "step": 246 }, { "epoch": 1.0739130434782609, "grad_norm": 3.943016818527937, "learning_rate": 4.4207916276111795e-06, "loss": 0.3514, "step": 247 }, { "epoch": 1.0782608695652174, "grad_norm": 5.2085391198088535, "learning_rate": 4.386887546098509e-06, "loss": 0.413, "step": 248 }, { "epoch": 1.0826086956521739, "grad_norm": 5.6969414567070045, "learning_rate": 4.353012061721249e-06, "loss": 0.3608, "step": 249 }, { "epoch": 1.0869565217391304, "grad_norm": 4.268384815942112, "learning_rate": 4.319166754518768e-06, "loss": 0.288, "step": 250 }, { "epoch": 1.0913043478260869, "grad_norm": 3.579380382838798, "learning_rate": 4.285353203122894e-06, "loss": 0.2789, "step": 251 }, { "epoch": 1.0956521739130434, "grad_norm": 5.219933048553219, "learning_rate": 4.251572984684281e-06, "loss": 0.5852, "step": 252 }, { "epoch": 1.1, "grad_norm": 5.975508571527501, "learning_rate": 4.217827674798845e-06, "loss": 0.4421, "step": 253 }, { "epoch": 1.1043478260869566, "grad_norm": 3.8240302327268174, "learning_rate": 4.18411884743429e-06, "loss": 0.3446, "step": 254 }, { "epoch": 1.108695652173913, "grad_norm": 3.7445910108565044, "learning_rate": 4.150448074856667e-06, "loss": 0.3875, "step": 255 }, { "epoch": 1.1130434782608696, "grad_norm": 4.3241863758181776, "learning_rate": 4.116816927557063e-06, "loss": 0.3562, "step": 256 }, { "epoch": 1.117391304347826, "grad_norm": 4.547780326995007, "learning_rate": 4.083226974178339e-06, "loss": 0.2453, "step": 257 }, { "epoch": 1.1217391304347826, "grad_norm": 3.687859503920098, "learning_rate": 4.0496797814419655e-06, "loss": 0.3146, "step": 258 }, { "epoch": 1.126086956521739, "grad_norm": 4.946712083509019, "learning_rate": 4.016176914074944e-06, "loss": 0.3909, "step": 259 }, { "epoch": 1.1304347826086956, "grad_norm": 5.24606301014812, "learning_rate": 3.982719934736832e-06, "loss": 0.3626, "step": 260 }, { "epoch": 1.134782608695652, "grad_norm": 3.4719423073847078, "learning_rate": 3.949310403946849e-06, "loss": 0.3811, "step": 261 }, { "epoch": 1.1391304347826088, "grad_norm": 4.29696507978264, "learning_rate": 3.915949880011096e-06, "loss": 0.332, "step": 262 }, { "epoch": 1.1434782608695653, "grad_norm": 3.9771307721259475, "learning_rate": 3.882639918949865e-06, "loss": 0.4423, "step": 263 }, { "epoch": 1.1478260869565218, "grad_norm": 3.744900801031998, "learning_rate": 3.849382074425069e-06, "loss": 0.3336, "step": 264 }, { "epoch": 1.1521739130434783, "grad_norm": 4.938081646469259, "learning_rate": 3.816177897667767e-06, "loss": 0.6139, "step": 265 }, { "epoch": 1.1565217391304348, "grad_norm": 6.709242400647857, "learning_rate": 3.7830289374058214e-06, "loss": 0.4085, "step": 266 }, { "epoch": 1.1608695652173913, "grad_norm": 6.894376674623404, "learning_rate": 3.749936739791656e-06, "loss": 0.3985, "step": 267 }, { "epoch": 1.1652173913043478, "grad_norm": 4.117232458690699, "learning_rate": 3.7169028483301333e-06, "loss": 0.3541, "step": 268 }, { "epoch": 1.1695652173913043, "grad_norm": 3.6495749370260957, "learning_rate": 3.6839288038065736e-06, "loss": 0.2845, "step": 269 }, { "epoch": 1.1739130434782608, "grad_norm": 4.0137488559165355, "learning_rate": 3.6510161442148783e-06, "loss": 0.3154, "step": 270 }, { "epoch": 1.1782608695652175, "grad_norm": 4.8152515686913455, "learning_rate": 3.6181664046858045e-06, "loss": 0.4594, "step": 271 }, { "epoch": 1.182608695652174, "grad_norm": 5.202407554106834, "learning_rate": 3.58538111741535e-06, "loss": 0.5321, "step": 272 }, { "epoch": 1.1869565217391305, "grad_norm": 5.208348292849049, "learning_rate": 3.5526618115932974e-06, "loss": 0.4076, "step": 273 }, { "epoch": 1.191304347826087, "grad_norm": 4.524886701172911, "learning_rate": 3.5200100133318836e-06, "loss": 0.6158, "step": 274 }, { "epoch": 1.1956521739130435, "grad_norm": 5.093651871829818, "learning_rate": 3.4874272455946217e-06, "loss": 0.2257, "step": 275 }, { "epoch": 1.2, "grad_norm": 4.022672833100005, "learning_rate": 3.4549150281252635e-06, "loss": 0.3768, "step": 276 }, { "epoch": 1.2043478260869565, "grad_norm": 2.8604068490644874, "learning_rate": 3.4224748773769166e-06, "loss": 0.3304, "step": 277 }, { "epoch": 1.208695652173913, "grad_norm": 3.563416318864363, "learning_rate": 3.39010830644131e-06, "loss": 0.4327, "step": 278 }, { "epoch": 1.2130434782608694, "grad_norm": 5.462342178891127, "learning_rate": 3.357816824978222e-06, "loss": 0.3563, "step": 279 }, { "epoch": 1.2173913043478262, "grad_norm": 5.776352122429311, "learning_rate": 3.3256019391450696e-06, "loss": 0.554, "step": 280 }, { "epoch": 1.2217391304347827, "grad_norm": 4.280137224720186, "learning_rate": 3.2934651515266485e-06, "loss": 0.4447, "step": 281 }, { "epoch": 1.2260869565217392, "grad_norm": 5.855318929352802, "learning_rate": 3.261407961065056e-06, "loss": 0.4613, "step": 282 }, { "epoch": 1.2304347826086957, "grad_norm": 3.759306773035557, "learning_rate": 3.2294318629897746e-06, "loss": 0.41, "step": 283 }, { "epoch": 1.2347826086956522, "grad_norm": 4.325778018741157, "learning_rate": 3.197538348747927e-06, "loss": 0.3822, "step": 284 }, { "epoch": 1.2391304347826086, "grad_norm": 3.271690860245072, "learning_rate": 3.1657289059347184e-06, "loss": 0.3265, "step": 285 }, { "epoch": 1.2434782608695651, "grad_norm": 8.121167597752617, "learning_rate": 3.1340050182240438e-06, "loss": 0.3817, "step": 286 }, { "epoch": 1.2478260869565219, "grad_norm": 6.009272476981181, "learning_rate": 3.1023681652992925e-06, "loss": 0.4827, "step": 287 }, { "epoch": 1.2521739130434781, "grad_norm": 4.551287991303892, "learning_rate": 3.070819822784323e-06, "loss": 0.333, "step": 288 }, { "epoch": 1.2565217391304349, "grad_norm": 4.40493888329962, "learning_rate": 3.03936146217465e-06, "loss": 0.2985, "step": 289 }, { "epoch": 1.2608695652173914, "grad_norm": 7.457602998331675, "learning_rate": 3.007994550768793e-06, "loss": 0.2688, "step": 290 }, { "epoch": 1.2652173913043478, "grad_norm": 3.5245316191041773, "learning_rate": 2.976720551599852e-06, "loss": 0.3413, "step": 291 }, { "epoch": 1.2695652173913043, "grad_norm": 5.458404782644148, "learning_rate": 2.9455409233672594e-06, "loss": 0.5688, "step": 292 }, { "epoch": 1.2739130434782608, "grad_norm": 5.849014777064085, "learning_rate": 2.914457120368748e-06, "loss": 0.5278, "step": 293 }, { "epoch": 1.2782608695652173, "grad_norm": 4.524775615715545, "learning_rate": 2.883470592432512e-06, "loss": 0.3695, "step": 294 }, { "epoch": 1.2826086956521738, "grad_norm": 3.0639485751220197, "learning_rate": 2.8525827848495912e-06, "loss": 0.3673, "step": 295 }, { "epoch": 1.2869565217391306, "grad_norm": 4.854031752617082, "learning_rate": 2.8217951383064546e-06, "loss": 0.3701, "step": 296 }, { "epoch": 1.2913043478260868, "grad_norm": 4.277779062420338, "learning_rate": 2.7911090888178033e-06, "loss": 0.4181, "step": 297 }, { "epoch": 1.2956521739130435, "grad_norm": 4.347824665030716, "learning_rate": 2.760526067659591e-06, "loss": 0.4506, "step": 298 }, { "epoch": 1.3, "grad_norm": 4.68297313331208, "learning_rate": 2.7300475013022666e-06, "loss": 0.45, "step": 299 }, { "epoch": 1.3043478260869565, "grad_norm": 3.0283941903778895, "learning_rate": 2.6996748113442397e-06, "loss": 0.2597, "step": 300 }, { "epoch": 1.308695652173913, "grad_norm": 4.491981965834742, "learning_rate": 2.669409414445574e-06, "loss": 0.382, "step": 301 }, { "epoch": 1.3130434782608695, "grad_norm": 3.990717626507738, "learning_rate": 2.6392527222619078e-06, "loss": 0.3516, "step": 302 }, { "epoch": 1.317391304347826, "grad_norm": 4.136276302818288, "learning_rate": 2.6092061413786158e-06, "loss": 0.3829, "step": 303 }, { "epoch": 1.3217391304347825, "grad_norm": 6.106967961277055, "learning_rate": 2.5792710732452e-06, "loss": 0.4728, "step": 304 }, { "epoch": 1.3260869565217392, "grad_norm": 3.5028685987260175, "learning_rate": 2.5494489141099155e-06, "loss": 0.2598, "step": 305 }, { "epoch": 1.3304347826086955, "grad_norm": 4.559833842039468, "learning_rate": 2.5197410549546598e-06, "loss": 0.253, "step": 306 }, { "epoch": 1.3347826086956522, "grad_norm": 4.5448409327910415, "learning_rate": 2.4901488814300855e-06, "loss": 0.4016, "step": 307 }, { "epoch": 1.3391304347826087, "grad_norm": 3.0979476281469265, "learning_rate": 2.4606737737909696e-06, "loss": 0.2321, "step": 308 }, { "epoch": 1.3434782608695652, "grad_norm": 5.981685141898299, "learning_rate": 2.431317106831836e-06, "loss": 0.5274, "step": 309 }, { "epoch": 1.3478260869565217, "grad_norm": 2.98685529750352, "learning_rate": 2.4020802498228333e-06, "loss": 0.396, "step": 310 }, { "epoch": 1.3521739130434782, "grad_norm": 5.125431488555637, "learning_rate": 2.3729645664458637e-06, "loss": 0.505, "step": 311 }, { "epoch": 1.3565217391304347, "grad_norm": 5.094394216531229, "learning_rate": 2.3439714147309845e-06, "loss": 0.4012, "step": 312 }, { "epoch": 1.3608695652173912, "grad_norm": 4.289708162043688, "learning_rate": 2.315102146993061e-06, "loss": 0.2589, "step": 313 }, { "epoch": 1.365217391304348, "grad_norm": 4.3432839576954745, "learning_rate": 2.286358109768693e-06, "loss": 0.3326, "step": 314 }, { "epoch": 1.3695652173913042, "grad_norm": 4.063212467164958, "learning_rate": 2.2577406437534055e-06, "loss": 0.2893, "step": 315 }, { "epoch": 1.373913043478261, "grad_norm": 3.1836830488379033, "learning_rate": 2.229251083739127e-06, "loss": 0.2955, "step": 316 }, { "epoch": 1.3782608695652174, "grad_norm": 4.9246519028776685, "learning_rate": 2.2008907585519094e-06, "loss": 0.4412, "step": 317 }, { "epoch": 1.382608695652174, "grad_norm": 5.147968019536769, "learning_rate": 2.172660990989971e-06, "loss": 0.3728, "step": 318 }, { "epoch": 1.3869565217391304, "grad_norm": 5.0886629945359365, "learning_rate": 2.144563097761984e-06, "loss": 0.4549, "step": 319 }, { "epoch": 1.391304347826087, "grad_norm": 4.110328785074501, "learning_rate": 2.1165983894256647e-06, "loss": 0.399, "step": 320 }, { "epoch": 1.3956521739130434, "grad_norm": 5.678018080542645, "learning_rate": 2.0887681703266453e-06, "loss": 0.4922, "step": 321 }, { "epoch": 1.4, "grad_norm": 3.522299014746889, "learning_rate": 2.061073738537635e-06, "loss": 0.2851, "step": 322 }, { "epoch": 1.4043478260869566, "grad_norm": 6.210071220589272, "learning_rate": 2.0335163857978747e-06, "loss": 0.4381, "step": 323 }, { "epoch": 1.4086956521739131, "grad_norm": 4.677651092740623, "learning_rate": 2.0060973974528873e-06, "loss": 0.3304, "step": 324 }, { "epoch": 1.4130434782608696, "grad_norm": 3.7070115905602585, "learning_rate": 1.978818052394528e-06, "loss": 0.2646, "step": 325 }, { "epoch": 1.4173913043478261, "grad_norm": 4.375456577369518, "learning_rate": 1.9516796230013275e-06, "loss": 0.2989, "step": 326 }, { "epoch": 1.4217391304347826, "grad_norm": 5.570245887780959, "learning_rate": 1.9246833750791526e-06, "loss": 0.4208, "step": 327 }, { "epoch": 1.4260869565217391, "grad_norm": 4.886938750299713, "learning_rate": 1.8978305678021598e-06, "loss": 0.4118, "step": 328 }, { "epoch": 1.4304347826086956, "grad_norm": 4.620746234726746, "learning_rate": 1.8711224536540678e-06, "loss": 0.2954, "step": 329 }, { "epoch": 1.434782608695652, "grad_norm": 4.633932338754762, "learning_rate": 1.8445602783697375e-06, "loss": 0.4206, "step": 330 }, { "epoch": 1.4391304347826086, "grad_norm": 3.999563113243534, "learning_rate": 1.8181452808770638e-06, "loss": 0.3652, "step": 331 }, { "epoch": 1.4434782608695653, "grad_norm": 4.232490038953125, "learning_rate": 1.7918786932391945e-06, "loss": 0.4048, "step": 332 }, { "epoch": 1.4478260869565218, "grad_norm": 4.376778688265914, "learning_rate": 1.765761740597065e-06, "loss": 0.5538, "step": 333 }, { "epoch": 1.4521739130434783, "grad_norm": 3.6752993411272086, "learning_rate": 1.739795641112248e-06, "loss": 0.4687, "step": 334 }, { "epoch": 1.4565217391304348, "grad_norm": 4.55460302537388, "learning_rate": 1.7139816059101372e-06, "loss": 0.7519, "step": 335 }, { "epoch": 1.4608695652173913, "grad_norm": 5.114759853944812, "learning_rate": 1.688320839023463e-06, "loss": 0.3998, "step": 336 }, { "epoch": 1.4652173913043478, "grad_norm": 3.1716648657362128, "learning_rate": 1.662814537336122e-06, "loss": 0.3869, "step": 337 }, { "epoch": 1.4695652173913043, "grad_norm": 3.7587234242982905, "learning_rate": 1.6374638905273643e-06, "loss": 0.3841, "step": 338 }, { "epoch": 1.4739130434782608, "grad_norm": 3.63611176729071, "learning_rate": 1.6122700810162967e-06, "loss": 0.3801, "step": 339 }, { "epoch": 1.4782608695652173, "grad_norm": 4.796431057792193, "learning_rate": 1.5872342839067305e-06, "loss": 0.2504, "step": 340 }, { "epoch": 1.482608695652174, "grad_norm": 4.964948958311623, "learning_rate": 1.5623576669323743e-06, "loss": 0.4862, "step": 341 }, { "epoch": 1.4869565217391305, "grad_norm": 3.9303479836058464, "learning_rate": 1.5376413904023723e-06, "loss": 0.3774, "step": 342 }, { "epoch": 1.491304347826087, "grad_norm": 4.11021434418603, "learning_rate": 1.5130866071471717e-06, "loss": 0.3495, "step": 343 }, { "epoch": 1.4956521739130435, "grad_norm": 4.0020770636229575, "learning_rate": 1.4886944624647647e-06, "loss": 0.2751, "step": 344 }, { "epoch": 1.5, "grad_norm": 5.903091442855636, "learning_rate": 1.4644660940672628e-06, "loss": 0.4938, "step": 345 }, { "epoch": 1.5043478260869565, "grad_norm": 4.598078158755451, "learning_rate": 1.4404026320278318e-06, "loss": 0.2931, "step": 346 }, { "epoch": 1.508695652173913, "grad_norm": 6.359179166789268, "learning_rate": 1.4165051987279832e-06, "loss": 0.4609, "step": 347 }, { "epoch": 1.5130434782608697, "grad_norm": 3.7748104229756305, "learning_rate": 1.3927749088052218e-06, "loss": 0.4253, "step": 348 }, { "epoch": 1.517391304347826, "grad_norm": 4.449981154160408, "learning_rate": 1.3692128691010592e-06, "loss": 0.3325, "step": 349 }, { "epoch": 1.5217391304347827, "grad_norm": 5.0666604676427305, "learning_rate": 1.3458201786093795e-06, "loss": 0.4088, "step": 350 }, { "epoch": 1.526086956521739, "grad_norm": 4.630295107740943, "learning_rate": 1.3225979284251955e-06, "loss": 0.2542, "step": 351 }, { "epoch": 1.5304347826086957, "grad_norm": 4.7269256098025085, "learning_rate": 1.2995472016937405e-06, "loss": 0.3752, "step": 352 }, { "epoch": 1.5347826086956522, "grad_norm": 4.340635258567047, "learning_rate": 1.2766690735599569e-06, "loss": 0.408, "step": 353 }, { "epoch": 1.5391304347826087, "grad_norm": 4.770896513431023, "learning_rate": 1.2539646111183452e-06, "loss": 0.29, "step": 354 }, { "epoch": 1.5434782608695652, "grad_norm": 4.228926300066491, "learning_rate": 1.2314348733631958e-06, "loss": 0.3813, "step": 355 }, { "epoch": 1.5478260869565217, "grad_norm": 2.7862114049851052, "learning_rate": 1.209080911139187e-06, "loss": 0.2217, "step": 356 }, { "epoch": 1.5521739130434784, "grad_norm": 4.834499117010849, "learning_rate": 1.1869037670923817e-06, "loss": 0.3568, "step": 357 }, { "epoch": 1.5565217391304347, "grad_norm": 4.436098328818527, "learning_rate": 1.1649044756215872e-06, "loss": 0.4675, "step": 358 }, { "epoch": 1.5608695652173914, "grad_norm": 3.505364253769214, "learning_rate": 1.1430840628301093e-06, "loss": 0.4031, "step": 359 }, { "epoch": 1.5652173913043477, "grad_norm": 3.5337346862290686, "learning_rate": 1.1214435464779006e-06, "loss": 0.4156, "step": 360 }, { "epoch": 1.5695652173913044, "grad_norm": 6.840098625250913, "learning_rate": 1.099983935934077e-06, "loss": 0.4857, "step": 361 }, { "epoch": 1.5739130434782609, "grad_norm": 6.213293552584588, "learning_rate": 1.0787062321298441e-06, "loss": 0.4374, "step": 362 }, { "epoch": 1.5782608695652174, "grad_norm": 5.404535179527454, "learning_rate": 1.0576114275118132e-06, "loss": 0.3358, "step": 363 }, { "epoch": 1.5826086956521739, "grad_norm": 3.3759662275329965, "learning_rate": 1.0367005059957097e-06, "loss": 0.3992, "step": 364 }, { "epoch": 1.5869565217391304, "grad_norm": 4.33324237420986, "learning_rate": 1.0159744429204776e-06, "loss": 0.3414, "step": 365 }, { "epoch": 1.591304347826087, "grad_norm": 8.014303432550552, "learning_rate": 9.954342050027922e-07, "loss": 0.5959, "step": 366 }, { "epoch": 1.5956521739130434, "grad_norm": 5.063352120229355, "learning_rate": 9.75080750291965e-07, "loss": 0.4683, "step": 367 }, { "epoch": 1.6, "grad_norm": 2.6582958542181774, "learning_rate": 9.549150281252633e-07, "loss": 0.3139, "step": 368 }, { "epoch": 1.6043478260869564, "grad_norm": 3.534876787372464, "learning_rate": 9.349379790836243e-07, "loss": 0.3933, "step": 369 }, { "epoch": 1.608695652173913, "grad_norm": 4.091568890168747, "learning_rate": 9.151505349477901e-07, "loss": 0.4115, "step": 370 }, { "epoch": 1.6130434782608696, "grad_norm": 4.587879328576348, "learning_rate": 8.955536186548425e-07, "loss": 0.4574, "step": 371 }, { "epoch": 1.617391304347826, "grad_norm": 4.11512437136083, "learning_rate": 8.761481442551573e-07, "loss": 0.4054, "step": 372 }, { "epoch": 1.6217391304347826, "grad_norm": 4.948336357859377, "learning_rate": 8.569350168697705e-07, "loss": 0.4593, "step": 373 }, { "epoch": 1.626086956521739, "grad_norm": 4.571512826347378, "learning_rate": 8.379151326481588e-07, "loss": 0.3444, "step": 374 }, { "epoch": 1.6304347826086958, "grad_norm": 4.328104628253105, "learning_rate": 8.19089378726447e-07, "loss": 0.4923, "step": 375 }, { "epoch": 1.634782608695652, "grad_norm": 3.760449059509458, "learning_rate": 8.004586331860176e-07, "loss": 0.2563, "step": 376 }, { "epoch": 1.6391304347826088, "grad_norm": 4.322342061262116, "learning_rate": 7.820237650125711e-07, "loss": 0.2826, "step": 377 }, { "epoch": 1.643478260869565, "grad_norm": 5.664753766424785, "learning_rate": 7.637856340555822e-07, "loss": 0.5416, "step": 378 }, { "epoch": 1.6478260869565218, "grad_norm": 4.168173552178201, "learning_rate": 7.457450909881969e-07, "loss": 0.4541, "step": 379 }, { "epoch": 1.6521739130434783, "grad_norm": 3.746985781310187, "learning_rate": 7.279029772675572e-07, "loss": 0.3148, "step": 380 }, { "epoch": 1.6565217391304348, "grad_norm": 4.0890307040702, "learning_rate": 7.102601250955526e-07, "loss": 0.2536, "step": 381 }, { "epoch": 1.6608695652173913, "grad_norm": 4.147467566269263, "learning_rate": 6.928173573800007e-07, "loss": 0.5819, "step": 382 }, { "epoch": 1.6652173913043478, "grad_norm": 3.470213468989912, "learning_rate": 6.755754876962711e-07, "loss": 0.3264, "step": 383 }, { "epoch": 1.6695652173913045, "grad_norm": 3.790885229353616, "learning_rate": 6.585353202493322e-07, "loss": 0.4863, "step": 384 }, { "epoch": 1.6739130434782608, "grad_norm": 3.2098499133289873, "learning_rate": 6.416976498362432e-07, "loss": 0.4062, "step": 385 }, { "epoch": 1.6782608695652175, "grad_norm": 3.052567811452033, "learning_rate": 6.250632618090868e-07, "loss": 0.3766, "step": 386 }, { "epoch": 1.6826086956521737, "grad_norm": 3.251506558261061, "learning_rate": 6.086329320383311e-07, "loss": 0.2694, "step": 387 }, { "epoch": 1.6869565217391305, "grad_norm": 4.429111727950372, "learning_rate": 5.924074268766422e-07, "loss": 0.3576, "step": 388 }, { "epoch": 1.691304347826087, "grad_norm": 5.864544691259148, "learning_rate": 5.763875031231464e-07, "loss": 0.4093, "step": 389 }, { "epoch": 1.6956521739130435, "grad_norm": 3.999472642222693, "learning_rate": 5.60573907988124e-07, "loss": 0.2376, "step": 390 }, { "epoch": 1.7, "grad_norm": 4.508948892750066, "learning_rate": 5.449673790581611e-07, "loss": 0.3957, "step": 391 }, { "epoch": 1.7043478260869565, "grad_norm": 4.096578412910045, "learning_rate": 5.295686442617442e-07, "loss": 0.3033, "step": 392 }, { "epoch": 1.7086956521739132, "grad_norm": 3.958895722894547, "learning_rate": 5.143784218353104e-07, "loss": 0.59, "step": 393 }, { "epoch": 1.7130434782608694, "grad_norm": 3.6190572925004485, "learning_rate": 4.993974202897456e-07, "loss": 0.473, "step": 394 }, { "epoch": 1.7173913043478262, "grad_norm": 5.161364474058307, "learning_rate": 4.846263383773364e-07, "loss": 0.3472, "step": 395 }, { "epoch": 1.7217391304347827, "grad_norm": 4.226485622410246, "learning_rate": 4.7006586505918273e-07, "loss": 0.3042, "step": 396 }, { "epoch": 1.7260869565217392, "grad_norm": 3.9403605213034347, "learning_rate": 4.557166794730572e-07, "loss": 0.3942, "step": 397 }, { "epoch": 1.7304347826086957, "grad_norm": 3.464344850791239, "learning_rate": 4.4157945090173294e-07, "loss": 0.3491, "step": 398 }, { "epoch": 1.7347826086956522, "grad_norm": 3.6832590658291253, "learning_rate": 4.276548387417656e-07, "loss": 0.4446, "step": 399 }, { "epoch": 1.7391304347826086, "grad_norm": 3.9257122718747173, "learning_rate": 4.139434924727359e-07, "loss": 0.1904, "step": 400 }, { "epoch": 1.7391304347826086, "eval_loss": 0.84710693359375, "eval_runtime": 2.9637, "eval_samples_per_second": 6.411, "eval_steps_per_second": 1.687, "step": 400 }, { "epoch": 1.7434782608695651, "grad_norm": 4.2238949061823625, "learning_rate": 4.004460516269554e-07, "loss": 0.2747, "step": 401 }, { "epoch": 1.7478260869565219, "grad_norm": 4.415499878703038, "learning_rate": 3.8716314575964197e-07, "loss": 0.434, "step": 402 }, { "epoch": 1.7521739130434781, "grad_norm": 4.413123149635052, "learning_rate": 3.740953944195497e-07, "loss": 0.3275, "step": 403 }, { "epoch": 1.7565217391304349, "grad_norm": 5.005472770791753, "learning_rate": 3.612434071200771e-07, "loss": 0.4393, "step": 404 }, { "epoch": 1.7608695652173914, "grad_norm": 4.77916502918349, "learning_rate": 3.486077833108342e-07, "loss": 0.3882, "step": 405 }, { "epoch": 1.7652173913043478, "grad_norm": 4.888332155893412, "learning_rate": 3.361891123496824e-07, "loss": 0.2972, "step": 406 }, { "epoch": 1.7695652173913043, "grad_norm": 8.555798239045828, "learning_rate": 3.2398797347524656e-07, "loss": 0.416, "step": 407 }, { "epoch": 1.7739130434782608, "grad_norm": 4.311854645200206, "learning_rate": 3.1200493577989875e-07, "loss": 0.4497, "step": 408 }, { "epoch": 1.7782608695652173, "grad_norm": 4.916122775051447, "learning_rate": 3.002405581832135e-07, "loss": 0.639, "step": 409 }, { "epoch": 1.7826086956521738, "grad_norm": 5.578493709235784, "learning_rate": 2.88695389405898e-07, "loss": 0.5367, "step": 410 }, { "epoch": 1.7869565217391306, "grad_norm": 4.301048656163112, "learning_rate": 2.7736996794419767e-07, "loss": 0.4093, "step": 411 }, { "epoch": 1.7913043478260868, "grad_norm": 4.646134977327319, "learning_rate": 2.662648220447811e-07, "loss": 0.4533, "step": 412 }, { "epoch": 1.7956521739130435, "grad_norm": 4.061060226299757, "learning_rate": 2.5538046968010097e-07, "loss": 0.417, "step": 413 }, { "epoch": 1.8, "grad_norm": 4.907143921361194, "learning_rate": 2.447174185242324e-07, "loss": 0.4717, "step": 414 }, { "epoch": 1.8043478260869565, "grad_norm": 4.6576414692745445, "learning_rate": 2.3427616592919587e-07, "loss": 0.1985, "step": 415 }, { "epoch": 1.808695652173913, "grad_norm": 7.551034297687061, "learning_rate": 2.240571989017598e-07, "loss": 0.2973, "step": 416 }, { "epoch": 1.8130434782608695, "grad_norm": 4.711055462866289, "learning_rate": 2.1406099408072256e-07, "loss": 0.332, "step": 417 }, { "epoch": 1.8173913043478263, "grad_norm": 5.451809254942853, "learning_rate": 2.0428801771468388e-07, "loss": 0.4386, "step": 418 }, { "epoch": 1.8217391304347825, "grad_norm": 4.973253351932573, "learning_rate": 1.947387256402966e-07, "loss": 0.4752, "step": 419 }, { "epoch": 1.8260869565217392, "grad_norm": 5.230761253263795, "learning_rate": 1.8541356326100436e-07, "loss": 0.4217, "step": 420 }, { "epoch": 1.8304347826086955, "grad_norm": 5.234189528741744, "learning_rate": 1.7631296552626687e-07, "loss": 0.4596, "step": 421 }, { "epoch": 1.8347826086956522, "grad_norm": 8.698679258939933, "learning_rate": 1.6743735691127639e-07, "loss": 0.3867, "step": 422 }, { "epoch": 1.8391304347826087, "grad_norm": 4.0581036301103826, "learning_rate": 1.5878715139715395e-07, "loss": 0.3501, "step": 423 }, { "epoch": 1.8434782608695652, "grad_norm": 3.3295020409277134, "learning_rate": 1.5036275245164377e-07, "loss": 0.333, "step": 424 }, { "epoch": 1.8478260869565217, "grad_norm": 5.625873750329853, "learning_rate": 1.4216455301029274e-07, "loss": 0.2544, "step": 425 }, { "epoch": 1.8521739130434782, "grad_norm": 21.31563926936729, "learning_rate": 1.341929354581234e-07, "loss": 0.3744, "step": 426 }, { "epoch": 1.856521739130435, "grad_norm": 3.6645244784937554, "learning_rate": 1.2644827161179763e-07, "loss": 0.3328, "step": 427 }, { "epoch": 1.8608695652173912, "grad_norm": 4.035545215811824, "learning_rate": 1.1893092270227724e-07, "loss": 0.3116, "step": 428 }, { "epoch": 1.865217391304348, "grad_norm": 4.778912796903033, "learning_rate": 1.1164123935797189e-07, "loss": 0.4285, "step": 429 }, { "epoch": 1.8695652173913042, "grad_norm": 4.365193705408143, "learning_rate": 1.0457956158838545e-07, "loss": 0.3649, "step": 430 }, { "epoch": 1.873913043478261, "grad_norm": 4.931094956899494, "learning_rate": 9.774621876825985e-08, "loss": 0.569, "step": 431 }, { "epoch": 1.8782608695652174, "grad_norm": 4.916680811083485, "learning_rate": 9.114152962220734e-08, "loss": 0.4065, "step": 432 }, { "epoch": 1.882608695652174, "grad_norm": 5.327664249364534, "learning_rate": 8.476580220984854e-08, "loss": 0.4887, "step": 433 }, { "epoch": 1.8869565217391304, "grad_norm": 5.630899155916819, "learning_rate": 7.861933391144272e-08, "loss": 0.3431, "step": 434 }, { "epoch": 1.891304347826087, "grad_norm": 4.192157519887303, "learning_rate": 7.270241141401568e-08, "loss": 0.4561, "step": 435 }, { "epoch": 1.8956521739130436, "grad_norm": 3.3402242500527346, "learning_rate": 6.701531069799039e-08, "loss": 0.2535, "step": 436 }, { "epoch": 1.9, "grad_norm": 5.05505766202891, "learning_rate": 6.15582970243117e-08, "loss": 0.4571, "step": 437 }, { "epoch": 1.9043478260869566, "grad_norm": 4.482518001328448, "learning_rate": 5.633162492207633e-08, "loss": 0.4594, "step": 438 }, { "epoch": 1.908695652173913, "grad_norm": 4.747721812858791, "learning_rate": 5.133553817665948e-08, "loss": 0.4966, "step": 439 }, { "epoch": 1.9130434782608696, "grad_norm": 5.406074579801078, "learning_rate": 4.657026981834623e-08, "loss": 0.4339, "step": 440 }, { "epoch": 1.9173913043478261, "grad_norm": 3.305337867337301, "learning_rate": 4.203604211145851e-08, "loss": 0.2366, "step": 441 }, { "epoch": 1.9217391304347826, "grad_norm": 3.4638544230453303, "learning_rate": 3.773306654399234e-08, "loss": 0.3431, "step": 442 }, { "epoch": 1.9260869565217391, "grad_norm": 4.732049116559757, "learning_rate": 3.366154381775011e-08, "loss": 0.4252, "step": 443 }, { "epoch": 1.9304347826086956, "grad_norm": 6.208168501352479, "learning_rate": 2.9821663838981994e-08, "loss": 0.4341, "step": 444 }, { "epoch": 1.9347826086956523, "grad_norm": 3.631999073133573, "learning_rate": 2.6213605709525803e-08, "loss": 0.2918, "step": 445 }, { "epoch": 1.9391304347826086, "grad_norm": 3.4071911648402047, "learning_rate": 2.283753771845587e-08, "loss": 0.2576, "step": 446 }, { "epoch": 1.9434782608695653, "grad_norm": 4.341970244912221, "learning_rate": 1.969361733423103e-08, "loss": 0.4952, "step": 447 }, { "epoch": 1.9478260869565216, "grad_norm": 5.0988876739070665, "learning_rate": 1.6781991197352133e-08, "loss": 0.3971, "step": 448 }, { "epoch": 1.9521739130434783, "grad_norm": 5.344651969376342, "learning_rate": 1.4102795113520307e-08, "loss": 0.3179, "step": 449 }, { "epoch": 1.9565217391304348, "grad_norm": 3.1321296957399816, "learning_rate": 1.1656154047303691e-08, "loss": 0.3592, "step": 450 }, { "epoch": 1.9608695652173913, "grad_norm": 5.289764246110828, "learning_rate": 9.442182116309872e-09, "loss": 0.5675, "step": 451 }, { "epoch": 1.9652173913043478, "grad_norm": 3.539029704298015, "learning_rate": 7.460982585860144e-09, "loss": 0.2401, "step": 452 }, { "epoch": 1.9695652173913043, "grad_norm": 3.3170116889868084, "learning_rate": 5.712647864176135e-09, "loss": 0.4096, "step": 453 }, { "epoch": 1.973913043478261, "grad_norm": 4.591533334025833, "learning_rate": 4.197259498067707e-09, "loss": 0.3984, "step": 454 }, { "epoch": 1.9782608695652173, "grad_norm": 3.485313689550101, "learning_rate": 2.9148881691298812e-09, "loss": 0.3549, "step": 455 }, { "epoch": 1.982608695652174, "grad_norm": 5.597770599873441, "learning_rate": 1.865593690446588e-09, "loss": 0.3956, "step": 456 }, { "epoch": 1.9869565217391303, "grad_norm": 3.9083218733863356, "learning_rate": 1.0494250038006747e-09, "loss": 0.3444, "step": 457 }, { "epoch": 1.991304347826087, "grad_norm": 5.853251874420562, "learning_rate": 4.664201773896259e-10, "loss": 0.373, "step": 458 }, { "epoch": 1.9956521739130435, "grad_norm": 4.471869039132135, "learning_rate": 1.1660640405308787e-10, "loss": 0.4927, "step": 459 }, { "epoch": 2.0, "grad_norm": 4.073032878536336, "learning_rate": 0.0, "loss": 0.3028, "step": 460 }, { "epoch": 2.0, "step": 460, "total_flos": 3525630345216.0, "train_loss": 0.5884321572663991, "train_runtime": 1069.0797, "train_samples_per_second": 3.442, "train_steps_per_second": 0.43 } ], "logging_steps": 1, "max_steps": 460, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3525630345216.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }