diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,12586 +10,12586 @@ "log_history": [ { "epoch": 0.0, - "grad_norm": 468.0, - "learning_rate": 2.229654403567447e-08, - "loss": 11.4202, + "grad_norm": 14.375, + "learning_rate": 0.001, + "loss": 10.66, "step": 1 }, { "epoch": 0.0, - "grad_norm": 468.0, - "learning_rate": 1.1148272017837236e-07, - "loss": 11.4863, + "grad_norm": 2.15625, + "learning_rate": 0.001, + "loss": 9.0861, "step": 5 }, { "epoch": 0.0, - "grad_norm": 448.0, - "learning_rate": 2.2296544035674473e-07, - "loss": 11.4597, + "grad_norm": 0.89453125, + "learning_rate": 0.001, + "loss": 7.5929, "step": 10 }, { "epoch": 0.0, - "grad_norm": 466.0, - "learning_rate": 3.3444816053511706e-07, - "loss": 11.5247, + "grad_norm": 0.80078125, + "learning_rate": 0.001, + "loss": 7.2385, "step": 15 }, { "epoch": 0.0, - "grad_norm": 472.0, - "learning_rate": 4.4593088071348945e-07, - "loss": 11.5105, + "grad_norm": 0.890625, + "learning_rate": 0.001, + "loss": 7.1247, "step": 20 }, { "epoch": 0.0, - "grad_norm": 438.0, - "learning_rate": 5.574136008918618e-07, - "loss": 11.452, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 6.9193, "step": 25 }, { "epoch": 0.0, - "grad_norm": 400.0, - "learning_rate": 6.688963210702341e-07, - "loss": 11.4388, + "grad_norm": 0.78515625, + "learning_rate": 0.001, + "loss": 6.8173, "step": 30 }, { "epoch": 0.0, - "grad_norm": 466.0, - "learning_rate": 7.803790412486065e-07, - "loss": 11.4574, + "grad_norm": 0.68359375, + "learning_rate": 0.001, + "loss": 6.7162, "step": 35 }, { "epoch": 0.0, - "grad_norm": 474.0, - "learning_rate": 8.918617614269789e-07, - "loss": 11.4573, + "grad_norm": 0.67578125, + "learning_rate": 0.001, + "loss": 6.6121, "step": 40 }, { "epoch": 0.01, - "grad_norm": 458.0, - "learning_rate": 1.0033444816053512e-06, - "loss": 11.4497, + "grad_norm": 0.7265625, + "learning_rate": 0.001, + "loss": 6.4806, "step": 45 }, { "epoch": 0.01, - "grad_norm": 464.0, - "learning_rate": 1.1148272017837236e-06, - "loss": 11.4557, + "grad_norm": 0.63671875, + "learning_rate": 0.001, + "loss": 6.4596, "step": 50 }, { "epoch": 0.01, - "grad_norm": 436.0, - "learning_rate": 1.226309921962096e-06, - "loss": 11.3922, + "grad_norm": 0.875, + "learning_rate": 0.001, + "loss": 6.3081, "step": 55 }, { "epoch": 0.01, - "grad_norm": 460.0, - "learning_rate": 1.3377926421404683e-06, - "loss": 11.4687, + "grad_norm": 0.8125, + "learning_rate": 0.001, + "loss": 6.2485, "step": 60 }, { "epoch": 0.01, - "grad_norm": 448.0, - "learning_rate": 1.4492753623188408e-06, - "loss": 11.4079, + "grad_norm": 0.765625, + "learning_rate": 0.001, + "loss": 6.1595, "step": 65 }, { "epoch": 0.01, - "grad_norm": 446.0, - "learning_rate": 1.560758082497213e-06, - "loss": 11.4078, + "grad_norm": 0.80859375, + "learning_rate": 0.001, + "loss": 6.113, "step": 70 }, { "epoch": 0.01, - "grad_norm": 468.0, - "learning_rate": 1.6722408026755855e-06, - "loss": 11.391, + "grad_norm": 0.89453125, + "learning_rate": 0.001, + "loss": 6.0143, "step": 75 }, { "epoch": 0.01, - "grad_norm": 458.0, - "learning_rate": 1.7837235228539578e-06, - "loss": 11.3797, + "grad_norm": 1.203125, + "learning_rate": 0.001, + "loss": 5.9454, "step": 80 }, { "epoch": 0.01, - "grad_norm": 472.0, - "learning_rate": 1.8952062430323302e-06, - "loss": 11.358, + "grad_norm": 0.78515625, + "learning_rate": 0.001, + "loss": 5.8563, "step": 85 }, { "epoch": 0.01, - "grad_norm": 428.0, - "learning_rate": 2.0066889632107025e-06, - "loss": 11.3317, + "grad_norm": 0.89453125, + "learning_rate": 0.001, + "loss": 5.8343, "step": 90 }, { "epoch": 0.01, - "grad_norm": 450.0, - "learning_rate": 2.1181716833890746e-06, - "loss": 11.3934, + "grad_norm": 0.6015625, + "learning_rate": 0.001, + "loss": 5.8936, "step": 95 }, { "epoch": 0.01, - "grad_norm": 456.0, - "learning_rate": 2.229654403567447e-06, - "loss": 11.378, + "grad_norm": 0.765625, + "learning_rate": 0.001, + "loss": 5.6957, "step": 100 }, { "epoch": 0.01, - "grad_norm": 446.0, - "learning_rate": 2.3411371237458197e-06, - "loss": 11.373, + "grad_norm": 0.5, + "learning_rate": 0.001, + "loss": 5.6288, "step": 105 }, { "epoch": 0.01, - "grad_norm": 446.0, - "learning_rate": 2.452619843924192e-06, - "loss": 11.3715, + "grad_norm": 0.73828125, + "learning_rate": 0.001, + "loss": 5.601, "step": 110 }, { "epoch": 0.01, - "grad_norm": 448.0, - "learning_rate": 2.564102564102564e-06, - "loss": 11.2856, + "grad_norm": 0.64453125, + "learning_rate": 0.001, + "loss": 5.6385, "step": 115 }, { "epoch": 0.01, - "grad_norm": 452.0, - "learning_rate": 2.6755852842809365e-06, - "loss": 11.3129, + "grad_norm": 0.5625, + "learning_rate": 0.001, + "loss": 5.5606, "step": 120 }, { "epoch": 0.01, - "grad_norm": 466.0, - "learning_rate": 2.787068004459309e-06, - "loss": 11.2773, + "grad_norm": 0.625, + "learning_rate": 0.001, + "loss": 5.53, "step": 125 }, { "epoch": 0.01, - "grad_norm": 464.0, - "learning_rate": 2.8985507246376816e-06, - "loss": 11.2578, + "grad_norm": 0.62109375, + "learning_rate": 0.001, + "loss": 5.512, "step": 130 }, { "epoch": 0.02, - "grad_norm": 442.0, - "learning_rate": 3.010033444816054e-06, - "loss": 11.2884, + "grad_norm": 0.91015625, + "learning_rate": 0.001, + "loss": 5.3661, "step": 135 }, { "epoch": 0.02, - "grad_norm": 450.0, - "learning_rate": 3.121516164994426e-06, - "loss": 11.2413, + "grad_norm": 0.5546875, + "learning_rate": 0.001, + "loss": 5.4498, "step": 140 }, { "epoch": 0.02, - "grad_norm": 444.0, - "learning_rate": 3.2329988851727984e-06, - "loss": 11.2083, + "grad_norm": 0.54296875, + "learning_rate": 0.001, + "loss": 5.3858, "step": 145 }, { "epoch": 0.02, - "grad_norm": 438.0, - "learning_rate": 3.344481605351171e-06, - "loss": 11.1428, + "grad_norm": 0.59375, + "learning_rate": 0.001, + "loss": 5.3242, "step": 150 }, { "epoch": 0.02, - "grad_norm": 432.0, - "learning_rate": 3.4559643255295435e-06, - "loss": 11.1927, + "grad_norm": 0.9140625, + "learning_rate": 0.001, + "loss": 5.3414, "step": 155 }, { "epoch": 0.02, - "grad_norm": 420.0, - "learning_rate": 3.5674470457079156e-06, - "loss": 11.1467, + "grad_norm": 1.15625, + "learning_rate": 0.001, + "loss": 5.3306, "step": 160 }, { "epoch": 0.02, - "grad_norm": 434.0, - "learning_rate": 3.6789297658862878e-06, - "loss": 11.1394, + "grad_norm": 0.79296875, + "learning_rate": 0.001, + "loss": 5.2121, "step": 165 }, { "epoch": 0.02, - "grad_norm": 444.0, - "learning_rate": 3.7904124860646603e-06, - "loss": 11.078, + "grad_norm": 0.578125, + "learning_rate": 0.001, + "loss": 5.1801, "step": 170 }, { "epoch": 0.02, - "grad_norm": 412.0, - "learning_rate": 3.901895206243033e-06, - "loss": 11.06, + "grad_norm": 0.71484375, + "learning_rate": 0.001, + "loss": 5.1879, "step": 175 }, { "epoch": 0.02, - "grad_norm": 432.0, - "learning_rate": 4.013377926421405e-06, - "loss": 11.0569, + "grad_norm": 0.76953125, + "learning_rate": 0.001, + "loss": 5.0849, "step": 180 }, { "epoch": 0.02, - "grad_norm": 432.0, - "learning_rate": 4.124860646599778e-06, - "loss": 10.9972, + "grad_norm": 0.75, + "learning_rate": 0.001, + "loss": 5.1201, "step": 185 }, { "epoch": 0.02, - "grad_norm": 410.0, - "learning_rate": 4.236343366778149e-06, - "loss": 10.9987, + "grad_norm": 0.77734375, + "learning_rate": 0.001, + "loss": 5.0651, "step": 190 }, { "epoch": 0.02, - "grad_norm": 440.0, - "learning_rate": 4.347826086956522e-06, - "loss": 11.0114, + "grad_norm": 0.7109375, + "learning_rate": 0.001, + "loss": 5.0007, "step": 195 }, { "epoch": 0.02, - "grad_norm": 424.0, - "learning_rate": 4.459308807134894e-06, - "loss": 10.9548, + "grad_norm": 0.60546875, + "learning_rate": 0.001, + "loss": 5.0496, "step": 200 }, { "epoch": 0.02, - "grad_norm": 398.0, - "learning_rate": 4.570791527313267e-06, - "loss": 10.9053, + "grad_norm": 0.5390625, + "learning_rate": 0.001, + "loss": 4.9546, "step": 205 }, { "epoch": 0.02, - "grad_norm": 402.0, - "learning_rate": 4.6822742474916394e-06, - "loss": 10.9281, + "grad_norm": 0.73828125, + "learning_rate": 0.001, + "loss": 4.9174, "step": 210 }, { "epoch": 0.02, - "grad_norm": 414.0, - "learning_rate": 4.7937569676700116e-06, - "loss": 10.9083, + "grad_norm": 0.609375, + "learning_rate": 0.001, + "loss": 4.8822, "step": 215 }, { "epoch": 0.02, - "grad_norm": 414.0, - "learning_rate": 4.905239687848384e-06, - "loss": 10.8098, + "grad_norm": 0.6171875, + "learning_rate": 0.001, + "loss": 4.9179, "step": 220 }, { "epoch": 0.03, - "grad_norm": 404.0, - "learning_rate": 5.016722408026757e-06, - "loss": 10.8274, + "grad_norm": 0.55859375, + "learning_rate": 0.001, + "loss": 4.8214, "step": 225 }, { "epoch": 0.03, - "grad_norm": 406.0, - "learning_rate": 5.128205128205128e-06, - "loss": 10.7921, + "grad_norm": 0.625, + "learning_rate": 0.001, + "loss": 4.8761, "step": 230 }, { "epoch": 0.03, - "grad_norm": 406.0, - "learning_rate": 5.239687848383501e-06, - "loss": 10.7835, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 4.9586, "step": 235 }, { "epoch": 0.03, - "grad_norm": 408.0, - "learning_rate": 5.351170568561873e-06, - "loss": 10.6869, + "grad_norm": 0.6484375, + "learning_rate": 0.001, + "loss": 4.8608, "step": 240 }, { "epoch": 0.03, - "grad_norm": 402.0, - "learning_rate": 5.462653288740246e-06, - "loss": 10.6904, + "grad_norm": 1.0703125, + "learning_rate": 0.001, + "loss": 4.8165, "step": 245 }, { "epoch": 0.03, - "grad_norm": 388.0, - "learning_rate": 5.574136008918618e-06, - "loss": 10.6711, + "grad_norm": 0.71484375, + "learning_rate": 0.001, + "loss": 4.8616, "step": 250 }, { "epoch": 0.03, - "grad_norm": 388.0, - "learning_rate": 5.68561872909699e-06, - "loss": 10.6473, + "grad_norm": 0.671875, + "learning_rate": 0.001, + "loss": 4.7908, "step": 255 }, { "epoch": 0.03, - "grad_norm": 380.0, - "learning_rate": 5.797101449275363e-06, - "loss": 10.629, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 4.7085, "step": 260 }, { "epoch": 0.03, - "grad_norm": 422.0, - "learning_rate": 5.9085841694537345e-06, - "loss": 10.5391, + "grad_norm": 0.6484375, + "learning_rate": 0.001, + "loss": 4.6968, "step": 265 }, { "epoch": 0.03, - "grad_norm": 374.0, - "learning_rate": 6.020066889632108e-06, - "loss": 10.4908, + "grad_norm": 0.67578125, + "learning_rate": 0.001, + "loss": 4.648, "step": 270 }, { "epoch": 0.03, - "grad_norm": 330.0, - "learning_rate": 6.13154960981048e-06, - "loss": 10.3306, + "grad_norm": 0.62109375, + "learning_rate": 0.001, + "loss": 4.7703, "step": 275 }, { "epoch": 0.03, - "grad_norm": 356.0, - "learning_rate": 6.243032329988852e-06, - "loss": 10.2361, + "grad_norm": 0.71484375, + "learning_rate": 0.001, + "loss": 4.6552, "step": 280 }, { "epoch": 0.03, - "grad_norm": 282.0, - "learning_rate": 6.354515050167225e-06, - "loss": 9.9523, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 4.6484, "step": 285 }, { "epoch": 0.03, - "grad_norm": 130.0, - "learning_rate": 6.465997770345597e-06, - "loss": 9.4284, + "grad_norm": 0.546875, + "learning_rate": 0.001, + "loss": 4.6077, "step": 290 }, { "epoch": 0.03, - "grad_norm": 90.5, - "learning_rate": 6.57748049052397e-06, - "loss": 8.6984, + "grad_norm": 0.69140625, + "learning_rate": 0.001, + "loss": 4.5301, "step": 295 }, { "epoch": 0.03, - "grad_norm": 54.0, - "learning_rate": 6.688963210702342e-06, - "loss": 8.199, + "grad_norm": 0.50390625, + "learning_rate": 0.001, + "loss": 4.5485, "step": 300 }, { "epoch": 0.03, - "grad_norm": 38.5, - "learning_rate": 6.800445930880713e-06, - "loss": 7.7912, + "grad_norm": 0.73046875, + "learning_rate": 0.001, + "loss": 4.6582, "step": 305 }, { "epoch": 0.03, - "grad_norm": 35.0, - "learning_rate": 6.911928651059087e-06, - "loss": 7.3875, + "grad_norm": 0.546875, + "learning_rate": 0.001, + "loss": 4.6396, "step": 310 }, { "epoch": 0.04, - "grad_norm": 24.375, - "learning_rate": 7.023411371237458e-06, - "loss": 7.1806, + "grad_norm": 0.5625, + "learning_rate": 0.001, + "loss": 4.4431, "step": 315 }, { "epoch": 0.04, - "grad_norm": 25.375, - "learning_rate": 7.134894091415831e-06, - "loss": 7.0469, + "grad_norm": 0.5859375, + "learning_rate": 0.001, + "loss": 4.4276, "step": 320 }, { "epoch": 0.04, - "grad_norm": 21.0, - "learning_rate": 7.246376811594203e-06, - "loss": 6.9367, + "grad_norm": 0.58203125, + "learning_rate": 0.001, + "loss": 4.5243, "step": 325 }, { "epoch": 0.04, - "grad_norm": 26.875, - "learning_rate": 7.3578595317725755e-06, - "loss": 6.8302, + "grad_norm": 0.6796875, + "learning_rate": 0.001, + "loss": 4.4766, "step": 330 }, { "epoch": 0.04, - "grad_norm": 30.625, - "learning_rate": 7.4693422519509485e-06, - "loss": 6.8096, + "grad_norm": 0.4765625, + "learning_rate": 0.001, + "loss": 4.5035, "step": 335 }, { "epoch": 0.04, - "grad_norm": 21.25, - "learning_rate": 7.580824972129321e-06, - "loss": 6.6859, + "grad_norm": 0.5, + "learning_rate": 0.001, + "loss": 4.4449, "step": 340 }, { "epoch": 0.04, - "grad_norm": 32.0, - "learning_rate": 7.692307692307694e-06, - "loss": 6.6708, + "grad_norm": 0.5625, + "learning_rate": 0.001, + "loss": 4.3884, "step": 345 }, { "epoch": 0.04, - "grad_norm": 24.0, - "learning_rate": 7.803790412486066e-06, - "loss": 6.6274, + "grad_norm": 0.58203125, + "learning_rate": 0.001, + "loss": 4.4832, "step": 350 }, { "epoch": 0.04, - "grad_norm": 26.25, - "learning_rate": 7.915273132664438e-06, - "loss": 6.634, + "grad_norm": 0.546875, + "learning_rate": 0.001, + "loss": 4.3619, "step": 355 }, { "epoch": 0.04, - "grad_norm": 27.5, - "learning_rate": 8.02675585284281e-06, - "loss": 6.5753, + "grad_norm": 0.64453125, + "learning_rate": 0.001, + "loss": 4.4318, "step": 360 }, { "epoch": 0.04, - "grad_norm": 29.375, - "learning_rate": 8.138238573021182e-06, - "loss": 6.556, + "grad_norm": 0.59375, + "learning_rate": 0.001, + "loss": 4.3091, "step": 365 }, { "epoch": 0.04, - "grad_norm": 16.25, - "learning_rate": 8.249721293199556e-06, - "loss": 6.5317, + "grad_norm": 0.609375, + "learning_rate": 0.001, + "loss": 4.3793, "step": 370 }, { "epoch": 0.04, - "grad_norm": 31.75, - "learning_rate": 8.361204013377926e-06, - "loss": 6.4376, + "grad_norm": 0.60546875, + "learning_rate": 0.001, + "loss": 4.3673, "step": 375 }, { "epoch": 0.04, - "grad_norm": 22.0, - "learning_rate": 8.472686733556298e-06, - "loss": 6.4093, + "grad_norm": 0.7734375, + "learning_rate": 0.001, + "loss": 4.2584, "step": 380 }, { "epoch": 0.04, - "grad_norm": 20.625, - "learning_rate": 8.584169453734672e-06, - "loss": 6.5175, + "grad_norm": 0.5078125, + "learning_rate": 0.001, + "loss": 4.432, "step": 385 }, { "epoch": 0.04, - "grad_norm": 32.75, - "learning_rate": 8.695652173913044e-06, - "loss": 6.4589, + "grad_norm": 0.53515625, + "learning_rate": 0.001, + "loss": 4.2351, "step": 390 }, { "epoch": 0.04, - "grad_norm": 19.875, - "learning_rate": 8.807134894091417e-06, - "loss": 6.3879, + "grad_norm": 0.462890625, + "learning_rate": 0.001, + "loss": 4.3768, "step": 395 }, { "epoch": 0.04, - "grad_norm": 18.125, - "learning_rate": 8.918617614269789e-06, - "loss": 6.3995, + "grad_norm": 0.5, + "learning_rate": 0.001, + "loss": 4.2398, "step": 400 }, { "epoch": 0.05, - "grad_norm": 18.75, - "learning_rate": 9.03010033444816e-06, - "loss": 6.3709, + "grad_norm": 0.484375, + "learning_rate": 0.001, + "loss": 4.2591, "step": 405 }, { "epoch": 0.05, - "grad_norm": 24.0, - "learning_rate": 9.141583054626535e-06, - "loss": 6.3538, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 4.2083, "step": 410 }, { "epoch": 0.05, - "grad_norm": 37.75, - "learning_rate": 9.253065774804905e-06, - "loss": 6.305, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 4.2354, "step": 415 }, { "epoch": 0.05, - "grad_norm": 31.25, - "learning_rate": 9.364548494983279e-06, - "loss": 6.3054, + "grad_norm": 0.640625, + "learning_rate": 0.001, + "loss": 4.2133, "step": 420 }, { "epoch": 0.05, - "grad_norm": 27.125, - "learning_rate": 9.476031215161651e-06, - "loss": 6.2965, + "grad_norm": 0.58203125, + "learning_rate": 0.001, + "loss": 4.2055, "step": 425 }, { "epoch": 0.05, - "grad_norm": 21.25, - "learning_rate": 9.587513935340023e-06, - "loss": 6.3533, + "grad_norm": 0.56640625, + "learning_rate": 0.001, + "loss": 4.1682, "step": 430 }, { "epoch": 0.05, - "grad_norm": 22.25, - "learning_rate": 9.698996655518395e-06, - "loss": 6.3765, + "grad_norm": 0.53515625, + "learning_rate": 0.001, + "loss": 4.2379, "step": 435 }, { "epoch": 0.05, - "grad_norm": 28.375, - "learning_rate": 9.810479375696767e-06, - "loss": 6.307, + "grad_norm": 0.578125, + "learning_rate": 0.001, + "loss": 4.1974, "step": 440 }, { "epoch": 0.05, - "grad_norm": 22.125, - "learning_rate": 9.921962095875141e-06, - "loss": 6.2707, + "grad_norm": 0.49609375, + "learning_rate": 0.001, + "loss": 4.2177, "step": 445 }, { "epoch": 0.05, - "grad_norm": 23.625, - "learning_rate": 1.0033444816053513e-05, - "loss": 6.2558, + "grad_norm": 0.59375, + "learning_rate": 0.001, + "loss": 4.1886, "step": 450 }, { "epoch": 0.05, - "grad_norm": 25.0, - "learning_rate": 1.0144927536231885e-05, - "loss": 6.3292, + "grad_norm": 0.58203125, + "learning_rate": 0.001, + "loss": 4.1948, "step": 455 }, { "epoch": 0.05, - "grad_norm": 21.125, - "learning_rate": 1.0256410256410256e-05, - "loss": 6.2039, + "grad_norm": 0.5390625, + "learning_rate": 0.001, + "loss": 4.2911, "step": 460 }, { "epoch": 0.05, - "grad_norm": 20.125, - "learning_rate": 1.036789297658863e-05, - "loss": 6.2238, + "grad_norm": 0.515625, + "learning_rate": 0.001, + "loss": 4.2212, "step": 465 }, { "epoch": 0.05, - "grad_norm": 20.75, - "learning_rate": 1.0479375696767002e-05, - "loss": 6.2397, + "grad_norm": 0.49609375, + "learning_rate": 0.001, + "loss": 4.1132, "step": 470 }, { "epoch": 0.05, - "grad_norm": 23.625, - "learning_rate": 1.0590858416945376e-05, - "loss": 6.184, + "grad_norm": 0.6796875, + "learning_rate": 0.001, + "loss": 4.1323, "step": 475 }, { "epoch": 0.05, - "grad_norm": 24.75, - "learning_rate": 1.0702341137123746e-05, - "loss": 6.1554, + "grad_norm": 0.5546875, + "learning_rate": 0.001, + "loss": 4.0926, "step": 480 }, { "epoch": 0.05, - "grad_norm": 27.125, - "learning_rate": 1.081382385730212e-05, - "loss": 6.2529, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 4.0398, "step": 485 }, { "epoch": 0.05, - "grad_norm": 26.5, - "learning_rate": 1.0925306577480492e-05, - "loss": 6.2227, + "grad_norm": 0.484375, + "learning_rate": 0.001, + "loss": 4.1016, "step": 490 }, { "epoch": 0.06, - "grad_norm": 24.5, - "learning_rate": 1.1036789297658862e-05, - "loss": 6.2164, + "grad_norm": 0.578125, + "learning_rate": 0.001, + "loss": 3.9908, "step": 495 }, { "epoch": 0.06, - "grad_norm": 16.375, - "learning_rate": 1.1148272017837236e-05, - "loss": 6.1305, + "grad_norm": 0.5, + "learning_rate": 0.001, + "loss": 4.1144, "step": 500 }, { "epoch": 0.06, - "grad_norm": 24.875, - "learning_rate": 1.1259754738015608e-05, - "loss": 6.1337, + "grad_norm": 0.59375, + "learning_rate": 0.001, + "loss": 4.0445, "step": 505 }, { "epoch": 0.06, - "grad_norm": 29.875, - "learning_rate": 1.137123745819398e-05, - "loss": 6.2112, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 4.0426, "step": 510 }, { "epoch": 0.06, - "grad_norm": 17.25, - "learning_rate": 1.1482720178372353e-05, - "loss": 6.1722, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 4.0339, "step": 515 }, { "epoch": 0.06, - "grad_norm": 25.5, - "learning_rate": 1.1594202898550726e-05, - "loss": 6.202, + "grad_norm": 0.55859375, + "learning_rate": 0.001, + "loss": 4.0517, "step": 520 }, { "epoch": 0.06, - "grad_norm": 31.5, - "learning_rate": 1.1705685618729099e-05, - "loss": 6.1368, + "grad_norm": 0.5625, + "learning_rate": 0.001, + "loss": 3.9779, "step": 525 }, { "epoch": 0.06, - "grad_norm": 19.125, - "learning_rate": 1.1817168338907469e-05, - "loss": 6.0854, + "grad_norm": 0.482421875, + "learning_rate": 0.001, + "loss": 3.9787, "step": 530 }, { "epoch": 0.06, - "grad_norm": 20.5, - "learning_rate": 1.1928651059085843e-05, - "loss": 6.1283, + "grad_norm": 0.494140625, + "learning_rate": 0.001, + "loss": 3.9433, "step": 535 }, { "epoch": 0.06, - "grad_norm": 23.25, - "learning_rate": 1.2040133779264217e-05, - "loss": 6.1011, + "grad_norm": 0.58984375, + "learning_rate": 0.001, + "loss": 3.9909, "step": 540 }, { "epoch": 0.06, - "grad_norm": 20.375, - "learning_rate": 1.2151616499442587e-05, - "loss": 6.0732, + "grad_norm": 0.498046875, + "learning_rate": 0.001, + "loss": 3.9582, "step": 545 }, { "epoch": 0.06, - "grad_norm": 24.625, - "learning_rate": 1.226309921962096e-05, - "loss": 6.0853, + "grad_norm": 0.453125, + "learning_rate": 0.001, + "loss": 3.996, "step": 550 }, { "epoch": 0.06, - "grad_norm": 26.0, - "learning_rate": 1.2374581939799333e-05, - "loss": 6.12, + "grad_norm": 0.546875, + "learning_rate": 0.001, + "loss": 4.0016, "step": 555 }, { "epoch": 0.06, - "grad_norm": 33.0, - "learning_rate": 1.2486064659977703e-05, - "loss": 6.1036, + "grad_norm": 0.64453125, + "learning_rate": 0.001, + "loss": 3.9915, "step": 560 }, { "epoch": 0.06, - "grad_norm": 21.875, - "learning_rate": 1.2597547380156077e-05, - "loss": 6.0976, + "grad_norm": 0.640625, + "learning_rate": 0.001, + "loss": 3.8457, "step": 565 }, { "epoch": 0.06, - "grad_norm": 23.875, - "learning_rate": 1.270903010033445e-05, - "loss": 5.9798, + "grad_norm": 0.5, + "learning_rate": 0.001, + "loss": 3.8556, "step": 570 }, { "epoch": 0.06, - "grad_norm": 25.25, - "learning_rate": 1.2820512820512823e-05, - "loss": 6.051, + "grad_norm": 0.578125, + "learning_rate": 0.001, + "loss": 3.9233, "step": 575 }, { "epoch": 0.06, - "grad_norm": 27.375, - "learning_rate": 1.2931995540691194e-05, - "loss": 6.0116, + "grad_norm": 0.62109375, + "learning_rate": 0.001, + "loss": 3.9413, "step": 580 }, { "epoch": 0.07, - "grad_norm": 22.5, - "learning_rate": 1.3043478260869566e-05, - "loss": 6.0917, + "grad_norm": 0.474609375, + "learning_rate": 0.001, + "loss": 3.968, "step": 585 }, { "epoch": 0.07, - "grad_norm": 21.25, - "learning_rate": 1.315496098104794e-05, - "loss": 6.0277, + "grad_norm": 0.486328125, + "learning_rate": 0.001, + "loss": 3.971, "step": 590 }, { "epoch": 0.07, - "grad_norm": 18.375, - "learning_rate": 1.326644370122631e-05, - "loss": 5.968, + "grad_norm": 0.4765625, + "learning_rate": 0.001, + "loss": 4.0015, "step": 595 }, { "epoch": 0.07, - "grad_norm": 30.125, - "learning_rate": 1.3377926421404684e-05, - "loss": 5.9714, + "grad_norm": 0.60546875, + "learning_rate": 0.001, + "loss": 3.8761, "step": 600 }, { "epoch": 0.07, - "grad_norm": 21.375, - "learning_rate": 1.3489409141583056e-05, - "loss": 5.967, + "grad_norm": 0.48828125, + "learning_rate": 0.001, + "loss": 3.8149, "step": 605 }, { "epoch": 0.07, - "grad_norm": 29.75, - "learning_rate": 1.3600891861761426e-05, - "loss": 5.9877, + "grad_norm": 0.5, + "learning_rate": 0.001, + "loss": 3.8305, "step": 610 }, { "epoch": 0.07, - "grad_norm": 25.375, - "learning_rate": 1.37123745819398e-05, - "loss": 5.9865, + "grad_norm": 0.498046875, + "learning_rate": 0.001, + "loss": 3.7626, "step": 615 }, { "epoch": 0.07, - "grad_norm": 21.75, - "learning_rate": 1.3823857302118174e-05, - "loss": 5.9437, + "grad_norm": 0.46875, + "learning_rate": 0.001, + "loss": 3.8127, "step": 620 }, { "epoch": 0.07, - "grad_norm": 27.0, - "learning_rate": 1.3935340022296546e-05, - "loss": 5.9155, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 3.8408, "step": 625 }, { "epoch": 0.07, - "grad_norm": 25.875, - "learning_rate": 1.4046822742474917e-05, - "loss": 5.9022, + "grad_norm": 0.4765625, + "learning_rate": 0.001, + "loss": 3.9199, "step": 630 }, { "epoch": 0.07, - "grad_norm": 23.75, - "learning_rate": 1.415830546265329e-05, - "loss": 5.9, + "grad_norm": 0.60546875, + "learning_rate": 0.001, + "loss": 3.8861, "step": 635 }, { "epoch": 0.07, - "grad_norm": 25.75, - "learning_rate": 1.4269788182831663e-05, - "loss": 5.9568, + "grad_norm": 0.451171875, + "learning_rate": 0.001, + "loss": 3.8037, "step": 640 }, { "epoch": 0.07, - "grad_norm": 30.375, - "learning_rate": 1.4381270903010035e-05, - "loss": 5.9495, + "grad_norm": 0.50390625, + "learning_rate": 0.001, + "loss": 3.8155, "step": 645 }, { "epoch": 0.07, - "grad_norm": 31.875, - "learning_rate": 1.4492753623188407e-05, - "loss": 5.8347, + "grad_norm": 0.5859375, + "learning_rate": 0.001, + "loss": 3.8044, "step": 650 }, { "epoch": 0.07, - "grad_norm": 24.125, - "learning_rate": 1.460423634336678e-05, - "loss": 5.8962, + "grad_norm": 0.4921875, + "learning_rate": 0.001, + "loss": 3.7604, "step": 655 }, { "epoch": 0.07, - "grad_norm": 27.625, - "learning_rate": 1.4715719063545151e-05, - "loss": 5.8729, + "grad_norm": 0.5390625, + "learning_rate": 0.001, + "loss": 3.7374, "step": 660 }, { "epoch": 0.07, - "grad_norm": 25.875, - "learning_rate": 1.4827201783723523e-05, - "loss": 5.8105, + "grad_norm": 0.54296875, + "learning_rate": 0.001, + "loss": 3.7128, "step": 665 }, { "epoch": 0.07, - "grad_norm": 36.25, - "learning_rate": 1.4938684503901897e-05, - "loss": 5.853, + "grad_norm": 0.4453125, + "learning_rate": 0.001, + "loss": 3.6821, "step": 670 }, { "epoch": 0.08, - "grad_norm": 40.75, - "learning_rate": 1.5050167224080269e-05, - "loss": 5.8977, + "grad_norm": 0.466796875, + "learning_rate": 0.001, + "loss": 3.689, "step": 675 }, { "epoch": 0.08, - "grad_norm": 25.875, - "learning_rate": 1.5161649944258641e-05, - "loss": 5.8372, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 3.7706, "step": 680 }, { "epoch": 0.08, - "grad_norm": 32.5, - "learning_rate": 1.5273132664437013e-05, - "loss": 5.824, + "grad_norm": 0.53125, + "learning_rate": 0.001, + "loss": 3.7512, "step": 685 }, { "epoch": 0.08, - "grad_norm": 28.25, - "learning_rate": 1.5384615384615387e-05, - "loss": 5.8085, + "grad_norm": 0.53515625, + "learning_rate": 0.001, + "loss": 3.687, "step": 690 }, { "epoch": 0.08, - "grad_norm": 26.25, - "learning_rate": 1.5496098104793758e-05, - "loss": 5.8476, + "grad_norm": 0.57421875, + "learning_rate": 0.001, + "loss": 3.6497, "step": 695 }, { "epoch": 0.08, - "grad_norm": 17.875, - "learning_rate": 1.560758082497213e-05, - "loss": 5.7549, + "grad_norm": 0.55859375, + "learning_rate": 0.001, + "loss": 3.7325, "step": 700 }, { "epoch": 0.08, - "grad_norm": 29.375, - "learning_rate": 1.5719063545150505e-05, - "loss": 5.8592, + "grad_norm": 0.59375, + "learning_rate": 0.001, + "loss": 3.6482, "step": 705 }, { "epoch": 0.08, - "grad_norm": 30.5, - "learning_rate": 1.5830546265328876e-05, - "loss": 5.781, + "grad_norm": 0.48046875, + "learning_rate": 0.001, + "loss": 3.7139, "step": 710 }, { "epoch": 0.08, - "grad_norm": 26.125, - "learning_rate": 1.5942028985507246e-05, - "loss": 5.715, + "grad_norm": 0.58984375, + "learning_rate": 0.001, + "loss": 3.6265, "step": 715 }, { "epoch": 0.08, - "grad_norm": 22.875, - "learning_rate": 1.605351170568562e-05, - "loss": 5.7379, + "grad_norm": 0.478515625, + "learning_rate": 0.001, + "loss": 3.6359, "step": 720 }, { "epoch": 0.08, - "grad_norm": 22.625, - "learning_rate": 1.6164994425863994e-05, - "loss": 5.7255, + "grad_norm": 0.4609375, + "learning_rate": 0.001, + "loss": 3.689, "step": 725 }, { "epoch": 0.08, - "grad_norm": 25.0, - "learning_rate": 1.6276477146042364e-05, - "loss": 5.7092, + "grad_norm": 0.4921875, + "learning_rate": 0.001, + "loss": 3.664, "step": 730 }, { "epoch": 0.08, - "grad_norm": 32.0, - "learning_rate": 1.6387959866220738e-05, - "loss": 5.6632, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 3.5956, "step": 735 }, { "epoch": 0.08, - "grad_norm": 32.25, - "learning_rate": 1.6499442586399112e-05, - "loss": 5.7087, + "grad_norm": 0.458984375, + "learning_rate": 0.001, + "loss": 3.6903, "step": 740 }, { "epoch": 0.08, - "grad_norm": 40.5, - "learning_rate": 1.6610925306577482e-05, - "loss": 5.7586, + "grad_norm": 0.47265625, + "learning_rate": 0.001, + "loss": 3.7775, "step": 745 }, { "epoch": 0.08, - "grad_norm": 29.25, - "learning_rate": 1.6722408026755853e-05, - "loss": 5.6654, + "grad_norm": 0.462890625, + "learning_rate": 0.001, + "loss": 3.604, "step": 750 }, { "epoch": 0.08, - "grad_norm": 31.625, - "learning_rate": 1.6833890746934227e-05, - "loss": 5.6465, + "grad_norm": 0.5390625, + "learning_rate": 0.001, + "loss": 3.5746, "step": 755 }, { "epoch": 0.08, - "grad_norm": 36.5, - "learning_rate": 1.6945373467112597e-05, - "loss": 5.7025, + "grad_norm": 0.455078125, + "learning_rate": 0.001, + "loss": 3.5907, "step": 760 }, { "epoch": 0.09, - "grad_norm": 22.625, - "learning_rate": 1.705685618729097e-05, - "loss": 5.6813, + "grad_norm": 0.41796875, + "learning_rate": 0.001, + "loss": 3.6306, "step": 765 }, { "epoch": 0.09, - "grad_norm": 25.375, - "learning_rate": 1.7168338907469345e-05, - "loss": 5.6159, + "grad_norm": 0.498046875, + "learning_rate": 0.001, + "loss": 3.7143, "step": 770 }, { "epoch": 0.09, - "grad_norm": 26.375, - "learning_rate": 1.727982162764772e-05, - "loss": 5.6733, + "grad_norm": 0.478515625, + "learning_rate": 0.001, + "loss": 3.6161, "step": 775 }, { "epoch": 0.09, - "grad_norm": 42.5, - "learning_rate": 1.739130434782609e-05, - "loss": 5.5965, + "grad_norm": 0.462890625, + "learning_rate": 0.001, + "loss": 3.6404, "step": 780 }, { "epoch": 0.09, - "grad_norm": 37.5, - "learning_rate": 1.750278706800446e-05, - "loss": 5.5587, + "grad_norm": 0.40625, + "learning_rate": 0.001, + "loss": 3.6052, "step": 785 }, { "epoch": 0.09, - "grad_norm": 31.375, - "learning_rate": 1.7614269788182833e-05, - "loss": 5.521, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 3.5776, "step": 790 }, { "epoch": 0.09, - "grad_norm": 44.5, - "learning_rate": 1.7725752508361204e-05, - "loss": 5.6231, + "grad_norm": 0.484375, + "learning_rate": 0.001, + "loss": 3.56, "step": 795 }, { "epoch": 0.09, - "grad_norm": 29.375, - "learning_rate": 1.7837235228539577e-05, - "loss": 5.6229, + "grad_norm": 0.484375, + "learning_rate": 0.001, + "loss": 3.5469, "step": 800 }, { "epoch": 0.09, - "grad_norm": 34.5, - "learning_rate": 1.794871794871795e-05, - "loss": 5.5861, + "grad_norm": 0.5625, + "learning_rate": 0.001, + "loss": 3.637, "step": 805 }, { "epoch": 0.09, - "grad_norm": 33.5, - "learning_rate": 1.806020066889632e-05, - "loss": 5.5641, + "grad_norm": 0.50390625, + "learning_rate": 0.001, + "loss": 3.582, "step": 810 }, { "epoch": 0.09, - "grad_norm": 37.25, - "learning_rate": 1.8171683389074695e-05, - "loss": 5.531, + "grad_norm": 0.515625, + "learning_rate": 0.001, + "loss": 3.6892, "step": 815 }, { "epoch": 0.09, - "grad_norm": 37.0, - "learning_rate": 1.828316610925307e-05, - "loss": 5.5316, + "grad_norm": 0.494140625, + "learning_rate": 0.001, + "loss": 3.5234, "step": 820 }, { "epoch": 0.09, - "grad_norm": 37.25, - "learning_rate": 1.839464882943144e-05, - "loss": 5.5453, + "grad_norm": 0.43359375, + "learning_rate": 0.001, + "loss": 3.5884, "step": 825 }, { "epoch": 0.09, - "grad_norm": 30.125, - "learning_rate": 1.850613154960981e-05, - "loss": 5.5278, + "grad_norm": 0.515625, + "learning_rate": 0.001, + "loss": 3.541, "step": 830 }, { "epoch": 0.09, - "grad_norm": 23.375, - "learning_rate": 1.8617614269788184e-05, - "loss": 5.4992, + "grad_norm": 0.474609375, + "learning_rate": 0.001, + "loss": 3.4975, "step": 835 }, { "epoch": 0.09, - "grad_norm": 44.0, - "learning_rate": 1.8729096989966558e-05, - "loss": 5.4551, + "grad_norm": 0.462890625, + "learning_rate": 0.001, + "loss": 3.5908, "step": 840 }, { "epoch": 0.09, - "grad_norm": 24.5, - "learning_rate": 1.8840579710144928e-05, - "loss": 5.5757, + "grad_norm": 0.4765625, + "learning_rate": 0.001, + "loss": 3.5859, "step": 845 }, { "epoch": 0.09, - "grad_norm": 34.0, - "learning_rate": 1.8952062430323302e-05, - "loss": 5.5208, + "grad_norm": 0.41796875, + "learning_rate": 0.001, + "loss": 3.4903, "step": 850 }, { "epoch": 0.1, - "grad_norm": 40.0, - "learning_rate": 1.9063545150501676e-05, - "loss": 5.4401, + "grad_norm": 0.49609375, + "learning_rate": 0.001, + "loss": 3.4214, "step": 855 }, { "epoch": 0.1, - "grad_norm": 36.25, - "learning_rate": 1.9175027870680046e-05, - "loss": 5.5302, + "grad_norm": 0.455078125, + "learning_rate": 0.001, + "loss": 3.4849, "step": 860 }, { "epoch": 0.1, - "grad_norm": 42.25, - "learning_rate": 1.9286510590858417e-05, - "loss": 5.5197, + "grad_norm": 0.451171875, + "learning_rate": 0.001, + "loss": 3.4287, "step": 865 }, { "epoch": 0.1, - "grad_norm": 59.25, - "learning_rate": 1.939799331103679e-05, - "loss": 5.5592, + "grad_norm": 0.50390625, + "learning_rate": 0.001, + "loss": 3.3918, "step": 870 }, { "epoch": 0.1, - "grad_norm": 44.25, - "learning_rate": 1.9509476031215164e-05, - "loss": 5.4991, + "grad_norm": 0.4921875, + "learning_rate": 0.001, + "loss": 3.4713, "step": 875 }, { "epoch": 0.1, - "grad_norm": 38.0, - "learning_rate": 1.9620958751393535e-05, - "loss": 5.438, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 3.4109, "step": 880 }, { "epoch": 0.1, - "grad_norm": 30.5, - "learning_rate": 1.973244147157191e-05, - "loss": 5.3923, + "grad_norm": 0.5625, + "learning_rate": 0.001, + "loss": 3.4379, "step": 885 }, { "epoch": 0.1, - "grad_norm": 28.375, - "learning_rate": 1.9843924191750282e-05, - "loss": 5.3843, + "grad_norm": 0.4921875, + "learning_rate": 0.001, + "loss": 3.4546, "step": 890 }, { "epoch": 0.1, - "grad_norm": 29.125, - "learning_rate": 1.9955406911928653e-05, - "loss": 5.4087, + "grad_norm": 0.482421875, + "learning_rate": 0.001, + "loss": 3.3923, "step": 895 }, { "epoch": 0.1, - "grad_norm": 40.5, - "learning_rate": 1.9999993183681055e-05, - "loss": 5.4785, + "grad_norm": 0.47265625, + "learning_rate": 0.001, + "loss": 3.4302, "step": 900 }, { "epoch": 0.1, - "grad_norm": 43.25, - "learning_rate": 1.9999951528432276e-05, - "loss": 5.3825, + "grad_norm": 0.494140625, + "learning_rate": 0.001, + "loss": 3.4475, "step": 905 }, { "epoch": 0.1, - "grad_norm": 40.75, - "learning_rate": 1.9999872004936127e-05, - "loss": 5.3783, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 3.3023, "step": 910 }, { "epoch": 0.1, - "grad_norm": 43.75, - "learning_rate": 1.9999754613493748e-05, - "loss": 5.4162, + "grad_norm": 0.439453125, + "learning_rate": 0.001, + "loss": 3.3485, "step": 915 }, { "epoch": 0.1, - "grad_norm": 43.75, - "learning_rate": 1.999959935454968e-05, - "loss": 5.3683, + "grad_norm": 0.4375, + "learning_rate": 0.001, + "loss": 3.3677, "step": 920 }, { "epoch": 0.1, - "grad_norm": 31.25, - "learning_rate": 1.9999406228691872e-05, - "loss": 5.3471, + "grad_norm": 0.52734375, + "learning_rate": 0.001, + "loss": 3.5076, "step": 925 }, { "epoch": 0.1, - "grad_norm": 31.625, - "learning_rate": 1.9999175236651652e-05, - "loss": 5.3723, + "grad_norm": 0.53515625, + "learning_rate": 0.001, + "loss": 3.421, "step": 930 }, { "epoch": 0.1, - "grad_norm": 42.0, - "learning_rate": 1.999890637930376e-05, - "loss": 5.3983, + "grad_norm": 0.474609375, + "learning_rate": 0.001, + "loss": 3.4751, "step": 935 }, { "epoch": 0.1, - "grad_norm": 30.625, - "learning_rate": 1.9998599657666307e-05, - "loss": 5.408, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 3.2864, "step": 940 }, { "epoch": 0.11, - "grad_norm": 28.875, - "learning_rate": 1.9998255072900807e-05, - "loss": 5.3377, + "grad_norm": 0.4453125, + "learning_rate": 0.001, + "loss": 3.3964, "step": 945 }, { "epoch": 0.11, - "grad_norm": 29.25, - "learning_rate": 1.9997872626312147e-05, - "loss": 5.4067, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 3.367, "step": 950 }, { "epoch": 0.11, - "grad_norm": 32.25, - "learning_rate": 1.999745231934859e-05, - "loss": 5.365, + "grad_norm": 0.45703125, + "learning_rate": 0.001, + "loss": 3.3819, "step": 955 }, { "epoch": 0.11, - "grad_norm": 26.375, - "learning_rate": 1.9996994153601777e-05, - "loss": 5.2902, + "grad_norm": 0.435546875, + "learning_rate": 0.001, + "loss": 3.2415, "step": 960 }, { "epoch": 0.11, - "grad_norm": 38.25, - "learning_rate": 1.9996498130806704e-05, - "loss": 5.3347, + "grad_norm": 0.48828125, + "learning_rate": 0.001, + "loss": 3.2919, "step": 965 }, { "epoch": 0.11, - "grad_norm": 53.75, - "learning_rate": 1.9995964252841735e-05, - "loss": 5.324, + "grad_norm": 0.47265625, + "learning_rate": 0.001, + "loss": 3.4433, "step": 970 }, { "epoch": 0.11, - "grad_norm": 46.0, - "learning_rate": 1.999539252172858e-05, - "loss": 5.3197, + "grad_norm": 0.470703125, + "learning_rate": 0.001, + "loss": 3.3476, "step": 975 }, { "epoch": 0.11, - "grad_norm": 28.625, - "learning_rate": 1.9994782939632298e-05, - "loss": 5.3279, + "grad_norm": 0.451171875, + "learning_rate": 0.001, + "loss": 3.2323, "step": 980 }, { "epoch": 0.11, - "grad_norm": 30.75, - "learning_rate": 1.9994135508861282e-05, - "loss": 5.3228, + "grad_norm": 0.5390625, + "learning_rate": 0.001, + "loss": 3.3249, "step": 985 }, { "epoch": 0.11, - "grad_norm": 38.75, - "learning_rate": 1.999345023186724e-05, - "loss": 5.2432, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 3.215, "step": 990 }, { "epoch": 0.11, - "grad_norm": 32.25, - "learning_rate": 1.999272711124522e-05, - "loss": 5.2532, + "grad_norm": 0.435546875, + "learning_rate": 0.001, + "loss": 3.2893, "step": 995 }, { "epoch": 0.11, - "grad_norm": 29.5, - "learning_rate": 1.9991966149733566e-05, - "loss": 5.2364, + "grad_norm": 0.48828125, + "learning_rate": 0.001, + "loss": 3.3386, "step": 1000 }, { "epoch": 0.11, - "grad_norm": 40.75, - "learning_rate": 1.9991167350213913e-05, - "loss": 5.24, + "grad_norm": 0.4921875, + "learning_rate": 0.001, + "loss": 3.2875, "step": 1005 }, { "epoch": 0.11, - "grad_norm": 38.0, - "learning_rate": 1.999033071571119e-05, - "loss": 5.3299, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 3.3014, "step": 1010 }, { "epoch": 0.11, - "grad_norm": 42.0, - "learning_rate": 1.998945624939361e-05, - "loss": 5.2602, + "grad_norm": 0.458984375, + "learning_rate": 0.001, + "loss": 3.176, "step": 1015 }, { "epoch": 0.11, - "grad_norm": 30.625, - "learning_rate": 1.998854395457263e-05, - "loss": 5.2489, + "grad_norm": 0.46484375, + "learning_rate": 0.001, + "loss": 3.2795, "step": 1020 }, { "epoch": 0.11, - "grad_norm": 39.25, - "learning_rate": 1.9987593834702972e-05, - "loss": 5.2893, + "grad_norm": 0.494140625, + "learning_rate": 0.001, + "loss": 3.2852, "step": 1025 }, { "epoch": 0.11, - "grad_norm": 28.625, - "learning_rate": 1.998660589338259e-05, - "loss": 5.2161, + "grad_norm": 0.5703125, + "learning_rate": 0.001, + "loss": 3.278, "step": 1030 }, { "epoch": 0.12, - "grad_norm": 26.375, - "learning_rate": 1.998558013435266e-05, - "loss": 5.1957, + "grad_norm": 0.54296875, + "learning_rate": 0.001, + "loss": 3.2953, "step": 1035 }, { "epoch": 0.12, - "grad_norm": 30.875, - "learning_rate": 1.998451656149757e-05, - "loss": 5.2335, + "grad_norm": 0.482421875, + "learning_rate": 0.001, + "loss": 3.217, "step": 1040 }, { "epoch": 0.12, - "grad_norm": 47.25, - "learning_rate": 1.998341517884491e-05, - "loss": 5.2359, + "grad_norm": 0.48046875, + "learning_rate": 0.001, + "loss": 3.1406, "step": 1045 }, { "epoch": 0.12, - "grad_norm": 35.25, - "learning_rate": 1.998227599056544e-05, - "loss": 5.2585, + "grad_norm": 0.439453125, + "learning_rate": 0.001, + "loss": 3.2346, "step": 1050 }, { "epoch": 0.12, - "grad_norm": 51.25, - "learning_rate": 1.9981099000973083e-05, - "loss": 5.2168, + "grad_norm": 0.451171875, + "learning_rate": 0.001, + "loss": 3.1984, "step": 1055 }, { "epoch": 0.12, - "grad_norm": 35.0, - "learning_rate": 1.9979884214524922e-05, - "loss": 5.1868, + "grad_norm": 0.49609375, + "learning_rate": 0.001, + "loss": 3.2343, "step": 1060 }, { "epoch": 0.12, - "grad_norm": 38.25, - "learning_rate": 1.9978631635821155e-05, - "loss": 5.2182, + "grad_norm": 0.455078125, + "learning_rate": 0.001, + "loss": 3.1503, "step": 1065 }, { "epoch": 0.12, - "grad_norm": 28.0, - "learning_rate": 1.9977341269605107e-05, - "loss": 5.2858, + "grad_norm": 0.427734375, + "learning_rate": 0.001, + "loss": 3.224, "step": 1070 }, { "epoch": 0.12, - "grad_norm": 25.625, - "learning_rate": 1.9976013120763187e-05, - "loss": 5.1941, + "grad_norm": 0.43359375, + "learning_rate": 0.001, + "loss": 3.1678, "step": 1075 }, { "epoch": 0.12, - "grad_norm": 38.0, - "learning_rate": 1.997464719432489e-05, - "loss": 5.1468, + "grad_norm": 0.451171875, + "learning_rate": 0.001, + "loss": 3.1891, "step": 1080 }, { "epoch": 0.12, - "grad_norm": 41.25, - "learning_rate": 1.9973243495462762e-05, - "loss": 5.1775, + "grad_norm": 0.515625, + "learning_rate": 0.001, + "loss": 3.197, "step": 1085 }, { "epoch": 0.12, - "grad_norm": 28.125, - "learning_rate": 1.99718020294924e-05, - "loss": 5.2217, + "grad_norm": 0.42578125, + "learning_rate": 0.001, + "loss": 3.1608, "step": 1090 }, { "epoch": 0.12, - "grad_norm": 49.75, - "learning_rate": 1.9970322801872394e-05, - "loss": 5.1177, + "grad_norm": 0.46484375, + "learning_rate": 0.001, + "loss": 3.2191, "step": 1095 }, { "epoch": 0.12, - "grad_norm": 67.5, - "learning_rate": 1.996880581820436e-05, - "loss": 5.2115, + "grad_norm": 0.50390625, + "learning_rate": 0.001, + "loss": 3.1926, "step": 1100 }, { "epoch": 0.12, - "grad_norm": 34.75, - "learning_rate": 1.9967251084232867e-05, - "loss": 5.1848, + "grad_norm": 0.474609375, + "learning_rate": 0.001, + "loss": 3.2302, "step": 1105 }, { "epoch": 0.12, - "grad_norm": 49.75, - "learning_rate": 1.9965658605845458e-05, - "loss": 5.2274, + "grad_norm": 0.466796875, + "learning_rate": 0.001, + "loss": 3.1557, "step": 1110 }, { "epoch": 0.12, - "grad_norm": 43.0, - "learning_rate": 1.996402838907259e-05, - "loss": 5.1731, + "grad_norm": 0.4453125, + "learning_rate": 0.001, + "loss": 3.1696, "step": 1115 }, { "epoch": 0.12, - "grad_norm": 30.75, - "learning_rate": 1.9962360440087644e-05, - "loss": 5.1872, + "grad_norm": 0.4921875, + "learning_rate": 0.001, + "loss": 3.2021, "step": 1120 }, { "epoch": 0.13, - "grad_norm": 60.75, - "learning_rate": 1.9960654765206882e-05, - "loss": 5.1681, + "grad_norm": 0.4609375, + "learning_rate": 0.001, + "loss": 3.1396, "step": 1125 }, { "epoch": 0.13, - "grad_norm": 74.5, - "learning_rate": 1.9958911370889425e-05, - "loss": 5.1957, + "grad_norm": 0.42578125, + "learning_rate": 0.001, + "loss": 3.126, "step": 1130 }, { "epoch": 0.13, - "grad_norm": 70.5, - "learning_rate": 1.995713026373723e-05, - "loss": 5.1432, + "grad_norm": 0.44921875, + "learning_rate": 0.001, + "loss": 3.1153, "step": 1135 }, { "epoch": 0.13, - "grad_norm": 119.0, - "learning_rate": 1.9955311450495075e-05, - "loss": 5.2231, + "grad_norm": 0.46484375, + "learning_rate": 0.001, + "loss": 3.2244, "step": 1140 }, { "epoch": 0.13, - "grad_norm": 56.0, - "learning_rate": 1.995345493805052e-05, - "loss": 5.1691, + "grad_norm": 0.44140625, + "learning_rate": 0.001, + "loss": 3.1862, "step": 1145 }, { "epoch": 0.13, - "grad_norm": 89.0, - "learning_rate": 1.995156073343388e-05, - "loss": 5.1904, + "grad_norm": 0.458984375, + "learning_rate": 0.001, + "loss": 3.1397, "step": 1150 }, { "epoch": 0.13, - "grad_norm": 138.0, - "learning_rate": 1.9949628843818223e-05, - "loss": 5.1577, + "grad_norm": 0.44921875, + "learning_rate": 0.001, + "loss": 3.1137, "step": 1155 }, { "epoch": 0.13, - "grad_norm": 179.0, - "learning_rate": 1.9947659276519303e-05, - "loss": 5.2386, + "grad_norm": 0.3984375, + "learning_rate": 0.001, + "loss": 3.0842, "step": 1160 }, { "epoch": 0.13, - "grad_norm": 82.5, - "learning_rate": 1.9945652038995564e-05, - "loss": 5.2989, + "grad_norm": 0.4765625, + "learning_rate": 0.001, + "loss": 3.1646, "step": 1165 }, { "epoch": 0.13, - "grad_norm": 75.5, - "learning_rate": 1.9943607138848096e-05, - "loss": 5.2105, + "grad_norm": 0.455078125, + "learning_rate": 0.001, + "loss": 3.1085, "step": 1170 }, { "epoch": 0.13, - "grad_norm": 58.0, - "learning_rate": 1.9941524583820624e-05, - "loss": 5.25, + "grad_norm": 0.431640625, + "learning_rate": 0.001, + "loss": 3.1618, "step": 1175 }, { "epoch": 0.13, - "grad_norm": 48.0, - "learning_rate": 1.993940438179945e-05, - "loss": 5.1375, + "grad_norm": 0.421875, + "learning_rate": 0.001, + "loss": 3.1494, "step": 1180 }, { "epoch": 0.13, - "grad_norm": 32.5, - "learning_rate": 1.993724654081345e-05, - "loss": 5.1981, + "grad_norm": 0.455078125, + "learning_rate": 0.001, + "loss": 3.1187, "step": 1185 }, { "epoch": 0.13, - "grad_norm": 29.25, - "learning_rate": 1.9935051069034023e-05, - "loss": 5.1697, + "grad_norm": 0.4921875, + "learning_rate": 0.001, + "loss": 3.1668, "step": 1190 }, { "epoch": 0.13, - "grad_norm": 30.0, - "learning_rate": 1.9932817974775084e-05, - "loss": 5.1373, + "grad_norm": 0.47265625, + "learning_rate": 0.001, + "loss": 3.0082, "step": 1195 }, { "epoch": 0.13, - "grad_norm": 67.5, - "learning_rate": 1.9930547266493008e-05, - "loss": 5.1737, + "grad_norm": 0.5234375, + "learning_rate": 0.001, + "loss": 3.0856, "step": 1200 }, { "epoch": 0.13, - "grad_norm": 37.5, - "learning_rate": 1.9928238952786607e-05, - "loss": 5.1334, + "grad_norm": 0.43359375, + "learning_rate": 0.001, + "loss": 3.0244, "step": 1205 }, { "epoch": 0.13, - "grad_norm": 31.5, - "learning_rate": 1.992589304239711e-05, - "loss": 5.1319, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 3.1119, "step": 1210 }, { "epoch": 0.14, - "grad_norm": 35.25, - "learning_rate": 1.9923509544208102e-05, - "loss": 5.1123, + "grad_norm": 0.423828125, + "learning_rate": 0.001, + "loss": 3.1893, "step": 1215 }, { "epoch": 0.14, - "grad_norm": 24.875, - "learning_rate": 1.9921088467245527e-05, - "loss": 5.1198, + "grad_norm": 0.46484375, + "learning_rate": 0.001, + "loss": 3.0317, "step": 1220 }, { "epoch": 0.14, - "grad_norm": 31.75, - "learning_rate": 1.9918629820677617e-05, - "loss": 5.0838, + "grad_norm": 0.498046875, + "learning_rate": 0.001, + "loss": 3.0754, "step": 1225 }, { "epoch": 0.14, - "grad_norm": 27.25, - "learning_rate": 1.9916133613814884e-05, - "loss": 5.0794, + "grad_norm": 0.4921875, + "learning_rate": 0.001, + "loss": 3.0712, "step": 1230 }, { "epoch": 0.14, - "grad_norm": 24.0, - "learning_rate": 1.991359985611007e-05, - "loss": 5.0729, + "grad_norm": 0.443359375, + "learning_rate": 0.001, + "loss": 3.0459, "step": 1235 }, { "epoch": 0.14, - "grad_norm": 31.0, - "learning_rate": 1.9911028557158116e-05, - "loss": 5.0976, + "grad_norm": 0.451171875, + "learning_rate": 0.001, + "loss": 3.1248, "step": 1240 }, { "epoch": 0.14, - "grad_norm": 34.5, - "learning_rate": 1.990841972669613e-05, - "loss": 5.0719, + "grad_norm": 0.515625, + "learning_rate": 0.001, + "loss": 3.0001, "step": 1245 }, { "epoch": 0.14, - "grad_norm": 33.0, - "learning_rate": 1.9905773374603344e-05, - "loss": 5.1073, + "grad_norm": 0.451171875, + "learning_rate": 0.001, + "loss": 3.0746, "step": 1250 }, { "epoch": 0.14, - "grad_norm": 43.25, - "learning_rate": 1.990308951090108e-05, - "loss": 5.019, + "grad_norm": 0.4921875, + "learning_rate": 0.001, + "loss": 3.0217, "step": 1255 }, { "epoch": 0.14, - "grad_norm": 32.5, - "learning_rate": 1.9900368145752706e-05, - "loss": 5.0418, + "grad_norm": 0.65234375, + "learning_rate": 0.001, + "loss": 3.0922, "step": 1260 }, { "epoch": 0.14, - "grad_norm": 29.875, - "learning_rate": 1.98976092894636e-05, - "loss": 4.9849, + "grad_norm": 0.478515625, + "learning_rate": 0.001, + "loss": 3.0769, "step": 1265 }, { "epoch": 0.14, - "grad_norm": 25.75, - "learning_rate": 1.989481295248113e-05, - "loss": 5.0978, + "grad_norm": 0.408203125, + "learning_rate": 0.001, + "loss": 3.0214, "step": 1270 }, { "epoch": 0.14, - "grad_norm": 35.75, - "learning_rate": 1.9891979145394572e-05, - "loss": 5.003, + "grad_norm": 0.42578125, + "learning_rate": 0.001, + "loss": 3.0603, "step": 1275 }, { "epoch": 0.14, - "grad_norm": 40.0, - "learning_rate": 1.988910787893512e-05, - "loss": 4.9988, + "grad_norm": 0.50390625, + "learning_rate": 0.001, + "loss": 3.0081, "step": 1280 }, { "epoch": 0.14, - "grad_norm": 27.625, - "learning_rate": 1.98861991639758e-05, - "loss": 4.9981, + "grad_norm": 0.419921875, + "learning_rate": 0.001, + "loss": 3.1073, "step": 1285 }, { "epoch": 0.14, - "grad_norm": 35.25, - "learning_rate": 1.9883253011531467e-05, - "loss": 4.9886, + "grad_norm": 0.400390625, + "learning_rate": 0.001, + "loss": 3.0161, "step": 1290 }, { "epoch": 0.14, - "grad_norm": 28.125, - "learning_rate": 1.9880269432758737e-05, - "loss": 5.0796, + "grad_norm": 0.4453125, + "learning_rate": 0.001, + "loss": 3.0399, "step": 1295 }, { "epoch": 0.14, - "grad_norm": 21.25, - "learning_rate": 1.987724843895595e-05, - "loss": 4.968, + "grad_norm": 0.427734375, + "learning_rate": 0.001, + "loss": 3.0176, "step": 1300 }, { "epoch": 0.15, - "grad_norm": 36.5, - "learning_rate": 1.9874190041563137e-05, - "loss": 5.0259, + "grad_norm": 0.466796875, + "learning_rate": 0.001, + "loss": 3.0438, "step": 1305 }, { "epoch": 0.15, - "grad_norm": 45.5, - "learning_rate": 1.987109425216197e-05, - "loss": 5.0353, + "grad_norm": 0.45703125, + "learning_rate": 0.001, + "loss": 2.9605, "step": 1310 }, { "epoch": 0.15, - "grad_norm": 43.25, - "learning_rate": 1.986796108247572e-05, - "loss": 4.9776, + "grad_norm": 0.490234375, + "learning_rate": 0.001, + "loss": 3.1799, "step": 1315 }, { "epoch": 0.15, - "grad_norm": 42.25, - "learning_rate": 1.9864790544369206e-05, - "loss": 4.9804, + "grad_norm": 0.4453125, + "learning_rate": 0.001, + "loss": 3.0367, "step": 1320 }, { "epoch": 0.15, - "grad_norm": 40.0, - "learning_rate": 1.986158264984876e-05, - "loss": 4.976, + "grad_norm": 0.48046875, + "learning_rate": 0.001, + "loss": 3.0772, "step": 1325 }, { "epoch": 0.15, - "grad_norm": 38.5, - "learning_rate": 1.9858337411062173e-05, - "loss": 4.9836, + "grad_norm": 0.453125, + "learning_rate": 0.001, + "loss": 2.9758, "step": 1330 }, { "epoch": 0.15, - "grad_norm": 29.625, - "learning_rate": 1.9855054840298656e-05, - "loss": 4.9848, + "grad_norm": 0.435546875, + "learning_rate": 0.001, + "loss": 2.9411, "step": 1335 }, { "epoch": 0.15, - "grad_norm": 24.5, - "learning_rate": 1.985173494998879e-05, - "loss": 5.0017, + "grad_norm": 0.419921875, + "learning_rate": 0.001, + "loss": 3.0461, "step": 1340 }, { "epoch": 0.15, - "grad_norm": 39.25, - "learning_rate": 1.9848377752704483e-05, - "loss": 4.8884, + "grad_norm": 0.466796875, + "learning_rate": 0.001, + "loss": 3.0639, "step": 1345 }, { "epoch": 0.15, - "grad_norm": 29.0, - "learning_rate": 1.9844983261158903e-05, - "loss": 4.977, + "grad_norm": 0.51171875, + "learning_rate": 0.001, + "loss": 3.0514, "step": 1350 }, { "epoch": 0.15, - "grad_norm": 30.125, - "learning_rate": 1.984155148820647e-05, - "loss": 4.8959, + "grad_norm": 0.396484375, + "learning_rate": 0.001, + "loss": 2.9288, "step": 1355 }, { "epoch": 0.15, - "grad_norm": 28.625, - "learning_rate": 1.9838082446842758e-05, - "loss": 4.9377, + "grad_norm": 0.44921875, + "learning_rate": 0.001, + "loss": 2.9909, "step": 1360 }, { "epoch": 0.15, - "grad_norm": 29.375, - "learning_rate": 1.983457615020449e-05, - "loss": 4.9921, + "grad_norm": 0.416015625, + "learning_rate": 0.001, + "loss": 3.0404, "step": 1365 }, { "epoch": 0.15, - "grad_norm": 40.5, - "learning_rate": 1.9831032611569458e-05, - "loss": 4.9766, + "grad_norm": 0.4296875, + "learning_rate": 0.001, + "loss": 2.9636, "step": 1370 }, { "epoch": 0.15, - "grad_norm": 36.5, - "learning_rate": 1.982745184435649e-05, - "loss": 4.9295, + "grad_norm": 0.44921875, + "learning_rate": 0.001, + "loss": 2.9691, "step": 1375 }, { "epoch": 0.15, - "grad_norm": 24.375, - "learning_rate": 1.982383386212539e-05, - "loss": 4.9953, + "grad_norm": 0.41015625, + "learning_rate": 0.001, + "loss": 3.0421, "step": 1380 }, { "epoch": 0.15, - "grad_norm": 28.625, - "learning_rate": 1.982017867857688e-05, - "loss": 4.9461, + "grad_norm": 0.453125, + "learning_rate": 0.001, + "loss": 3.0106, "step": 1385 }, { "epoch": 0.15, - "grad_norm": 24.875, - "learning_rate": 1.9816486307552575e-05, - "loss": 4.9479, + "grad_norm": 0.408203125, + "learning_rate": 0.001, + "loss": 2.9206, "step": 1390 }, { "epoch": 0.16, - "grad_norm": 23.625, - "learning_rate": 1.9812756763034903e-05, - "loss": 4.9738, + "grad_norm": 0.412109375, + "learning_rate": 0.001, + "loss": 2.9851, "step": 1395 }, { "epoch": 0.16, - "grad_norm": 34.0, - "learning_rate": 1.9808990059147066e-05, - "loss": 4.9395, + "grad_norm": 0.427734375, + "learning_rate": 0.001, + "loss": 2.8973, "step": 1400 }, { "epoch": 0.16, - "grad_norm": 37.5, - "learning_rate": 1.9805186210152977e-05, - "loss": 4.9216, + "grad_norm": 0.4453125, + "learning_rate": 0.001, + "loss": 2.9926, "step": 1405 }, { "epoch": 0.16, - "grad_norm": 46.75, - "learning_rate": 1.9801345230457214e-05, - "loss": 4.9212, + "grad_norm": 0.40625, + "learning_rate": 0.001, + "loss": 2.9713, "step": 1410 }, { "epoch": 0.16, - "grad_norm": 30.0, - "learning_rate": 1.9797467134604968e-05, - "loss": 4.9848, + "grad_norm": 0.5, + "learning_rate": 0.001, + "loss": 2.9641, "step": 1415 }, { "epoch": 0.16, - "grad_norm": 43.25, - "learning_rate": 1.979355193728198e-05, - "loss": 4.9646, + "grad_norm": 0.431640625, + "learning_rate": 0.001, + "loss": 2.9165, "step": 1420 }, { "epoch": 0.16, - "grad_norm": 29.875, - "learning_rate": 1.9789599653314484e-05, - "loss": 4.8391, + "grad_norm": 0.42578125, + "learning_rate": 0.001, + "loss": 2.9043, "step": 1425 }, { "epoch": 0.16, - "grad_norm": 39.5, - "learning_rate": 1.9785610297669163e-05, - "loss": 4.9009, + "grad_norm": 0.4140625, + "learning_rate": 0.001, + "loss": 3.0078, "step": 1430 }, { "epoch": 0.16, - "grad_norm": 29.125, - "learning_rate": 1.978158388545308e-05, - "loss": 4.9264, + "grad_norm": 0.40625, + "learning_rate": 0.001, + "loss": 2.9917, "step": 1435 }, { "epoch": 0.16, - "grad_norm": 32.25, - "learning_rate": 1.9777520431913626e-05, - "loss": 4.9447, + "grad_norm": 0.51953125, + "learning_rate": 0.001, + "loss": 2.9517, "step": 1440 }, { "epoch": 0.16, - "grad_norm": 27.875, - "learning_rate": 1.977341995243846e-05, - "loss": 4.8708, + "grad_norm": 0.431640625, + "learning_rate": 0.001, + "loss": 2.9723, "step": 1445 }, { "epoch": 0.16, - "grad_norm": 40.75, - "learning_rate": 1.9769282462555458e-05, - "loss": 4.8212, + "grad_norm": 0.4140625, + "learning_rate": 0.001, + "loss": 2.8912, "step": 1450 }, { "epoch": 0.16, - "grad_norm": 30.0, - "learning_rate": 1.9765107977932644e-05, - "loss": 4.8624, + "grad_norm": 0.439453125, + "learning_rate": 0.001, + "loss": 2.997, "step": 1455 }, { "epoch": 0.16, - "grad_norm": 30.75, - "learning_rate": 1.9760896514378133e-05, - "loss": 4.8255, + "grad_norm": 0.419921875, + "learning_rate": 0.001, + "loss": 2.8812, "step": 1460 }, { "epoch": 0.16, - "grad_norm": 33.5, - "learning_rate": 1.9756648087840076e-05, - "loss": 4.8653, + "grad_norm": 0.396484375, + "learning_rate": 0.001, + "loss": 2.9449, "step": 1465 }, { "epoch": 0.16, - "grad_norm": 34.5, - "learning_rate": 1.9752362714406605e-05, - "loss": 4.9149, + "grad_norm": 0.400390625, + "learning_rate": 0.001, + "loss": 2.8727, "step": 1470 }, { "epoch": 0.16, - "grad_norm": 28.125, - "learning_rate": 1.9748040410305752e-05, - "loss": 4.9303, + "grad_norm": 0.4296875, + "learning_rate": 0.001, + "loss": 2.9621, "step": 1475 }, { "epoch": 0.17, - "grad_norm": 31.0, - "learning_rate": 1.9743681191905404e-05, - "loss": 4.8612, + "grad_norm": 0.3984375, + "learning_rate": 0.001, + "loss": 2.8718, "step": 1480 }, { "epoch": 0.17, - "grad_norm": 25.75, - "learning_rate": 1.9739285075713238e-05, - "loss": 4.9553, + "grad_norm": 0.412109375, + "learning_rate": 0.001, + "loss": 2.924, "step": 1485 }, { "epoch": 0.17, - "grad_norm": 26.375, - "learning_rate": 1.9734852078376655e-05, - "loss": 4.9717, + "grad_norm": 0.431640625, + "learning_rate": 0.001, + "loss": 2.975, "step": 1490 }, { "epoch": 0.17, - "grad_norm": 28.875, - "learning_rate": 1.973038221668272e-05, - "loss": 4.8671, + "grad_norm": 0.373046875, + "learning_rate": 0.001, + "loss": 2.8352, "step": 1495 }, { "epoch": 0.17, - "grad_norm": 31.5, - "learning_rate": 1.9725875507558105e-05, - "loss": 4.841, + "grad_norm": 0.4375, + "learning_rate": 0.001, + "loss": 2.8661, "step": 1500 }, { "epoch": 0.17, - "grad_norm": 33.25, - "learning_rate": 1.9721331968069e-05, - "loss": 4.8898, + "grad_norm": 0.41796875, + "learning_rate": 0.001, + "loss": 2.8946, "step": 1505 }, { "epoch": 0.17, - "grad_norm": 26.25, - "learning_rate": 1.9716751615421085e-05, - "loss": 4.8337, + "grad_norm": 0.42578125, + "learning_rate": 0.001, + "loss": 2.8299, "step": 1510 }, { "epoch": 0.17, - "grad_norm": 34.0, - "learning_rate": 1.971213446695943e-05, - "loss": 4.9062, + "grad_norm": 0.439453125, + "learning_rate": 0.001, + "loss": 2.8354, "step": 1515 }, { "epoch": 0.17, - "grad_norm": 35.75, - "learning_rate": 1.9707480540168458e-05, - "loss": 4.8542, + "grad_norm": 0.455078125, + "learning_rate": 0.001, + "loss": 2.8615, "step": 1520 }, { "epoch": 0.17, - "grad_norm": 30.875, - "learning_rate": 1.9702789852671853e-05, - "loss": 4.8385, + "grad_norm": 0.3984375, + "learning_rate": 0.001, + "loss": 2.858, "step": 1525 }, { "epoch": 0.17, - "grad_norm": 34.0, - "learning_rate": 1.9698062422232517e-05, - "loss": 4.8726, + "grad_norm": 0.43359375, + "learning_rate": 0.001, + "loss": 2.9946, "step": 1530 }, { "epoch": 0.17, - "grad_norm": 41.75, - "learning_rate": 1.9693298266752482e-05, - "loss": 4.9083, + "grad_norm": 0.470703125, + "learning_rate": 0.001, + "loss": 2.9449, "step": 1535 }, { "epoch": 0.17, - "grad_norm": 44.25, - "learning_rate": 1.9688497404272862e-05, - "loss": 4.8042, + "grad_norm": 0.447265625, + "learning_rate": 0.001, + "loss": 2.8862, "step": 1540 }, { "epoch": 0.17, - "grad_norm": 47.5, - "learning_rate": 1.9683659852973773e-05, - "loss": 4.8553, + "grad_norm": 0.404296875, + "learning_rate": 0.001, + "loss": 2.8575, "step": 1545 }, { "epoch": 0.17, - "grad_norm": 52.75, - "learning_rate": 1.9678785631174255e-05, - "loss": 4.8468, + "grad_norm": 0.359375, + "learning_rate": 0.001, + "loss": 2.8596, "step": 1550 }, { "epoch": 0.17, - "grad_norm": 36.5, - "learning_rate": 1.9673874757332226e-05, - "loss": 4.7812, + "grad_norm": 0.419921875, + "learning_rate": 0.001, + "loss": 2.9418, "step": 1555 }, { "epoch": 0.17, - "grad_norm": 29.25, - "learning_rate": 1.9668927250044395e-05, - "loss": 4.754, + "grad_norm": 0.4140625, + "learning_rate": 0.001, + "loss": 2.8957, "step": 1560 }, { "epoch": 0.17, - "grad_norm": 36.75, - "learning_rate": 1.9663943128046192e-05, - "loss": 4.8048, + "grad_norm": 0.4296875, + "learning_rate": 0.001, + "loss": 2.8291, "step": 1565 }, { "epoch": 0.18, - "grad_norm": 31.875, - "learning_rate": 1.965892241021171e-05, - "loss": 4.8343, + "grad_norm": 0.455078125, + "learning_rate": 0.001, + "loss": 2.8985, "step": 1570 }, { "epoch": 0.18, - "grad_norm": 28.625, - "learning_rate": 1.9653865115553613e-05, - "loss": 4.8158, + "grad_norm": 0.439453125, + "learning_rate": 0.001, + "loss": 2.9283, "step": 1575 }, { "epoch": 0.18, - "grad_norm": 32.75, - "learning_rate": 1.964877126322309e-05, - "loss": 4.8319, + "grad_norm": 0.416015625, + "learning_rate": 0.001, + "loss": 2.9447, "step": 1580 }, { "epoch": 0.18, - "grad_norm": 22.875, - "learning_rate": 1.9643640872509756e-05, - "loss": 4.7906, + "grad_norm": 0.42578125, + "learning_rate": 0.001, + "loss": 2.9154, "step": 1585 }, { "epoch": 0.18, - "grad_norm": 26.625, - "learning_rate": 1.9638473962841594e-05, - "loss": 4.783, + "grad_norm": 0.470703125, + "learning_rate": 0.001, + "loss": 2.9456, "step": 1590 }, { "epoch": 0.18, - "grad_norm": 29.25, - "learning_rate": 1.9633270553784888e-05, - "loss": 4.8572, + "grad_norm": 0.439453125, + "learning_rate": 0.001, + "loss": 2.8641, "step": 1595 }, { "epoch": 0.18, - "grad_norm": 27.25, - "learning_rate": 1.9628030665044118e-05, - "loss": 4.7619, + "grad_norm": 0.41796875, + "learning_rate": 0.001, + "loss": 2.996, "step": 1600 }, { "epoch": 0.18, - "grad_norm": 28.5, - "learning_rate": 1.9622754316461936e-05, - "loss": 4.7988, + "grad_norm": 0.431640625, + "learning_rate": 0.001, + "loss": 2.9102, "step": 1605 }, { "epoch": 0.18, - "grad_norm": 26.75, - "learning_rate": 1.9617441528019037e-05, - "loss": 4.7544, + "grad_norm": 0.416015625, + "learning_rate": 0.001, + "loss": 2.8548, "step": 1610 }, { "epoch": 0.18, - "grad_norm": 36.5, - "learning_rate": 1.961209231983412e-05, - "loss": 4.7597, + "grad_norm": 0.423828125, + "learning_rate": 0.001, + "loss": 2.8116, "step": 1615 }, { "epoch": 0.18, - "grad_norm": 31.5, - "learning_rate": 1.9606706712163794e-05, - "loss": 4.8483, + "grad_norm": 0.39453125, + "learning_rate": 0.001, + "loss": 2.9482, "step": 1620 }, { "epoch": 0.18, - "grad_norm": 27.875, - "learning_rate": 1.960128472540251e-05, - "loss": 4.8134, + "grad_norm": 0.4921875, + "learning_rate": 0.001, + "loss": 2.8992, "step": 1625 }, { "epoch": 0.18, - "grad_norm": 37.0, - "learning_rate": 1.959582638008249e-05, - "loss": 4.7172, + "grad_norm": 0.3984375, + "learning_rate": 0.001, + "loss": 2.8682, "step": 1630 }, { "epoch": 0.18, - "grad_norm": 55.75, - "learning_rate": 1.9590331696873622e-05, - "loss": 4.759, + "grad_norm": 0.412109375, + "learning_rate": 0.001, + "loss": 2.8866, "step": 1635 }, { "epoch": 0.18, - "grad_norm": 50.5, - "learning_rate": 1.9584800696583412e-05, - "loss": 4.8185, + "grad_norm": 0.421875, + "learning_rate": 0.001, + "loss": 2.8674, "step": 1640 }, { "epoch": 0.18, - "grad_norm": 33.25, - "learning_rate": 1.9579233400156888e-05, - "loss": 4.8496, + "grad_norm": 0.451171875, + "learning_rate": 0.001, + "loss": 2.8261, "step": 1645 }, { "epoch": 0.18, - "grad_norm": 41.5, - "learning_rate": 1.9573629828676535e-05, - "loss": 4.8197, + "grad_norm": 0.3984375, + "learning_rate": 0.001, + "loss": 3.0015, "step": 1650 }, { "epoch": 0.18, - "grad_norm": 43.25, - "learning_rate": 1.9567990003362188e-05, - "loss": 4.7548, + "grad_norm": 0.4453125, + "learning_rate": 0.001, + "loss": 2.859, "step": 1655 }, { "epoch": 0.19, - "grad_norm": 31.5, - "learning_rate": 1.9562313945570993e-05, - "loss": 4.7551, + "grad_norm": 0.451171875, + "learning_rate": 0.001, + "loss": 2.8403, "step": 1660 }, { "epoch": 0.19, - "grad_norm": 25.625, - "learning_rate": 1.955660167679728e-05, - "loss": 4.7709, + "grad_norm": 0.408203125, + "learning_rate": 0.001, + "loss": 2.8866, "step": 1665 }, { "epoch": 0.19, - "grad_norm": 31.75, - "learning_rate": 1.955085321867252e-05, - "loss": 4.8556, + "grad_norm": 0.40234375, + "learning_rate": 0.001, + "loss": 2.8863, "step": 1670 }, { "epoch": 0.19, - "grad_norm": 38.0, - "learning_rate": 1.954506859296522e-05, - "loss": 4.7994, + "grad_norm": 0.435546875, + "learning_rate": 0.001, + "loss": 2.8138, "step": 1675 }, { "epoch": 0.19, - "grad_norm": 45.75, - "learning_rate": 1.9539247821580853e-05, - "loss": 4.7625, + "grad_norm": 0.390625, + "learning_rate": 0.001, + "loss": 2.9055, "step": 1680 }, { "epoch": 0.19, - "grad_norm": 33.25, - "learning_rate": 1.9533390926561763e-05, - "loss": 4.7402, + "grad_norm": 0.494140625, + "learning_rate": 0.001, + "loss": 2.8598, "step": 1685 }, { "epoch": 0.19, - "grad_norm": 38.25, - "learning_rate": 1.952749793008709e-05, - "loss": 4.7518, + "grad_norm": 0.40234375, + "learning_rate": 0.001, + "loss": 2.8609, "step": 1690 }, { "epoch": 0.19, - "grad_norm": 30.0, - "learning_rate": 1.9521568854472694e-05, - "loss": 4.7366, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.7287, "step": 1695 }, { "epoch": 0.19, - "grad_norm": 32.75, - "learning_rate": 1.951560372217105e-05, - "loss": 4.7395, + "grad_norm": 0.43359375, + "learning_rate": 0.001, + "loss": 2.8634, "step": 1700 }, { "epoch": 0.19, - "grad_norm": 33.0, - "learning_rate": 1.950960255577118e-05, - "loss": 4.8346, + "grad_norm": 0.419921875, + "learning_rate": 0.001, + "loss": 2.8631, "step": 1705 }, { "epoch": 0.19, - "grad_norm": 31.375, - "learning_rate": 1.9503565377998558e-05, - "loss": 4.7899, + "grad_norm": 0.43359375, + "learning_rate": 0.001, + "loss": 2.8012, "step": 1710 }, { "epoch": 0.19, - "grad_norm": 32.5, - "learning_rate": 1.9497492211715027e-05, - "loss": 4.7473, + "grad_norm": 0.435546875, + "learning_rate": 0.001, + "loss": 2.8285, "step": 1715 }, { "epoch": 0.19, - "grad_norm": 28.75, - "learning_rate": 1.949138307991872e-05, - "loss": 4.6899, + "grad_norm": 0.439453125, + "learning_rate": 0.001, + "loss": 2.8212, "step": 1720 }, { "epoch": 0.19, - "grad_norm": 31.875, - "learning_rate": 1.9485238005743956e-05, - "loss": 4.8114, + "grad_norm": 0.4140625, + "learning_rate": 0.001, + "loss": 2.8448, "step": 1725 }, { "epoch": 0.19, - "grad_norm": 32.25, - "learning_rate": 1.947905701246117e-05, - "loss": 4.6848, + "grad_norm": 0.453125, + "learning_rate": 0.001, + "loss": 2.8296, "step": 1730 }, { "epoch": 0.19, - "grad_norm": 29.375, - "learning_rate": 1.947284012347681e-05, - "loss": 4.7991, + "grad_norm": 0.412109375, + "learning_rate": 0.001, + "loss": 2.8172, "step": 1735 }, { "epoch": 0.19, - "grad_norm": 31.5, - "learning_rate": 1.9466587362333263e-05, - "loss": 4.7457, + "grad_norm": 0.416015625, + "learning_rate": 0.001, + "loss": 2.8287, "step": 1740 }, { "epoch": 0.19, - "grad_norm": 26.625, - "learning_rate": 1.946029875270875e-05, - "loss": 4.726, + "grad_norm": 0.396484375, + "learning_rate": 0.001, + "loss": 2.8966, "step": 1745 }, { "epoch": 0.2, - "grad_norm": 29.0, - "learning_rate": 1.945397431841725e-05, - "loss": 4.6879, + "grad_norm": 0.462890625, + "learning_rate": 0.001, + "loss": 2.7409, "step": 1750 }, { "epoch": 0.2, - "grad_norm": 31.875, - "learning_rate": 1.944761408340841e-05, - "loss": 4.6809, + "grad_norm": 0.421875, + "learning_rate": 0.001, + "loss": 2.8212, "step": 1755 }, { "epoch": 0.2, - "grad_norm": 32.75, - "learning_rate": 1.9441218071767433e-05, - "loss": 4.7246, + "grad_norm": 0.40234375, + "learning_rate": 0.001, + "loss": 2.7865, "step": 1760 }, { "epoch": 0.2, - "grad_norm": 37.25, - "learning_rate": 1.9434786307715015e-05, - "loss": 4.6855, + "grad_norm": 0.423828125, + "learning_rate": 0.001, + "loss": 2.8383, "step": 1765 }, { "epoch": 0.2, - "grad_norm": 50.0, - "learning_rate": 1.942831881560724e-05, - "loss": 4.6621, + "grad_norm": 0.421875, + "learning_rate": 0.001, + "loss": 2.8843, "step": 1770 }, { "epoch": 0.2, - "grad_norm": 43.75, - "learning_rate": 1.9421815619935484e-05, - "loss": 4.7311, + "grad_norm": 0.439453125, + "learning_rate": 0.001, + "loss": 2.7954, "step": 1775 }, { "epoch": 0.2, - "grad_norm": 38.0, - "learning_rate": 1.9415276745326328e-05, - "loss": 4.769, + "grad_norm": 0.404296875, + "learning_rate": 0.001, + "loss": 2.7362, "step": 1780 }, { "epoch": 0.2, - "grad_norm": 67.0, - "learning_rate": 1.940870221654146e-05, - "loss": 4.7451, + "grad_norm": 0.41796875, + "learning_rate": 0.001, + "loss": 2.7698, "step": 1785 }, { "epoch": 0.2, - "grad_norm": 41.25, - "learning_rate": 1.9402092058477596e-05, - "loss": 4.6838, + "grad_norm": 0.4921875, + "learning_rate": 0.001, + "loss": 2.9273, "step": 1790 }, { "epoch": 0.2, - "grad_norm": 33.75, - "learning_rate": 1.9395446296166364e-05, - "loss": 4.8233, + "grad_norm": 0.400390625, + "learning_rate": 0.001, + "loss": 2.774, "step": 1795 }, { "epoch": 0.2, - "grad_norm": 46.75, - "learning_rate": 1.9388764954774225e-05, - "loss": 4.6763, + "grad_norm": 0.40234375, + "learning_rate": 0.001, + "loss": 2.842, "step": 1800 }, { "epoch": 0.2, - "grad_norm": 49.0, - "learning_rate": 1.9382048059602368e-05, - "loss": 4.7354, + "grad_norm": 0.443359375, + "learning_rate": 0.001, + "loss": 2.7766, "step": 1805 }, { "epoch": 0.2, - "grad_norm": 36.75, - "learning_rate": 1.937529563608662e-05, - "loss": 4.8521, + "grad_norm": 0.462890625, + "learning_rate": 0.001, + "loss": 2.7165, "step": 1810 }, { "epoch": 0.2, - "grad_norm": 31.0, - "learning_rate": 1.9368507709797352e-05, - "loss": 4.6896, + "grad_norm": 0.427734375, + "learning_rate": 0.001, + "loss": 2.8396, "step": 1815 }, { "epoch": 0.2, - "grad_norm": 30.625, - "learning_rate": 1.9361684306439377e-05, - "loss": 4.7905, + "grad_norm": 0.4140625, + "learning_rate": 0.001, + "loss": 2.7133, "step": 1820 }, { "epoch": 0.2, - "grad_norm": 25.75, - "learning_rate": 1.935482545185185e-05, - "loss": 4.7496, + "grad_norm": 0.3984375, + "learning_rate": 0.001, + "loss": 2.7195, "step": 1825 }, { "epoch": 0.2, - "grad_norm": 38.25, - "learning_rate": 1.934793117200818e-05, - "loss": 4.7733, + "grad_norm": 0.47265625, + "learning_rate": 0.001, + "loss": 2.7769, "step": 1830 }, { "epoch": 0.2, - "grad_norm": 31.625, - "learning_rate": 1.934100149301592e-05, - "loss": 4.7399, + "grad_norm": 0.416015625, + "learning_rate": 0.001, + "loss": 2.8127, "step": 1835 }, { "epoch": 0.21, - "grad_norm": 24.75, - "learning_rate": 1.9334036441116673e-05, - "loss": 4.7555, + "grad_norm": 0.435546875, + "learning_rate": 0.001, + "loss": 2.7074, "step": 1840 }, { "epoch": 0.21, - "grad_norm": 45.5, - "learning_rate": 1.932703604268601e-05, - "loss": 4.6672, + "grad_norm": 0.380859375, + "learning_rate": 0.001, + "loss": 2.8092, "step": 1845 }, { "epoch": 0.21, - "grad_norm": 38.75, - "learning_rate": 1.9320000324233334e-05, - "loss": 4.6448, + "grad_norm": 0.4375, + "learning_rate": 0.001, + "loss": 2.8355, "step": 1850 }, { "epoch": 0.21, - "grad_norm": 43.5, - "learning_rate": 1.9312929312401806e-05, - "loss": 4.6889, + "grad_norm": 0.412109375, + "learning_rate": 0.001, + "loss": 2.7602, "step": 1855 }, { "epoch": 0.21, - "grad_norm": 31.125, - "learning_rate": 1.9305823033968237e-05, - "loss": 4.6852, + "grad_norm": 0.40234375, + "learning_rate": 0.001, + "loss": 2.7212, "step": 1860 }, { "epoch": 0.21, - "grad_norm": 32.5, - "learning_rate": 1.9298681515842994e-05, - "loss": 4.682, + "grad_norm": 0.388671875, + "learning_rate": 0.001, + "loss": 2.8304, "step": 1865 }, { "epoch": 0.21, - "grad_norm": 31.75, - "learning_rate": 1.929150478506988e-05, - "loss": 4.7149, + "grad_norm": 0.40234375, + "learning_rate": 0.001, + "loss": 2.8164, "step": 1870 }, { "epoch": 0.21, - "grad_norm": 23.125, - "learning_rate": 1.9284292868826054e-05, - "loss": 4.7104, + "grad_norm": 0.40625, + "learning_rate": 0.001, + "loss": 2.8169, "step": 1875 }, { "epoch": 0.21, - "grad_norm": 32.0, - "learning_rate": 1.9277045794421903e-05, - "loss": 4.6934, + "grad_norm": 0.388671875, + "learning_rate": 0.001, + "loss": 2.7487, "step": 1880 }, { "epoch": 0.21, - "grad_norm": 41.0, - "learning_rate": 1.9269763589300972e-05, - "loss": 4.7288, + "grad_norm": 0.40234375, + "learning_rate": 0.001, + "loss": 2.7734, "step": 1885 }, { "epoch": 0.21, - "grad_norm": 42.25, - "learning_rate": 1.926244628103982e-05, - "loss": 4.6974, + "grad_norm": 0.4453125, + "learning_rate": 0.001, + "loss": 2.7757, "step": 1890 }, { "epoch": 0.21, - "grad_norm": 33.5, - "learning_rate": 1.9255093897347947e-05, - "loss": 4.6671, + "grad_norm": 0.40234375, + "learning_rate": 0.001, + "loss": 2.8023, "step": 1895 }, { "epoch": 0.21, - "grad_norm": 30.875, - "learning_rate": 1.9247706466067677e-05, - "loss": 4.6992, + "grad_norm": 0.376953125, + "learning_rate": 0.001, + "loss": 2.7523, "step": 1900 }, { "epoch": 0.21, - "grad_norm": 39.0, - "learning_rate": 1.9240284015174056e-05, - "loss": 4.7779, + "grad_norm": 0.390625, + "learning_rate": 0.001, + "loss": 2.818, "step": 1905 }, { "epoch": 0.21, - "grad_norm": 32.25, - "learning_rate": 1.9232826572774735e-05, - "loss": 4.6719, + "grad_norm": 0.41015625, + "learning_rate": 0.001, + "loss": 2.7938, "step": 1910 }, { "epoch": 0.21, - "grad_norm": 35.75, - "learning_rate": 1.9225334167109887e-05, - "loss": 4.7321, + "grad_norm": 0.41015625, + "learning_rate": 0.001, + "loss": 2.7914, "step": 1915 }, { "epoch": 0.21, - "grad_norm": 29.25, - "learning_rate": 1.9217806826552067e-05, - "loss": 4.7423, + "grad_norm": 0.431640625, + "learning_rate": 0.001, + "loss": 2.7482, "step": 1920 }, { "epoch": 0.21, - "grad_norm": 30.375, - "learning_rate": 1.9210244579606138e-05, - "loss": 4.7124, + "grad_norm": 0.474609375, + "learning_rate": 0.001, + "loss": 2.7364, "step": 1925 }, { "epoch": 0.22, - "grad_norm": 37.0, - "learning_rate": 1.920264745490914e-05, - "loss": 4.708, + "grad_norm": 0.38671875, + "learning_rate": 0.001, + "loss": 2.7383, "step": 1930 }, { "epoch": 0.22, - "grad_norm": 24.875, - "learning_rate": 1.9195015481230198e-05, - "loss": 4.7222, + "grad_norm": 0.416015625, + "learning_rate": 0.001, + "loss": 2.7172, "step": 1935 }, { "epoch": 0.22, - "grad_norm": 26.5, - "learning_rate": 1.9187348687470395e-05, - "loss": 4.7631, + "grad_norm": 0.39453125, + "learning_rate": 0.001, + "loss": 2.6953, "step": 1940 }, { "epoch": 0.22, - "grad_norm": 25.0, - "learning_rate": 1.9179647102662674e-05, - "loss": 4.7321, + "grad_norm": 0.41796875, + "learning_rate": 0.001, + "loss": 2.6664, "step": 1945 }, { "epoch": 0.22, - "grad_norm": 28.0, - "learning_rate": 1.9171910755971724e-05, - "loss": 4.7903, + "grad_norm": 0.41796875, + "learning_rate": 0.001, + "loss": 2.6742, "step": 1950 }, { "epoch": 0.22, - "grad_norm": 32.0, - "learning_rate": 1.9164139676693888e-05, - "loss": 4.6749, + "grad_norm": 0.44140625, + "learning_rate": 0.001, + "loss": 2.7493, "step": 1955 }, { "epoch": 0.22, - "grad_norm": 35.5, - "learning_rate": 1.9156333894257018e-05, - "loss": 4.7071, + "grad_norm": 0.384765625, + "learning_rate": 0.001, + "loss": 2.7087, "step": 1960 }, { "epoch": 0.22, - "grad_norm": 44.75, - "learning_rate": 1.9148493438220385e-05, - "loss": 4.6273, + "grad_norm": 0.400390625, + "learning_rate": 0.001, + "loss": 2.7275, "step": 1965 }, { "epoch": 0.22, - "grad_norm": 37.5, - "learning_rate": 1.9140618338274568e-05, - "loss": 4.7294, + "grad_norm": 0.4296875, + "learning_rate": 0.001, + "loss": 2.7522, "step": 1970 }, { "epoch": 0.22, - "grad_norm": 27.5, - "learning_rate": 1.9132708624241336e-05, - "loss": 4.6711, + "grad_norm": 0.41796875, + "learning_rate": 0.001, + "loss": 2.7288, "step": 1975 }, { "epoch": 0.22, - "grad_norm": 28.0, - "learning_rate": 1.9124764326073534e-05, - "loss": 4.7, + "grad_norm": 0.37109375, + "learning_rate": 0.001, + "loss": 2.7241, "step": 1980 }, { "epoch": 0.22, - "grad_norm": 25.125, - "learning_rate": 1.911678547385497e-05, - "loss": 4.7425, + "grad_norm": 0.39453125, + "learning_rate": 0.001, + "loss": 2.7156, "step": 1985 }, { "epoch": 0.22, - "grad_norm": 30.375, - "learning_rate": 1.910877209780031e-05, - "loss": 4.6738, + "grad_norm": 0.435546875, + "learning_rate": 0.001, + "loss": 2.7864, "step": 1990 }, { "epoch": 0.22, - "grad_norm": 30.125, - "learning_rate": 1.9100724228254946e-05, - "loss": 4.7154, + "grad_norm": 0.46875, + "learning_rate": 0.001, + "loss": 2.7317, "step": 1995 }, { "epoch": 0.22, - "grad_norm": 27.25, - "learning_rate": 1.9092641895694905e-05, - "loss": 4.6125, + "grad_norm": 0.4453125, + "learning_rate": 0.001, + "loss": 2.6977, "step": 2000 }, { "epoch": 0.22, - "grad_norm": 29.75, - "learning_rate": 1.9084525130726703e-05, - "loss": 4.702, + "grad_norm": 0.392578125, + "learning_rate": 0.001, + "loss": 2.7876, "step": 2005 }, { "epoch": 0.22, - "grad_norm": 35.0, - "learning_rate": 1.907637396408726e-05, - "loss": 4.6051, + "grad_norm": 0.380859375, + "learning_rate": 0.001, + "loss": 2.8518, "step": 2010 }, { "epoch": 0.22, - "grad_norm": 30.125, - "learning_rate": 1.906818842664377e-05, - "loss": 4.6288, + "grad_norm": 0.39453125, + "learning_rate": 0.001, + "loss": 2.6466, "step": 2015 }, { "epoch": 0.23, - "grad_norm": 32.0, - "learning_rate": 1.9059968549393572e-05, - "loss": 4.6941, + "grad_norm": 0.384765625, + "learning_rate": 0.001, + "loss": 2.8142, "step": 2020 }, { "epoch": 0.23, - "grad_norm": 25.375, - "learning_rate": 1.9051714363464054e-05, - "loss": 4.6207, + "grad_norm": 0.40234375, + "learning_rate": 0.001, + "loss": 2.6646, "step": 2025 }, { "epoch": 0.23, - "grad_norm": 28.125, - "learning_rate": 1.904342590011252e-05, - "loss": 4.7014, + "grad_norm": 0.421875, + "learning_rate": 0.001, + "loss": 2.7533, "step": 2030 }, { "epoch": 0.23, - "grad_norm": 33.25, - "learning_rate": 1.903510319072609e-05, - "loss": 4.725, + "grad_norm": 0.384765625, + "learning_rate": 0.001, + "loss": 2.7672, "step": 2035 }, { "epoch": 0.23, - "grad_norm": 33.5, - "learning_rate": 1.9026746266821546e-05, - "loss": 4.7135, + "grad_norm": 0.37890625, + "learning_rate": 0.001, + "loss": 2.6538, "step": 2040 }, { "epoch": 0.23, - "grad_norm": 26.625, - "learning_rate": 1.9018355160045253e-05, - "loss": 4.7004, + "grad_norm": 0.357421875, + "learning_rate": 0.001, + "loss": 2.7903, "step": 2045 }, { "epoch": 0.23, - "grad_norm": 21.25, - "learning_rate": 1.9009929902173015e-05, - "loss": 4.5973, + "grad_norm": 0.3828125, + "learning_rate": 0.001, + "loss": 2.6225, "step": 2050 }, { "epoch": 0.23, - "grad_norm": 24.0, - "learning_rate": 1.9001470525109963e-05, - "loss": 4.7958, + "grad_norm": 0.369140625, + "learning_rate": 0.001, + "loss": 2.6761, "step": 2055 }, { "epoch": 0.23, - "grad_norm": 23.625, - "learning_rate": 1.8992977060890427e-05, - "loss": 4.6877, + "grad_norm": 0.40234375, + "learning_rate": 0.001, + "loss": 2.7514, "step": 2060 }, { "epoch": 0.23, - "grad_norm": 37.75, - "learning_rate": 1.8984449541677832e-05, - "loss": 4.703, + "grad_norm": 0.3984375, + "learning_rate": 0.001, + "loss": 2.7695, "step": 2065 }, { "epoch": 0.23, - "grad_norm": 34.0, - "learning_rate": 1.8975887999764545e-05, - "loss": 4.6349, + "grad_norm": 0.41015625, + "learning_rate": 0.001, + "loss": 2.6763, "step": 2070 }, { "epoch": 0.23, - "grad_norm": 24.5, - "learning_rate": 1.8967292467571787e-05, - "loss": 4.594, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 2.7686, "step": 2075 }, { "epoch": 0.23, - "grad_norm": 28.0, - "learning_rate": 1.895866297764949e-05, - "loss": 4.6723, + "grad_norm": 0.359375, + "learning_rate": 0.001, + "loss": 2.6337, "step": 2080 }, { "epoch": 0.23, - "grad_norm": 29.0, - "learning_rate": 1.8949999562676174e-05, - "loss": 4.6697, + "grad_norm": 0.42578125, + "learning_rate": 0.001, + "loss": 2.7801, "step": 2085 }, { "epoch": 0.23, - "grad_norm": 26.75, - "learning_rate": 1.894130225545884e-05, - "loss": 4.6116, + "grad_norm": 0.3671875, + "learning_rate": 0.001, + "loss": 2.7185, "step": 2090 }, { "epoch": 0.23, - "grad_norm": 29.25, - "learning_rate": 1.8932571088932818e-05, - "loss": 4.6919, + "grad_norm": 0.390625, + "learning_rate": 0.001, + "loss": 2.7413, "step": 2095 }, { "epoch": 0.23, - "grad_norm": 34.5, - "learning_rate": 1.892380609616167e-05, - "loss": 4.7391, + "grad_norm": 0.37109375, + "learning_rate": 0.001, + "loss": 2.6984, "step": 2100 }, { "epoch": 0.23, - "grad_norm": 29.0, - "learning_rate": 1.891500731033705e-05, - "loss": 4.6767, + "grad_norm": 0.39453125, + "learning_rate": 0.001, + "loss": 2.7253, "step": 2105 }, { "epoch": 0.24, - "grad_norm": 30.5, - "learning_rate": 1.890617476477857e-05, - "loss": 4.7263, + "grad_norm": 0.408203125, + "learning_rate": 0.001, + "loss": 2.7485, "step": 2110 }, { "epoch": 0.24, - "grad_norm": 31.125, - "learning_rate": 1.8897308492933705e-05, - "loss": 4.5852, + "grad_norm": 0.373046875, + "learning_rate": 0.001, + "loss": 2.6853, "step": 2115 }, { "epoch": 0.24, - "grad_norm": 34.0, - "learning_rate": 1.888840852837763e-05, - "loss": 4.6959, + "grad_norm": 0.38671875, + "learning_rate": 0.001, + "loss": 2.6931, "step": 2120 }, { "epoch": 0.24, - "grad_norm": 25.125, - "learning_rate": 1.8879474904813113e-05, - "loss": 4.6538, + "grad_norm": 0.39453125, + "learning_rate": 0.001, + "loss": 2.6173, "step": 2125 }, { "epoch": 0.24, - "grad_norm": 34.0, - "learning_rate": 1.887050765607039e-05, - "loss": 4.6064, + "grad_norm": 0.421875, + "learning_rate": 0.001, + "loss": 2.624, "step": 2130 }, { "epoch": 0.24, - "grad_norm": 24.875, - "learning_rate": 1.886150681610702e-05, - "loss": 4.6473, + "grad_norm": 0.3984375, + "learning_rate": 0.001, + "loss": 2.6629, "step": 2135 }, { "epoch": 0.24, - "grad_norm": 25.25, - "learning_rate": 1.8852472419007774e-05, - "loss": 4.7146, + "grad_norm": 0.388671875, + "learning_rate": 0.001, + "loss": 2.7097, "step": 2140 }, { "epoch": 0.24, - "grad_norm": 31.0, - "learning_rate": 1.8843404498984498e-05, - "loss": 4.5834, + "grad_norm": 0.40234375, + "learning_rate": 0.001, + "loss": 2.7286, "step": 2145 }, { "epoch": 0.24, - "grad_norm": 28.875, - "learning_rate": 1.8834303090375974e-05, - "loss": 4.6003, + "grad_norm": 0.404296875, + "learning_rate": 0.001, + "loss": 2.7264, "step": 2150 }, { "epoch": 0.24, - "grad_norm": 26.875, - "learning_rate": 1.882516822764782e-05, - "loss": 4.6232, + "grad_norm": 0.47265625, + "learning_rate": 0.001, + "loss": 2.6504, "step": 2155 }, { "epoch": 0.24, - "grad_norm": 25.75, - "learning_rate": 1.881599994539232e-05, - "loss": 4.5999, + "grad_norm": 0.392578125, + "learning_rate": 0.001, + "loss": 2.7058, "step": 2160 }, { "epoch": 0.24, - "grad_norm": 30.875, - "learning_rate": 1.880679827832832e-05, - "loss": 4.6455, + "grad_norm": 0.365234375, + "learning_rate": 0.001, + "loss": 2.7127, "step": 2165 }, { "epoch": 0.24, - "grad_norm": 33.75, - "learning_rate": 1.8797563261301095e-05, - "loss": 4.5924, + "grad_norm": 0.37109375, + "learning_rate": 0.001, + "loss": 2.6349, "step": 2170 }, { "epoch": 0.24, - "grad_norm": 36.0, - "learning_rate": 1.8788294929282202e-05, - "loss": 4.6145, + "grad_norm": 0.375, + "learning_rate": 0.001, + "loss": 2.7502, "step": 2175 }, { "epoch": 0.24, - "grad_norm": 24.875, - "learning_rate": 1.877899331736936e-05, - "loss": 4.6553, + "grad_norm": 0.388671875, + "learning_rate": 0.001, + "loss": 2.7021, "step": 2180 }, { "epoch": 0.24, - "grad_norm": 25.5, - "learning_rate": 1.8769658460786316e-05, - "loss": 4.6791, + "grad_norm": 0.470703125, + "learning_rate": 0.001, + "loss": 2.6564, "step": 2185 }, { "epoch": 0.24, - "grad_norm": 35.25, - "learning_rate": 1.87602903948827e-05, - "loss": 4.5998, + "grad_norm": 0.3828125, + "learning_rate": 0.001, + "loss": 2.6715, "step": 2190 }, { "epoch": 0.24, - "grad_norm": 31.125, - "learning_rate": 1.875088915513392e-05, - "loss": 4.6993, + "grad_norm": 0.421875, + "learning_rate": 0.001, + "loss": 2.7572, "step": 2195 }, { "epoch": 0.25, - "grad_norm": 29.375, - "learning_rate": 1.874145477714098e-05, - "loss": 4.6483, + "grad_norm": 0.38671875, + "learning_rate": 0.001, + "loss": 2.6954, "step": 2200 }, { "epoch": 0.25, - "grad_norm": 26.75, - "learning_rate": 1.8731987296630407e-05, - "loss": 4.686, + "grad_norm": 0.41015625, + "learning_rate": 0.001, + "loss": 2.7125, "step": 2205 }, { "epoch": 0.25, - "grad_norm": 26.125, - "learning_rate": 1.872248674945405e-05, - "loss": 4.6177, + "grad_norm": 0.408203125, + "learning_rate": 0.001, + "loss": 2.6772, "step": 2210 }, { "epoch": 0.25, - "grad_norm": 27.375, - "learning_rate": 1.8712953171588997e-05, - "loss": 4.6286, + "grad_norm": 0.3671875, + "learning_rate": 0.001, + "loss": 2.8311, "step": 2215 }, { "epoch": 0.25, - "grad_norm": 29.5, - "learning_rate": 1.870338659913741e-05, - "loss": 4.632, + "grad_norm": 0.4140625, + "learning_rate": 0.001, + "loss": 2.6858, "step": 2220 }, { "epoch": 0.25, - "grad_norm": 32.75, - "learning_rate": 1.86937870683264e-05, - "loss": 4.6015, + "grad_norm": 0.44140625, + "learning_rate": 0.001, + "loss": 2.6552, "step": 2225 }, { "epoch": 0.25, - "grad_norm": 32.0, - "learning_rate": 1.8684154615507876e-05, - "loss": 4.5869, + "grad_norm": 0.380859375, + "learning_rate": 0.001, + "loss": 2.7589, "step": 2230 }, { "epoch": 0.25, - "grad_norm": 29.25, - "learning_rate": 1.8674489277158435e-05, - "loss": 4.636, + "grad_norm": 0.40234375, + "learning_rate": 0.001, + "loss": 2.6547, "step": 2235 }, { "epoch": 0.25, - "grad_norm": 30.25, - "learning_rate": 1.8664791089879195e-05, - "loss": 4.6906, + "grad_norm": 0.36328125, + "learning_rate": 0.001, + "loss": 2.6727, "step": 2240 }, { "epoch": 0.25, - "grad_norm": 25.625, - "learning_rate": 1.8655060090395665e-05, - "loss": 4.6324, + "grad_norm": 0.357421875, + "learning_rate": 0.001, + "loss": 2.7218, "step": 2245 }, { "epoch": 0.25, - "grad_norm": 22.25, - "learning_rate": 1.8645296315557617e-05, - "loss": 4.5905, + "grad_norm": 0.439453125, + "learning_rate": 0.001, + "loss": 2.6825, "step": 2250 }, { "epoch": 0.25, - "grad_norm": 25.25, - "learning_rate": 1.863549980233893e-05, - "loss": 4.6482, + "grad_norm": 0.388671875, + "learning_rate": 0.001, + "loss": 2.7252, "step": 2255 }, { "epoch": 0.25, - "grad_norm": 29.5, - "learning_rate": 1.862567058783747e-05, - "loss": 4.5914, + "grad_norm": 0.40234375, + "learning_rate": 0.001, + "loss": 2.7026, "step": 2260 }, { "epoch": 0.25, - "grad_norm": 25.375, - "learning_rate": 1.861580870927492e-05, - "loss": 4.6039, + "grad_norm": 0.3828125, + "learning_rate": 0.001, + "loss": 2.7449, "step": 2265 }, { "epoch": 0.25, - "grad_norm": 26.875, - "learning_rate": 1.8605914203996677e-05, - "loss": 4.5517, + "grad_norm": 0.38671875, + "learning_rate": 0.001, + "loss": 2.5876, "step": 2270 }, { "epoch": 0.25, - "grad_norm": 21.75, - "learning_rate": 1.859598710947167e-05, - "loss": 4.6206, + "grad_norm": 0.373046875, + "learning_rate": 0.001, + "loss": 2.6584, "step": 2275 }, { "epoch": 0.25, - "grad_norm": 35.0, - "learning_rate": 1.858602746329226e-05, - "loss": 4.6942, + "grad_norm": 0.4140625, + "learning_rate": 0.001, + "loss": 2.6238, "step": 2280 }, { "epoch": 0.25, - "grad_norm": 25.875, - "learning_rate": 1.8576035303174053e-05, - "loss": 4.5425, + "grad_norm": 0.41015625, + "learning_rate": 0.001, + "loss": 2.7556, "step": 2285 }, { "epoch": 0.26, - "grad_norm": 24.5, - "learning_rate": 1.8566010666955798e-05, - "loss": 4.5764, + "grad_norm": 0.3671875, + "learning_rate": 0.001, + "loss": 2.6804, "step": 2290 }, { "epoch": 0.26, - "grad_norm": 23.375, - "learning_rate": 1.8555953592599217e-05, - "loss": 4.5512, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.6864, "step": 2295 }, { "epoch": 0.26, - "grad_norm": 29.625, - "learning_rate": 1.8545864118188875e-05, - "loss": 4.6351, + "grad_norm": 0.369140625, + "learning_rate": 0.001, + "loss": 2.5995, "step": 2300 }, { "epoch": 0.26, - "grad_norm": 24.125, - "learning_rate": 1.8535742281932024e-05, - "loss": 4.5737, + "grad_norm": 0.41796875, + "learning_rate": 0.001, + "loss": 2.6537, "step": 2305 }, { "epoch": 0.26, - "grad_norm": 25.0, - "learning_rate": 1.8525588122158473e-05, - "loss": 4.6426, + "grad_norm": 0.419921875, + "learning_rate": 0.001, + "loss": 2.6645, "step": 2310 }, { "epoch": 0.26, - "grad_norm": 31.625, - "learning_rate": 1.851540167732043e-05, - "loss": 4.6213, + "grad_norm": 0.39453125, + "learning_rate": 0.001, + "loss": 2.7136, "step": 2315 }, { "epoch": 0.26, - "grad_norm": 24.5, - "learning_rate": 1.8505182985992373e-05, - "loss": 4.647, + "grad_norm": 0.38671875, + "learning_rate": 0.001, + "loss": 2.6908, "step": 2320 }, { "epoch": 0.26, - "grad_norm": 23.5, - "learning_rate": 1.8494932086870865e-05, - "loss": 4.5992, + "grad_norm": 0.37890625, + "learning_rate": 0.001, + "loss": 2.7102, "step": 2325 }, { "epoch": 0.26, - "grad_norm": 25.625, - "learning_rate": 1.8484649018774465e-05, - "loss": 4.6298, + "grad_norm": 0.50390625, + "learning_rate": 0.001, + "loss": 2.7089, "step": 2330 }, { "epoch": 0.26, - "grad_norm": 27.0, - "learning_rate": 1.8474333820643535e-05, - "loss": 4.6113, + "grad_norm": 0.4140625, + "learning_rate": 0.001, + "loss": 2.6295, "step": 2335 }, { "epoch": 0.26, - "grad_norm": 22.875, - "learning_rate": 1.8463986531540113e-05, - "loss": 4.5707, + "grad_norm": 0.384765625, + "learning_rate": 0.001, + "loss": 2.6105, "step": 2340 }, { "epoch": 0.26, - "grad_norm": 28.625, - "learning_rate": 1.8453607190647758e-05, - "loss": 4.5548, + "grad_norm": 0.41796875, + "learning_rate": 0.001, + "loss": 2.6332, "step": 2345 }, { "epoch": 0.26, - "grad_norm": 30.375, - "learning_rate": 1.8443195837271404e-05, - "loss": 4.5454, + "grad_norm": 0.41015625, + "learning_rate": 0.001, + "loss": 2.6549, "step": 2350 }, { "epoch": 0.26, - "grad_norm": 34.5, - "learning_rate": 1.843275251083721e-05, - "loss": 4.6355, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.7258, "step": 2355 }, { "epoch": 0.26, - "grad_norm": 23.625, - "learning_rate": 1.8422277250892428e-05, - "loss": 4.5325, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 2.6932, "step": 2360 }, { "epoch": 0.26, - "grad_norm": 29.625, - "learning_rate": 1.841177009710521e-05, - "loss": 4.6371, + "grad_norm": 0.40234375, + "learning_rate": 0.001, + "loss": 2.5853, "step": 2365 }, { "epoch": 0.26, - "grad_norm": 34.5, - "learning_rate": 1.8401231089264504e-05, - "loss": 4.5994, + "grad_norm": 0.37109375, + "learning_rate": 0.001, + "loss": 2.6075, "step": 2370 }, { "epoch": 0.26, - "grad_norm": 21.5, - "learning_rate": 1.8390660267279875e-05, - "loss": 4.5155, + "grad_norm": 0.39453125, + "learning_rate": 0.001, + "loss": 2.7695, "step": 2375 }, { "epoch": 0.27, - "grad_norm": 22.25, - "learning_rate": 1.8380057671181375e-05, - "loss": 4.5878, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.675, "step": 2380 }, { "epoch": 0.27, - "grad_norm": 31.375, - "learning_rate": 1.8369423341119365e-05, - "loss": 4.5544, + "grad_norm": 0.37890625, + "learning_rate": 0.001, + "loss": 2.6556, "step": 2385 }, { "epoch": 0.27, - "grad_norm": 22.625, - "learning_rate": 1.8358757317364387e-05, - "loss": 4.6044, + "grad_norm": 0.390625, + "learning_rate": 0.001, + "loss": 2.6259, "step": 2390 }, { "epoch": 0.27, - "grad_norm": 25.375, - "learning_rate": 1.8348059640307e-05, - "loss": 4.5421, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.627, "step": 2395 }, { "epoch": 0.27, - "grad_norm": 28.875, - "learning_rate": 1.833733035045763e-05, - "loss": 4.578, + "grad_norm": 0.4453125, + "learning_rate": 0.001, + "loss": 2.6976, "step": 2400 }, { "epoch": 0.27, - "grad_norm": 29.125, - "learning_rate": 1.8326569488446408e-05, - "loss": 4.5544, + "grad_norm": 0.388671875, + "learning_rate": 0.001, + "loss": 2.6732, "step": 2405 }, { "epoch": 0.27, - "grad_norm": 32.25, - "learning_rate": 1.831577709502303e-05, - "loss": 4.5143, + "grad_norm": 0.3671875, + "learning_rate": 0.001, + "loss": 2.6387, "step": 2410 }, { "epoch": 0.27, - "grad_norm": 21.25, - "learning_rate": 1.830495321105661e-05, - "loss": 4.5148, + "grad_norm": 0.38671875, + "learning_rate": 0.001, + "loss": 2.613, "step": 2415 }, { "epoch": 0.27, - "grad_norm": 24.75, - "learning_rate": 1.8294097877535478e-05, - "loss": 4.6588, + "grad_norm": 0.37890625, + "learning_rate": 0.001, + "loss": 2.6863, "step": 2420 }, { "epoch": 0.27, - "grad_norm": 35.5, - "learning_rate": 1.8283211135567096e-05, - "loss": 4.6136, + "grad_norm": 0.384765625, + "learning_rate": 0.001, + "loss": 2.6637, "step": 2425 }, { "epoch": 0.27, - "grad_norm": 31.5, - "learning_rate": 1.8272293026377838e-05, - "loss": 4.5229, + "grad_norm": 0.40234375, + "learning_rate": 0.001, + "loss": 2.5994, "step": 2430 }, { "epoch": 0.27, - "grad_norm": 23.125, - "learning_rate": 1.8261343591312876e-05, - "loss": 4.5357, + "grad_norm": 0.376953125, + "learning_rate": 0.001, + "loss": 2.6521, "step": 2435 }, { "epoch": 0.27, - "grad_norm": 139.0, - "learning_rate": 1.8250362871835996e-05, - "loss": 4.6587, + "grad_norm": 0.423828125, + "learning_rate": 0.001, + "loss": 2.6928, "step": 2440 }, { "epoch": 0.27, - "grad_norm": 53.25, - "learning_rate": 1.8239350909529464e-05, - "loss": 4.5687, + "grad_norm": 0.408203125, + "learning_rate": 0.001, + "loss": 2.6497, "step": 2445 }, { "epoch": 0.27, - "grad_norm": 42.25, - "learning_rate": 1.8228307746093852e-05, - "loss": 4.6049, + "grad_norm": 0.412109375, + "learning_rate": 0.001, + "loss": 2.6095, "step": 2450 }, { "epoch": 0.27, - "grad_norm": 48.0, - "learning_rate": 1.8217233423347893e-05, - "loss": 4.5412, + "grad_norm": 0.37890625, + "learning_rate": 0.001, + "loss": 2.637, "step": 2455 }, { "epoch": 0.27, - "grad_norm": 60.0, - "learning_rate": 1.8206127983228302e-05, - "loss": 4.624, + "grad_norm": 0.400390625, + "learning_rate": 0.001, + "loss": 2.5598, "step": 2460 }, { "epoch": 0.27, - "grad_norm": 52.25, - "learning_rate": 1.819499146778964e-05, - "loss": 4.6308, + "grad_norm": 0.419921875, + "learning_rate": 0.001, + "loss": 2.5683, "step": 2465 }, { "epoch": 0.28, - "grad_norm": 36.75, - "learning_rate": 1.8183823919204142e-05, - "loss": 4.6034, + "grad_norm": 0.376953125, + "learning_rate": 0.001, + "loss": 2.7179, "step": 2470 }, { "epoch": 0.28, - "grad_norm": 45.75, - "learning_rate": 1.8172625379761568e-05, - "loss": 4.5788, + "grad_norm": 0.373046875, + "learning_rate": 0.001, + "loss": 2.5267, "step": 2475 }, { "epoch": 0.28, - "grad_norm": 39.75, - "learning_rate": 1.8161395891869025e-05, - "loss": 4.5703, + "grad_norm": 0.390625, + "learning_rate": 0.001, + "loss": 2.643, "step": 2480 }, { "epoch": 0.28, - "grad_norm": 25.125, - "learning_rate": 1.8150135498050826e-05, - "loss": 4.5628, + "grad_norm": 0.390625, + "learning_rate": 0.001, + "loss": 2.5632, "step": 2485 }, { "epoch": 0.28, - "grad_norm": 27.0, - "learning_rate": 1.8138844240948307e-05, - "loss": 4.5827, + "grad_norm": 0.36328125, + "learning_rate": 0.001, + "loss": 2.611, "step": 2490 }, { "epoch": 0.28, - "grad_norm": 23.25, - "learning_rate": 1.8127522163319694e-05, - "loss": 4.5527, + "grad_norm": 0.353515625, + "learning_rate": 0.001, + "loss": 2.572, "step": 2495 }, { "epoch": 0.28, - "grad_norm": 30.25, - "learning_rate": 1.811616930803992e-05, - "loss": 4.6231, + "grad_norm": 0.36328125, + "learning_rate": 0.001, + "loss": 2.6059, "step": 2500 }, { "epoch": 0.28, - "grad_norm": 30.375, - "learning_rate": 1.810478571810046e-05, - "loss": 4.5032, + "grad_norm": 0.384765625, + "learning_rate": 0.001, + "loss": 2.5368, "step": 2505 }, { "epoch": 0.28, - "grad_norm": 26.375, - "learning_rate": 1.809337143660919e-05, - "loss": 4.4816, + "grad_norm": 0.447265625, + "learning_rate": 0.001, + "loss": 2.6391, "step": 2510 }, { "epoch": 0.28, - "grad_norm": 30.375, - "learning_rate": 1.8081926506790195e-05, - "loss": 4.5926, + "grad_norm": 0.36328125, + "learning_rate": 0.001, + "loss": 2.6533, "step": 2515 }, { "epoch": 0.28, - "grad_norm": 32.75, - "learning_rate": 1.8070450971983635e-05, - "loss": 4.5054, + "grad_norm": 0.40625, + "learning_rate": 0.001, + "loss": 2.6177, "step": 2520 }, { "epoch": 0.28, - "grad_norm": 23.375, - "learning_rate": 1.805894487564556e-05, - "loss": 4.5004, + "grad_norm": 0.39453125, + "learning_rate": 0.001, + "loss": 2.5711, "step": 2525 }, { "epoch": 0.28, - "grad_norm": 33.25, - "learning_rate": 1.8047408261347745e-05, - "loss": 4.5099, + "grad_norm": 0.392578125, + "learning_rate": 0.001, + "loss": 2.6115, "step": 2530 }, { "epoch": 0.28, - "grad_norm": 23.375, - "learning_rate": 1.8035841172777543e-05, - "loss": 4.5714, + "grad_norm": 0.384765625, + "learning_rate": 0.001, + "loss": 2.674, "step": 2535 }, { "epoch": 0.28, - "grad_norm": 26.75, - "learning_rate": 1.8024243653737705e-05, - "loss": 4.5349, + "grad_norm": 0.37890625, + "learning_rate": 0.001, + "loss": 2.6826, "step": 2540 }, { "epoch": 0.28, - "grad_norm": 28.5, - "learning_rate": 1.8012615748146213e-05, - "loss": 4.5857, + "grad_norm": 0.4140625, + "learning_rate": 0.001, + "loss": 2.5226, "step": 2545 }, { "epoch": 0.28, - "grad_norm": 28.75, - "learning_rate": 1.800095750003612e-05, - "loss": 4.4982, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 2.6192, "step": 2550 }, { "epoch": 0.28, - "grad_norm": 34.75, - "learning_rate": 1.7989268953555387e-05, - "loss": 4.5348, + "grad_norm": 0.35546875, + "learning_rate": 0.001, + "loss": 2.6914, "step": 2555 }, { "epoch": 0.29, - "grad_norm": 33.75, - "learning_rate": 1.79775501529667e-05, - "loss": 4.6536, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.6802, "step": 2560 }, { "epoch": 0.29, - "grad_norm": 27.5, - "learning_rate": 1.796580114264732e-05, - "loss": 4.5878, + "grad_norm": 0.41015625, + "learning_rate": 0.001, + "loss": 2.5906, "step": 2565 }, { "epoch": 0.29, - "grad_norm": 20.375, - "learning_rate": 1.7954021967088902e-05, - "loss": 4.5389, + "grad_norm": 0.365234375, + "learning_rate": 0.001, + "loss": 2.5617, "step": 2570 }, { "epoch": 0.29, - "grad_norm": 28.875, - "learning_rate": 1.7942212670897347e-05, - "loss": 4.4892, + "grad_norm": 0.376953125, + "learning_rate": 0.001, + "loss": 2.5559, "step": 2575 }, { "epoch": 0.29, - "grad_norm": 36.0, - "learning_rate": 1.7930373298792596e-05, - "loss": 4.5471, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.5432, "step": 2580 }, { "epoch": 0.29, - "grad_norm": 21.25, - "learning_rate": 1.7918503895608497e-05, - "loss": 4.5834, + "grad_norm": 0.3984375, + "learning_rate": 0.001, + "loss": 2.6188, "step": 2585 }, { "epoch": 0.29, - "grad_norm": 24.75, - "learning_rate": 1.7906604506292616e-05, - "loss": 4.5235, + "grad_norm": 0.375, + "learning_rate": 0.001, + "loss": 2.594, "step": 2590 }, { "epoch": 0.29, - "grad_norm": 24.25, - "learning_rate": 1.7894675175906075e-05, - "loss": 4.4997, + "grad_norm": 0.359375, + "learning_rate": 0.001, + "loss": 2.6327, "step": 2595 }, { "epoch": 0.29, - "grad_norm": 31.875, - "learning_rate": 1.7882715949623377e-05, - "loss": 4.6126, + "grad_norm": 0.396484375, + "learning_rate": 0.001, + "loss": 2.6117, "step": 2600 }, { "epoch": 0.29, - "grad_norm": 25.875, - "learning_rate": 1.7870726872732234e-05, - "loss": 4.5349, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.6003, "step": 2605 }, { "epoch": 0.29, - "grad_norm": 26.25, - "learning_rate": 1.7858707990633402e-05, - "loss": 4.5452, + "grad_norm": 0.353515625, + "learning_rate": 0.001, + "loss": 2.5405, "step": 2610 }, { "epoch": 0.29, - "grad_norm": 18.75, - "learning_rate": 1.7846659348840495e-05, - "loss": 4.5724, + "grad_norm": 0.365234375, + "learning_rate": 0.001, + "loss": 2.5711, "step": 2615 }, { "epoch": 0.29, - "grad_norm": 26.625, - "learning_rate": 1.7834580992979833e-05, - "loss": 4.575, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.6618, "step": 2620 }, { "epoch": 0.29, - "grad_norm": 31.125, - "learning_rate": 1.7822472968790255e-05, - "loss": 4.5662, + "grad_norm": 0.400390625, + "learning_rate": 0.001, + "loss": 2.63, "step": 2625 }, { "epoch": 0.29, - "grad_norm": 26.375, - "learning_rate": 1.781033532212295e-05, - "loss": 4.5065, + "grad_norm": 0.404296875, + "learning_rate": 0.001, + "loss": 2.5909, "step": 2630 }, { "epoch": 0.29, - "grad_norm": 26.25, - "learning_rate": 1.779816809894128e-05, - "loss": 4.5161, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.5804, "step": 2635 }, { "epoch": 0.29, - "grad_norm": 23.0, - "learning_rate": 1.7785971345320604e-05, - "loss": 4.5769, + "grad_norm": 0.392578125, + "learning_rate": 0.001, + "loss": 2.6332, "step": 2640 }, { "epoch": 0.29, - "grad_norm": 24.25, - "learning_rate": 1.7773745107448124e-05, - "loss": 4.5774, + "grad_norm": 0.37109375, + "learning_rate": 0.001, + "loss": 2.6762, "step": 2645 }, { "epoch": 0.3, - "grad_norm": 30.375, - "learning_rate": 1.776148943162268e-05, - "loss": 4.6146, + "grad_norm": 0.390625, + "learning_rate": 0.001, + "loss": 2.5429, "step": 2650 }, { "epoch": 0.3, - "grad_norm": 21.75, - "learning_rate": 1.7749204364254593e-05, - "loss": 4.4498, + "grad_norm": 0.388671875, + "learning_rate": 0.001, + "loss": 2.6548, "step": 2655 }, { "epoch": 0.3, - "grad_norm": 20.75, - "learning_rate": 1.7736889951865488e-05, - "loss": 4.5506, + "grad_norm": 0.37890625, + "learning_rate": 0.001, + "loss": 2.5452, "step": 2660 }, { "epoch": 0.3, - "grad_norm": 29.75, - "learning_rate": 1.7724546241088106e-05, - "loss": 4.5672, + "grad_norm": 0.400390625, + "learning_rate": 0.001, + "loss": 2.6768, "step": 2665 }, { "epoch": 0.3, - "grad_norm": 25.375, - "learning_rate": 1.7712173278666154e-05, - "loss": 4.5557, + "grad_norm": 0.37890625, + "learning_rate": 0.001, + "loss": 2.6133, "step": 2670 }, { "epoch": 0.3, - "grad_norm": 26.0, - "learning_rate": 1.7699771111454086e-05, - "loss": 4.5976, + "grad_norm": 0.365234375, + "learning_rate": 0.001, + "loss": 2.6124, "step": 2675 }, { "epoch": 0.3, - "grad_norm": 26.625, - "learning_rate": 1.7687339786416975e-05, - "loss": 4.5367, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 2.5989, "step": 2680 }, { "epoch": 0.3, - "grad_norm": 26.375, - "learning_rate": 1.767487935063029e-05, - "loss": 4.5354, + "grad_norm": 0.34375, + "learning_rate": 0.001, + "loss": 2.5802, "step": 2685 }, { "epoch": 0.3, - "grad_norm": 35.5, - "learning_rate": 1.7662389851279752e-05, - "loss": 4.5963, + "grad_norm": 0.35546875, + "learning_rate": 0.001, + "loss": 2.5502, "step": 2690 }, { "epoch": 0.3, - "grad_norm": 27.875, - "learning_rate": 1.764987133566113e-05, - "loss": 4.5351, + "grad_norm": 0.416015625, + "learning_rate": 0.001, + "loss": 2.5251, "step": 2695 }, { "epoch": 0.3, - "grad_norm": 27.375, - "learning_rate": 1.7637323851180086e-05, - "loss": 4.5577, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.6167, "step": 2700 }, { "epoch": 0.3, - "grad_norm": 31.75, - "learning_rate": 1.762474744535197e-05, - "loss": 4.5742, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 2.5672, "step": 2705 }, { "epoch": 0.3, - "grad_norm": 18.125, - "learning_rate": 1.7612142165801653e-05, - "loss": 4.6541, + "grad_norm": 0.3828125, + "learning_rate": 0.001, + "loss": 2.5343, "step": 2710 }, { "epoch": 0.3, - "grad_norm": 27.125, - "learning_rate": 1.759950806026336e-05, - "loss": 4.5631, + "grad_norm": 0.357421875, + "learning_rate": 0.001, + "loss": 2.5582, "step": 2715 }, { "epoch": 0.3, - "grad_norm": 22.375, - "learning_rate": 1.758684517658046e-05, - "loss": 4.5362, + "grad_norm": 0.36328125, + "learning_rate": 0.001, + "loss": 2.5223, "step": 2720 }, { "epoch": 0.3, - "grad_norm": 23.125, - "learning_rate": 1.757415356270531e-05, - "loss": 4.5536, + "grad_norm": 0.376953125, + "learning_rate": 0.001, + "loss": 2.5679, "step": 2725 }, { "epoch": 0.3, - "grad_norm": 26.625, - "learning_rate": 1.756143326669905e-05, - "loss": 4.5698, + "grad_norm": 0.3671875, + "learning_rate": 0.001, + "loss": 2.5854, "step": 2730 }, { "epoch": 0.3, - "grad_norm": 22.25, - "learning_rate": 1.7548684336731466e-05, - "loss": 4.5265, + "grad_norm": 0.375, + "learning_rate": 0.001, + "loss": 2.5077, "step": 2735 }, { "epoch": 0.31, - "grad_norm": 24.5, - "learning_rate": 1.7535906821080738e-05, - "loss": 4.5249, + "grad_norm": 0.3671875, + "learning_rate": 0.001, + "loss": 2.5738, "step": 2740 }, { "epoch": 0.31, - "grad_norm": 23.5, - "learning_rate": 1.7523100768133317e-05, - "loss": 4.4959, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 2.5829, "step": 2745 }, { "epoch": 0.31, - "grad_norm": 24.375, - "learning_rate": 1.7510266226383722e-05, - "loss": 4.513, + "grad_norm": 0.388671875, + "learning_rate": 0.001, + "loss": 2.5914, "step": 2750 }, { "epoch": 0.31, - "grad_norm": 28.25, - "learning_rate": 1.749740324443434e-05, - "loss": 4.4716, + "grad_norm": 0.37890625, + "learning_rate": 0.001, + "loss": 2.5735, "step": 2755 }, { "epoch": 0.31, - "grad_norm": 25.625, - "learning_rate": 1.7484511870995267e-05, - "loss": 4.5822, + "grad_norm": 0.400390625, + "learning_rate": 0.001, + "loss": 2.5015, "step": 2760 }, { "epoch": 0.31, - "grad_norm": 27.125, - "learning_rate": 1.747159215488412e-05, - "loss": 4.4531, + "grad_norm": 0.37109375, + "learning_rate": 0.001, + "loss": 2.6339, "step": 2765 }, { "epoch": 0.31, - "grad_norm": 27.25, - "learning_rate": 1.7458644145025826e-05, - "loss": 4.5373, + "grad_norm": 0.396484375, + "learning_rate": 0.001, + "loss": 2.6208, "step": 2770 }, { "epoch": 0.31, - "grad_norm": 27.75, - "learning_rate": 1.7445667890452474e-05, - "loss": 4.5214, + "grad_norm": 0.373046875, + "learning_rate": 0.001, + "loss": 2.5872, "step": 2775 }, { "epoch": 0.31, - "grad_norm": 24.125, - "learning_rate": 1.74326634403031e-05, - "loss": 4.465, + "grad_norm": 0.38671875, + "learning_rate": 0.001, + "loss": 2.5484, "step": 2780 }, { "epoch": 0.31, - "grad_norm": 28.0, - "learning_rate": 1.741963084382352e-05, - "loss": 4.5572, + "grad_norm": 0.38671875, + "learning_rate": 0.001, + "loss": 2.6061, "step": 2785 }, { "epoch": 0.31, - "grad_norm": 19.25, - "learning_rate": 1.7406570150366127e-05, - "loss": 4.5029, + "grad_norm": 0.359375, + "learning_rate": 0.001, + "loss": 2.7287, "step": 2790 }, { "epoch": 0.31, - "grad_norm": 27.625, - "learning_rate": 1.739348140938972e-05, - "loss": 4.5208, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.4569, "step": 2795 }, { "epoch": 0.31, - "grad_norm": 23.5, - "learning_rate": 1.7380364670459306e-05, - "loss": 4.5184, + "grad_norm": 0.365234375, + "learning_rate": 0.001, + "loss": 2.5641, "step": 2800 }, { "epoch": 0.31, - "grad_norm": 25.125, - "learning_rate": 1.7367219983245922e-05, - "loss": 4.555, + "grad_norm": 0.357421875, + "learning_rate": 0.001, + "loss": 2.5877, "step": 2805 }, { "epoch": 0.31, - "grad_norm": 20.25, - "learning_rate": 1.7354047397526432e-05, - "loss": 4.467, + "grad_norm": 0.40234375, + "learning_rate": 0.001, + "loss": 2.5578, "step": 2810 }, { "epoch": 0.31, - "grad_norm": 22.5, - "learning_rate": 1.7340846963183354e-05, - "loss": 4.4862, + "grad_norm": 0.39453125, + "learning_rate": 0.001, + "loss": 2.5764, "step": 2815 }, { "epoch": 0.31, - "grad_norm": 28.75, - "learning_rate": 1.7327618730204657e-05, - "loss": 4.4936, + "grad_norm": 0.365234375, + "learning_rate": 0.001, + "loss": 2.5233, "step": 2820 }, { "epoch": 0.31, - "grad_norm": 24.625, - "learning_rate": 1.7314362748683583e-05, - "loss": 4.51, + "grad_norm": 0.39453125, + "learning_rate": 0.001, + "loss": 2.5845, "step": 2825 }, { "epoch": 0.32, - "grad_norm": 29.625, - "learning_rate": 1.7301079068818464e-05, - "loss": 4.4751, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.6143, "step": 2830 }, { "epoch": 0.32, - "grad_norm": 20.125, - "learning_rate": 1.7287767740912503e-05, - "loss": 4.5448, + "grad_norm": 0.38671875, + "learning_rate": 0.001, + "loss": 2.5839, "step": 2835 }, { "epoch": 0.32, - "grad_norm": 22.75, - "learning_rate": 1.727442881537361e-05, - "loss": 4.4828, + "grad_norm": 0.36328125, + "learning_rate": 0.001, + "loss": 2.5518, "step": 2840 }, { "epoch": 0.32, - "grad_norm": 29.75, - "learning_rate": 1.7261062342714202e-05, - "loss": 4.4932, + "grad_norm": 0.357421875, + "learning_rate": 0.001, + "loss": 2.5763, "step": 2845 }, { "epoch": 0.32, - "grad_norm": 20.875, - "learning_rate": 1.7247668373551023e-05, - "loss": 4.5749, + "grad_norm": 0.365234375, + "learning_rate": 0.001, + "loss": 2.5789, "step": 2850 }, { "epoch": 0.32, - "grad_norm": 24.375, - "learning_rate": 1.7234246958604924e-05, - "loss": 4.6417, + "grad_norm": 0.3984375, + "learning_rate": 0.001, + "loss": 2.6161, "step": 2855 }, { "epoch": 0.32, - "grad_norm": 29.0, - "learning_rate": 1.72207981487007e-05, - "loss": 4.4804, + "grad_norm": 0.37109375, + "learning_rate": 0.001, + "loss": 2.4979, "step": 2860 }, { "epoch": 0.32, - "grad_norm": 23.125, - "learning_rate": 1.7207321994766882e-05, - "loss": 4.5129, + "grad_norm": 0.462890625, + "learning_rate": 0.001, + "loss": 2.5239, "step": 2865 }, { "epoch": 0.32, - "grad_norm": 28.75, - "learning_rate": 1.7193818547835554e-05, - "loss": 4.5691, + "grad_norm": 0.369140625, + "learning_rate": 0.001, + "loss": 2.5278, "step": 2870 }, { "epoch": 0.32, - "grad_norm": 25.375, - "learning_rate": 1.7180287859042146e-05, - "loss": 4.5896, + "grad_norm": 0.353515625, + "learning_rate": 0.001, + "loss": 2.5839, "step": 2875 }, { "epoch": 0.32, - "grad_norm": 25.625, - "learning_rate": 1.7166729979625258e-05, - "loss": 4.5344, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 2.5451, "step": 2880 }, { "epoch": 0.32, - "grad_norm": 25.5, - "learning_rate": 1.715314496092645e-05, - "loss": 4.522, + "grad_norm": 0.369140625, + "learning_rate": 0.001, + "loss": 2.5832, "step": 2885 }, { "epoch": 0.32, - "grad_norm": 29.875, - "learning_rate": 1.7139532854390057e-05, - "loss": 4.5267, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.6994, "step": 2890 }, { "epoch": 0.32, - "grad_norm": 24.75, - "learning_rate": 1.712589371156299e-05, - "loss": 4.5198, + "grad_norm": 0.3671875, + "learning_rate": 0.001, + "loss": 2.5226, "step": 2895 }, { "epoch": 0.32, - "grad_norm": 21.0, - "learning_rate": 1.711222758409454e-05, - "loss": 4.4719, + "grad_norm": 0.36328125, + "learning_rate": 0.001, + "loss": 2.568, "step": 2900 }, { "epoch": 0.32, - "grad_norm": 23.875, - "learning_rate": 1.7098534523736194e-05, - "loss": 4.5094, + "grad_norm": 0.3984375, + "learning_rate": 0.001, + "loss": 2.5504, "step": 2905 }, { "epoch": 0.32, - "grad_norm": 28.5, - "learning_rate": 1.7084814582341417e-05, - "loss": 4.5853, + "grad_norm": 0.3984375, + "learning_rate": 0.001, + "loss": 2.5429, "step": 2910 }, { "epoch": 0.33, - "grad_norm": 25.25, - "learning_rate": 1.7071067811865477e-05, - "loss": 4.4989, + "grad_norm": 0.37890625, + "learning_rate": 0.001, + "loss": 2.5131, "step": 2915 }, { "epoch": 0.33, - "grad_norm": 25.0, - "learning_rate": 1.7057294264365238e-05, - "loss": 4.4765, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.5571, "step": 2920 }, { "epoch": 0.33, - "grad_norm": 22.5, - "learning_rate": 1.7043493991998958e-05, - "loss": 4.4977, + "grad_norm": 0.40625, + "learning_rate": 0.001, + "loss": 2.63, "step": 2925 }, { "epoch": 0.33, - "grad_norm": 21.625, - "learning_rate": 1.702966704702611e-05, - "loss": 4.4949, + "grad_norm": 0.369140625, + "learning_rate": 0.001, + "loss": 2.5114, "step": 2930 }, { "epoch": 0.33, - "grad_norm": 21.375, - "learning_rate": 1.701581348180716e-05, - "loss": 4.4798, + "grad_norm": 0.38671875, + "learning_rate": 0.001, + "loss": 2.587, "step": 2935 }, { "epoch": 0.33, - "grad_norm": 26.625, - "learning_rate": 1.7001933348803388e-05, - "loss": 4.4516, + "grad_norm": 0.37109375, + "learning_rate": 0.001, + "loss": 2.6734, "step": 2940 }, { "epoch": 0.33, - "grad_norm": 28.625, - "learning_rate": 1.698802670057668e-05, - "loss": 4.5682, + "grad_norm": 0.36328125, + "learning_rate": 0.001, + "loss": 2.6092, "step": 2945 }, { "epoch": 0.33, - "grad_norm": 21.5, - "learning_rate": 1.6974093589789327e-05, - "loss": 4.4673, + "grad_norm": 0.3671875, + "learning_rate": 0.001, + "loss": 2.4977, "step": 2950 }, { "epoch": 0.33, - "grad_norm": 25.625, - "learning_rate": 1.696013406920384e-05, - "loss": 4.5093, + "grad_norm": 0.34375, + "learning_rate": 0.001, + "loss": 2.5451, "step": 2955 }, { "epoch": 0.33, - "grad_norm": 25.5, - "learning_rate": 1.6946148191682727e-05, - "loss": 4.5398, + "grad_norm": 0.390625, + "learning_rate": 0.001, + "loss": 2.6318, "step": 2960 }, { "epoch": 0.33, - "grad_norm": 21.625, - "learning_rate": 1.6932136010188317e-05, - "loss": 4.5592, + "grad_norm": 0.3828125, + "learning_rate": 0.001, + "loss": 2.489, "step": 2965 }, { "epoch": 0.33, - "grad_norm": 31.75, - "learning_rate": 1.6918097577782537e-05, - "loss": 4.5212, + "grad_norm": 0.388671875, + "learning_rate": 0.001, + "loss": 2.4592, "step": 2970 }, { "epoch": 0.33, - "grad_norm": 20.5, - "learning_rate": 1.6904032947626733e-05, - "loss": 4.5201, + "grad_norm": 0.392578125, + "learning_rate": 0.001, + "loss": 2.5569, "step": 2975 }, { "epoch": 0.33, - "grad_norm": 27.125, - "learning_rate": 1.6889942172981445e-05, - "loss": 4.5581, + "grad_norm": 0.380859375, + "learning_rate": 0.001, + "loss": 2.4976, "step": 2980 }, { "epoch": 0.33, - "grad_norm": 29.75, - "learning_rate": 1.6875825307206236e-05, - "loss": 4.5133, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.5439, "step": 2985 }, { "epoch": 0.33, - "grad_norm": 21.625, - "learning_rate": 1.6861682403759456e-05, - "loss": 4.4606, + "grad_norm": 0.357421875, + "learning_rate": 0.001, + "loss": 2.5567, "step": 2990 }, { "epoch": 0.33, - "grad_norm": 20.375, - "learning_rate": 1.6847513516198063e-05, - "loss": 4.4428, + "grad_norm": 0.35546875, + "learning_rate": 0.001, + "loss": 2.5471, "step": 2995 }, { "epoch": 0.33, - "grad_norm": 26.0, - "learning_rate": 1.6833318698177406e-05, - "loss": 4.5224, + "grad_norm": 0.37109375, + "learning_rate": 0.001, + "loss": 2.5934, "step": 3000 }, { "epoch": 0.34, - "grad_norm": 23.75, - "learning_rate": 1.681909800345104e-05, - "loss": 4.5107, + "grad_norm": 0.353515625, + "learning_rate": 0.001, + "loss": 2.527, "step": 3005 }, { "epoch": 0.34, - "grad_norm": 27.5, - "learning_rate": 1.68048514858705e-05, - "loss": 4.5462, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 2.5382, "step": 3010 }, { "epoch": 0.34, - "grad_norm": 19.625, - "learning_rate": 1.6790579199385116e-05, - "loss": 4.5033, + "grad_norm": 0.34375, + "learning_rate": 0.001, + "loss": 2.5225, "step": 3015 }, { "epoch": 0.34, - "grad_norm": 23.25, - "learning_rate": 1.6776281198041797e-05, - "loss": 4.575, + "grad_norm": 0.373046875, + "learning_rate": 0.001, + "loss": 2.571, "step": 3020 }, { "epoch": 0.34, - "grad_norm": 24.0, - "learning_rate": 1.6761957535984826e-05, - "loss": 4.5549, + "grad_norm": 0.3671875, + "learning_rate": 0.001, + "loss": 2.5245, "step": 3025 }, { "epoch": 0.34, - "grad_norm": 21.5, - "learning_rate": 1.674760826745567e-05, - "loss": 4.5795, + "grad_norm": 0.357421875, + "learning_rate": 0.001, + "loss": 2.5326, "step": 3030 }, { "epoch": 0.34, - "grad_norm": 29.75, - "learning_rate": 1.6733233446792757e-05, - "loss": 4.4801, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.5773, "step": 3035 }, { "epoch": 0.34, - "grad_norm": 25.75, - "learning_rate": 1.6718833128431273e-05, - "loss": 4.4844, + "grad_norm": 0.34765625, + "learning_rate": 0.001, + "loss": 2.5171, "step": 3040 }, { "epoch": 0.34, - "grad_norm": 21.875, - "learning_rate": 1.6704407366902965e-05, - "loss": 4.5132, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.5531, "step": 3045 }, { "epoch": 0.34, - "grad_norm": 23.0, - "learning_rate": 1.6689956216835932e-05, - "loss": 4.5924, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.5284, "step": 3050 }, { "epoch": 0.34, - "grad_norm": 23.875, - "learning_rate": 1.6675479732954407e-05, - "loss": 4.4774, + "grad_norm": 0.369140625, + "learning_rate": 0.001, + "loss": 2.5586, "step": 3055 }, { "epoch": 0.34, - "grad_norm": 24.5, - "learning_rate": 1.666097797007857e-05, - "loss": 4.5004, + "grad_norm": 0.35546875, + "learning_rate": 0.001, + "loss": 2.5146, "step": 3060 }, { "epoch": 0.34, - "grad_norm": 24.75, - "learning_rate": 1.6646450983124315e-05, - "loss": 4.4458, + "grad_norm": 0.353515625, + "learning_rate": 0.001, + "loss": 2.5981, "step": 3065 }, { "epoch": 0.34, - "grad_norm": 20.75, - "learning_rate": 1.663189882710306e-05, - "loss": 4.6089, + "grad_norm": 0.388671875, + "learning_rate": 0.001, + "loss": 2.5523, "step": 3070 }, { "epoch": 0.34, - "grad_norm": 25.375, - "learning_rate": 1.661732155712154e-05, - "loss": 4.5065, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 2.5508, "step": 3075 }, { "epoch": 0.34, - "grad_norm": 28.75, - "learning_rate": 1.6602719228381595e-05, - "loss": 4.5444, + "grad_norm": 0.37109375, + "learning_rate": 0.001, + "loss": 2.5361, "step": 3080 }, { "epoch": 0.34, - "grad_norm": 26.25, - "learning_rate": 1.658809189617995e-05, - "loss": 4.5938, + "grad_norm": 0.396484375, + "learning_rate": 0.001, + "loss": 2.6033, "step": 3085 }, { "epoch": 0.34, - "grad_norm": 24.0, - "learning_rate": 1.657343961590801e-05, - "loss": 4.5835, + "grad_norm": 0.408203125, + "learning_rate": 0.001, + "loss": 2.6071, "step": 3090 }, { "epoch": 0.35, - "grad_norm": 22.875, - "learning_rate": 1.6558762443051666e-05, - "loss": 4.4857, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.5747, "step": 3095 }, { "epoch": 0.35, - "grad_norm": 21.25, - "learning_rate": 1.654406043319107e-05, - "loss": 4.4575, + "grad_norm": 0.353515625, + "learning_rate": 0.001, + "loss": 2.4875, "step": 3100 }, { "epoch": 0.35, - "grad_norm": 29.0, - "learning_rate": 1.6529333642000428e-05, - "loss": 4.5528, + "grad_norm": 0.36328125, + "learning_rate": 0.001, + "loss": 2.5199, "step": 3105 }, { "epoch": 0.35, - "grad_norm": 24.25, - "learning_rate": 1.6514582125247777e-05, - "loss": 4.4434, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.5829, "step": 3110 }, { "epoch": 0.35, - "grad_norm": 23.875, - "learning_rate": 1.649980593879481e-05, - "loss": 4.5009, + "grad_norm": 0.357421875, + "learning_rate": 0.001, + "loss": 2.4627, "step": 3115 }, { "epoch": 0.35, - "grad_norm": 22.125, - "learning_rate": 1.648500513859662e-05, - "loss": 4.5269, + "grad_norm": 0.375, + "learning_rate": 0.001, + "loss": 2.5592, "step": 3120 }, { "epoch": 0.35, - "grad_norm": 25.875, - "learning_rate": 1.6470179780701512e-05, - "loss": 4.4915, + "grad_norm": 0.37109375, + "learning_rate": 0.001, + "loss": 2.6057, "step": 3125 }, { "epoch": 0.35, - "grad_norm": 32.25, - "learning_rate": 1.6455329921250798e-05, - "loss": 4.4558, + "grad_norm": 0.380859375, + "learning_rate": 0.001, + "loss": 2.538, "step": 3130 }, { "epoch": 0.35, - "grad_norm": 26.25, - "learning_rate": 1.6440455616478558e-05, - "loss": 4.4909, + "grad_norm": 0.384765625, + "learning_rate": 0.001, + "loss": 2.5633, "step": 3135 }, { "epoch": 0.35, - "grad_norm": 20.625, - "learning_rate": 1.6425556922711455e-05, - "loss": 4.4819, + "grad_norm": 0.3828125, + "learning_rate": 0.001, + "loss": 2.5373, "step": 3140 }, { "epoch": 0.35, - "grad_norm": 22.75, - "learning_rate": 1.6410633896368502e-05, - "loss": 4.4865, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.4773, "step": 3145 }, { "epoch": 0.35, - "grad_norm": 24.25, - "learning_rate": 1.639568659396086e-05, - "loss": 4.3616, + "grad_norm": 0.41015625, + "learning_rate": 0.001, + "loss": 2.5214, "step": 3150 }, { "epoch": 0.35, - "grad_norm": 25.75, - "learning_rate": 1.6380715072091616e-05, - "loss": 4.4738, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.498, "step": 3155 }, { "epoch": 0.35, - "grad_norm": 27.75, - "learning_rate": 1.6365719387455578e-05, - "loss": 4.4533, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.4634, "step": 3160 }, { "epoch": 0.35, - "grad_norm": 26.625, - "learning_rate": 1.6350699596839045e-05, - "loss": 4.4487, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.4578, "step": 3165 }, { "epoch": 0.35, - "grad_norm": 17.0, - "learning_rate": 1.6335655757119614e-05, - "loss": 4.5274, + "grad_norm": 0.35546875, + "learning_rate": 0.001, + "loss": 2.5839, "step": 3170 }, { "epoch": 0.35, - "grad_norm": 24.375, - "learning_rate": 1.632058792526594e-05, - "loss": 4.6345, + "grad_norm": 0.40625, + "learning_rate": 0.001, + "loss": 2.5751, "step": 3175 }, { "epoch": 0.35, - "grad_norm": 27.625, - "learning_rate": 1.630549615833754e-05, - "loss": 4.5248, + "grad_norm": 0.357421875, + "learning_rate": 0.001, + "loss": 2.4664, "step": 3180 }, { "epoch": 0.36, - "grad_norm": 23.625, - "learning_rate": 1.629038051348457e-05, - "loss": 4.5414, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.4624, "step": 3185 }, { "epoch": 0.36, - "grad_norm": 23.375, - "learning_rate": 1.62752410479476e-05, - "loss": 4.4888, + "grad_norm": 0.42578125, + "learning_rate": 0.001, + "loss": 2.5735, "step": 3190 }, { "epoch": 0.36, - "grad_norm": 23.125, - "learning_rate": 1.6260077819057415e-05, - "loss": 4.5001, + "grad_norm": 0.3671875, + "learning_rate": 0.001, + "loss": 2.5191, "step": 3195 }, { "epoch": 0.36, - "grad_norm": 27.625, - "learning_rate": 1.624489088423478e-05, - "loss": 4.5573, + "grad_norm": 0.408203125, + "learning_rate": 0.001, + "loss": 2.4841, "step": 3200 }, { "epoch": 0.36, - "grad_norm": 26.25, - "learning_rate": 1.6229680300990237e-05, - "loss": 4.5601, + "grad_norm": 0.359375, + "learning_rate": 0.001, + "loss": 2.5304, "step": 3205 }, { "epoch": 0.36, - "grad_norm": 23.25, - "learning_rate": 1.6214446126923877e-05, - "loss": 4.4524, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 2.4461, "step": 3210 }, { "epoch": 0.36, - "grad_norm": 28.125, - "learning_rate": 1.6199188419725123e-05, - "loss": 4.4668, + "grad_norm": 0.384765625, + "learning_rate": 0.001, + "loss": 2.5183, "step": 3215 }, { "epoch": 0.36, - "grad_norm": 26.0, - "learning_rate": 1.618390723717253e-05, - "loss": 4.4078, + "grad_norm": 0.369140625, + "learning_rate": 0.001, + "loss": 2.481, "step": 3220 }, { "epoch": 0.36, - "grad_norm": 24.125, - "learning_rate": 1.6168602637133526e-05, - "loss": 4.482, + "grad_norm": 0.357421875, + "learning_rate": 0.001, + "loss": 2.6264, "step": 3225 }, { "epoch": 0.36, - "grad_norm": 22.125, - "learning_rate": 1.6153274677564235e-05, - "loss": 4.5078, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.544, "step": 3230 }, { "epoch": 0.36, - "grad_norm": 24.25, - "learning_rate": 1.6137923416509234e-05, - "loss": 4.5565, + "grad_norm": 0.36328125, + "learning_rate": 0.001, + "loss": 2.5118, "step": 3235 }, { "epoch": 0.36, - "grad_norm": 24.75, - "learning_rate": 1.6122548912101342e-05, - "loss": 4.4767, + "grad_norm": 0.365234375, + "learning_rate": 0.001, + "loss": 2.5537, "step": 3240 }, { "epoch": 0.36, - "grad_norm": 23.75, - "learning_rate": 1.6107151222561393e-05, - "loss": 4.5216, + "grad_norm": 0.3671875, + "learning_rate": 0.001, + "loss": 2.5551, "step": 3245 }, { "epoch": 0.36, - "grad_norm": 22.375, - "learning_rate": 1.609173040619802e-05, - "loss": 4.4764, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.4869, "step": 3250 }, { "epoch": 0.36, - "grad_norm": 24.0, - "learning_rate": 1.6076286521407437e-05, - "loss": 4.4542, + "grad_norm": 0.36328125, + "learning_rate": 0.001, + "loss": 2.5788, "step": 3255 }, { "epoch": 0.36, - "grad_norm": 20.75, - "learning_rate": 1.606081962667321e-05, - "loss": 4.5151, + "grad_norm": 0.37890625, + "learning_rate": 0.001, + "loss": 2.5389, "step": 3260 }, { "epoch": 0.36, - "grad_norm": 27.125, - "learning_rate": 1.6045329780566045e-05, - "loss": 4.5295, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 2.5014, "step": 3265 }, { "epoch": 0.36, - "grad_norm": 27.875, - "learning_rate": 1.602981704174356e-05, - "loss": 4.5136, + "grad_norm": 0.376953125, + "learning_rate": 0.001, + "loss": 2.5224, "step": 3270 }, { "epoch": 0.37, - "grad_norm": 24.375, - "learning_rate": 1.601428146895006e-05, - "loss": 4.4938, + "grad_norm": 0.357421875, + "learning_rate": 0.001, + "loss": 2.4611, "step": 3275 }, { "epoch": 0.37, - "grad_norm": 29.625, - "learning_rate": 1.599872312101632e-05, - "loss": 4.5055, + "grad_norm": 0.37109375, + "learning_rate": 0.001, + "loss": 2.5188, "step": 3280 }, { "epoch": 0.37, - "grad_norm": 22.375, - "learning_rate": 1.5983142056859368e-05, - "loss": 4.5855, + "grad_norm": 0.3671875, + "learning_rate": 0.001, + "loss": 2.5296, "step": 3285 }, { "epoch": 0.37, - "grad_norm": 26.375, - "learning_rate": 1.5967538335482245e-05, - "loss": 4.4816, + "grad_norm": 0.36328125, + "learning_rate": 0.001, + "loss": 2.5364, "step": 3290 }, { "epoch": 0.37, - "grad_norm": 23.75, - "learning_rate": 1.5951912015973796e-05, - "loss": 4.4574, + "grad_norm": 0.359375, + "learning_rate": 0.001, + "loss": 2.5207, "step": 3295 }, { "epoch": 0.37, - "grad_norm": 23.625, - "learning_rate": 1.5936263157508444e-05, - "loss": 4.4453, + "grad_norm": 0.3671875, + "learning_rate": 0.001, + "loss": 2.5104, "step": 3300 }, { "epoch": 0.37, - "grad_norm": 24.625, - "learning_rate": 1.5920591819345954e-05, - "loss": 4.4775, + "grad_norm": 0.376953125, + "learning_rate": 0.001, + "loss": 2.513, "step": 3305 }, { "epoch": 0.37, - "grad_norm": 24.0, - "learning_rate": 1.590489806083123e-05, - "loss": 4.4958, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.523, "step": 3310 }, { "epoch": 0.37, - "grad_norm": 32.25, - "learning_rate": 1.5889181941394073e-05, - "loss": 4.4361, + "grad_norm": 0.38671875, + "learning_rate": 0.001, + "loss": 2.4677, "step": 3315 }, { "epoch": 0.37, - "grad_norm": 22.875, - "learning_rate": 1.587344352054896e-05, - "loss": 4.477, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.528, "step": 3320 }, { "epoch": 0.37, - "grad_norm": 30.25, - "learning_rate": 1.5857682857894823e-05, - "loss": 4.465, + "grad_norm": 0.373046875, + "learning_rate": 0.001, + "loss": 2.5352, "step": 3325 }, { "epoch": 0.37, - "grad_norm": 21.375, - "learning_rate": 1.5841900013114816e-05, - "loss": 4.5173, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 2.4736, "step": 3330 }, { "epoch": 0.37, - "grad_norm": 21.75, - "learning_rate": 1.5826095045976095e-05, - "loss": 4.4933, + "grad_norm": 0.376953125, + "learning_rate": 0.001, + "loss": 2.5234, "step": 3335 }, { "epoch": 0.37, - "grad_norm": 24.125, - "learning_rate": 1.581026801632959e-05, - "loss": 4.4565, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.4768, "step": 3340 }, { "epoch": 0.37, - "grad_norm": 26.625, - "learning_rate": 1.579441898410979e-05, - "loss": 4.4812, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.5709, "step": 3345 }, { "epoch": 0.37, - "grad_norm": 23.0, - "learning_rate": 1.577854800933448e-05, - "loss": 4.4552, + "grad_norm": 0.36328125, + "learning_rate": 0.001, + "loss": 2.4996, "step": 3350 }, { "epoch": 0.37, - "grad_norm": 59.25, - "learning_rate": 1.576265515210455e-05, - "loss": 4.4522, + "grad_norm": 0.36328125, + "learning_rate": 0.001, + "loss": 2.4553, "step": 3355 }, { "epoch": 0.37, - "grad_norm": 38.5, - "learning_rate": 1.574674047260377e-05, - "loss": 4.4766, + "grad_norm": 0.376953125, + "learning_rate": 0.001, + "loss": 2.5061, "step": 3360 }, { "epoch": 0.38, - "grad_norm": 32.0, - "learning_rate": 1.5730804031098523e-05, - "loss": 4.4535, + "grad_norm": 0.388671875, + "learning_rate": 0.001, + "loss": 2.4686, "step": 3365 }, { "epoch": 0.38, - "grad_norm": 29.375, - "learning_rate": 1.5714845887937615e-05, - "loss": 4.5141, + "grad_norm": 0.34765625, + "learning_rate": 0.001, + "loss": 2.5281, "step": 3370 }, { "epoch": 0.38, - "grad_norm": 24.625, - "learning_rate": 1.5698866103552034e-05, - "loss": 4.5565, + "grad_norm": 0.35546875, + "learning_rate": 0.001, + "loss": 2.5412, "step": 3375 }, { "epoch": 0.38, - "grad_norm": 29.625, - "learning_rate": 1.568286473845471e-05, - "loss": 4.4602, + "grad_norm": 0.3671875, + "learning_rate": 0.001, + "loss": 2.4708, "step": 3380 }, { "epoch": 0.38, - "grad_norm": 19.375, - "learning_rate": 1.5666841853240306e-05, - "loss": 4.4419, + "grad_norm": 0.36328125, + "learning_rate": 0.001, + "loss": 2.4491, "step": 3385 }, { "epoch": 0.38, - "grad_norm": 23.5, - "learning_rate": 1.5650797508584973e-05, - "loss": 4.5953, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.4432, "step": 3390 }, { "epoch": 0.38, - "grad_norm": 25.0, - "learning_rate": 1.5634731765246125e-05, - "loss": 4.5052, + "grad_norm": 0.37890625, + "learning_rate": 0.001, + "loss": 2.5601, "step": 3395 }, { "epoch": 0.38, - "grad_norm": 25.375, - "learning_rate": 1.561864468406222e-05, - "loss": 4.4809, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.4834, "step": 3400 }, { "epoch": 0.38, - "grad_norm": 24.125, - "learning_rate": 1.5602536325952495e-05, - "loss": 4.4567, + "grad_norm": 0.37109375, + "learning_rate": 0.001, + "loss": 2.5377, "step": 3405 }, { "epoch": 0.38, - "grad_norm": 20.75, - "learning_rate": 1.558640675191679e-05, - "loss": 4.5108, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 2.4399, "step": 3410 }, { "epoch": 0.38, - "grad_norm": 22.25, - "learning_rate": 1.557025602303526e-05, - "loss": 4.5665, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.4336, "step": 3415 }, { "epoch": 0.38, - "grad_norm": 23.125, - "learning_rate": 1.5554084200468186e-05, - "loss": 4.4813, + "grad_norm": 0.39453125, + "learning_rate": 0.001, + "loss": 2.5348, "step": 3420 }, { "epoch": 0.38, - "grad_norm": 21.0, - "learning_rate": 1.553789134545572e-05, - "loss": 4.5685, + "grad_norm": 0.34375, + "learning_rate": 0.001, + "loss": 2.4751, "step": 3425 }, { "epoch": 0.38, - "grad_norm": 29.125, - "learning_rate": 1.552167751931767e-05, - "loss": 4.4496, + "grad_norm": 0.39453125, + "learning_rate": 0.001, + "loss": 2.4556, "step": 3430 }, { "epoch": 0.38, - "grad_norm": 23.0, - "learning_rate": 1.5505442783453242e-05, - "loss": 4.4157, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.426, "step": 3435 }, { "epoch": 0.38, - "grad_norm": 20.125, - "learning_rate": 1.548918719934084e-05, - "loss": 4.4678, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.4736, "step": 3440 }, { "epoch": 0.38, - "grad_norm": 27.5, - "learning_rate": 1.547291082853781e-05, - "loss": 4.4592, + "grad_norm": 0.359375, + "learning_rate": 0.001, + "loss": 2.4077, "step": 3445 }, { "epoch": 0.38, - "grad_norm": 25.375, - "learning_rate": 1.5456613732680205e-05, - "loss": 4.4839, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 2.5495, "step": 3450 }, { "epoch": 0.39, - "grad_norm": 28.0, - "learning_rate": 1.5440295973482584e-05, - "loss": 4.5114, + "grad_norm": 0.380859375, + "learning_rate": 0.001, + "loss": 2.4385, "step": 3455 }, { "epoch": 0.39, - "grad_norm": 20.25, - "learning_rate": 1.5423957612737734e-05, - "loss": 4.4257, + "grad_norm": 0.369140625, + "learning_rate": 0.001, + "loss": 2.5411, "step": 3460 }, { "epoch": 0.39, - "grad_norm": 27.0, - "learning_rate": 1.540759871231646e-05, - "loss": 4.4857, + "grad_norm": 0.34765625, + "learning_rate": 0.001, + "loss": 2.4391, "step": 3465 }, { "epoch": 0.39, - "grad_norm": 18.75, - "learning_rate": 1.5391219334167357e-05, - "loss": 4.4471, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.5512, "step": 3470 }, { "epoch": 0.39, - "grad_norm": 27.125, - "learning_rate": 1.537481954031656e-05, - "loss": 4.5025, + "grad_norm": 0.34375, + "learning_rate": 0.001, + "loss": 2.5375, "step": 3475 }, { "epoch": 0.39, - "grad_norm": 21.125, - "learning_rate": 1.535839939286751e-05, - "loss": 4.4778, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.4875, "step": 3480 }, { "epoch": 0.39, - "grad_norm": 19.125, - "learning_rate": 1.534195895400073e-05, - "loss": 4.4577, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.5244, "step": 3485 }, { "epoch": 0.39, - "grad_norm": 21.0, - "learning_rate": 1.532549828597359e-05, - "loss": 4.5316, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.5694, "step": 3490 }, { "epoch": 0.39, - "grad_norm": 21.5, - "learning_rate": 1.5309017451120046e-05, - "loss": 4.5672, + "grad_norm": 0.357421875, + "learning_rate": 0.001, + "loss": 2.5128, "step": 3495 }, { "epoch": 0.39, - "grad_norm": 23.375, - "learning_rate": 1.5292516511850443e-05, - "loss": 4.488, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 2.4621, "step": 3500 }, { "epoch": 0.39, - "grad_norm": 24.375, - "learning_rate": 1.527599553065124e-05, - "loss": 4.5594, + "grad_norm": 0.353515625, + "learning_rate": 0.001, + "loss": 2.3882, "step": 3505 }, { "epoch": 0.39, - "grad_norm": 25.5, - "learning_rate": 1.5259454570084808e-05, - "loss": 4.4813, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.4457, "step": 3510 }, { "epoch": 0.39, - "grad_norm": 19.875, - "learning_rate": 1.5242893692789164e-05, - "loss": 4.5288, + "grad_norm": 0.36328125, + "learning_rate": 0.001, + "loss": 2.6129, "step": 3515 }, { "epoch": 0.39, - "grad_norm": 23.875, - "learning_rate": 1.5226312961477756e-05, - "loss": 4.4874, + "grad_norm": 0.36328125, + "learning_rate": 0.001, + "loss": 2.4673, "step": 3520 }, { "epoch": 0.39, - "grad_norm": 22.375, - "learning_rate": 1.5209712438939211e-05, - "loss": 4.4649, + "grad_norm": 0.34375, + "learning_rate": 0.001, + "loss": 2.4322, "step": 3525 }, { "epoch": 0.39, - "grad_norm": 22.625, - "learning_rate": 1.5193092188037101e-05, - "loss": 4.501, + "grad_norm": 0.357421875, + "learning_rate": 0.001, + "loss": 2.4767, "step": 3530 }, { "epoch": 0.39, - "grad_norm": 26.0, - "learning_rate": 1.5176452271709709e-05, - "loss": 4.4946, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.5015, "step": 3535 }, { "epoch": 0.39, - "grad_norm": 26.25, - "learning_rate": 1.515979275296979e-05, - "loss": 4.5384, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.4496, "step": 3540 }, { "epoch": 0.4, - "grad_norm": 23.625, - "learning_rate": 1.5143113694904322e-05, - "loss": 4.5096, + "grad_norm": 0.35546875, + "learning_rate": 0.001, + "loss": 2.4416, "step": 3545 }, { "epoch": 0.4, - "grad_norm": 24.25, - "learning_rate": 1.5126415160674285e-05, - "loss": 4.4865, + "grad_norm": 0.37890625, + "learning_rate": 0.001, + "loss": 2.4924, "step": 3550 }, { "epoch": 0.4, - "grad_norm": 22.75, - "learning_rate": 1.5109697213514408e-05, - "loss": 4.4905, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.444, "step": 3555 }, { "epoch": 0.4, - "grad_norm": 20.25, - "learning_rate": 1.5092959916732933e-05, - "loss": 4.4451, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.5554, "step": 3560 }, { "epoch": 0.4, - "grad_norm": 21.125, - "learning_rate": 1.5076203333711377e-05, - "loss": 4.5032, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.5222, "step": 3565 }, { "epoch": 0.4, - "grad_norm": 23.875, - "learning_rate": 1.5059427527904285e-05, - "loss": 4.4619, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.4904, "step": 3570 }, { "epoch": 0.4, - "grad_norm": 20.625, - "learning_rate": 1.5042632562839012e-05, - "loss": 4.4372, + "grad_norm": 0.369140625, + "learning_rate": 0.001, + "loss": 2.51, "step": 3575 }, { "epoch": 0.4, - "grad_norm": 25.75, - "learning_rate": 1.5025818502115448e-05, - "loss": 4.4426, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.491, "step": 3580 }, { "epoch": 0.4, - "grad_norm": 24.5, - "learning_rate": 1.5008985409405808e-05, - "loss": 4.439, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.5569, "step": 3585 }, { "epoch": 0.4, - "grad_norm": 20.5, - "learning_rate": 1.4992133348454362e-05, - "loss": 4.4655, + "grad_norm": 0.357421875, + "learning_rate": 0.001, + "loss": 2.4702, "step": 3590 }, { "epoch": 0.4, - "grad_norm": 28.5, - "learning_rate": 1.4975262383077237e-05, - "loss": 4.4972, + "grad_norm": 0.353515625, + "learning_rate": 0.001, + "loss": 2.4147, "step": 3595 }, { "epoch": 0.4, - "grad_norm": 20.25, - "learning_rate": 1.4958372577162115e-05, - "loss": 4.5028, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.5733, "step": 3600 }, { "epoch": 0.4, - "grad_norm": 21.375, - "learning_rate": 1.4941463994668054e-05, - "loss": 4.4726, + "grad_norm": 0.359375, + "learning_rate": 0.001, + "loss": 2.4938, "step": 3605 }, { "epoch": 0.4, - "grad_norm": 27.375, - "learning_rate": 1.4924536699625194e-05, - "loss": 4.4476, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.4076, "step": 3610 }, { "epoch": 0.4, - "grad_norm": 23.875, - "learning_rate": 1.4907590756134549e-05, - "loss": 4.486, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.4868, "step": 3615 }, { "epoch": 0.4, - "grad_norm": 23.625, - "learning_rate": 1.489062622836775e-05, - "loss": 4.4966, + "grad_norm": 0.36328125, + "learning_rate": 0.001, + "loss": 2.4626, "step": 3620 }, { "epoch": 0.4, - "grad_norm": 23.125, - "learning_rate": 1.48736431805668e-05, - "loss": 4.426, + "grad_norm": 0.353515625, + "learning_rate": 0.001, + "loss": 2.5284, "step": 3625 }, { "epoch": 0.4, - "grad_norm": 25.25, - "learning_rate": 1.4856641677043834e-05, - "loss": 4.5031, + "grad_norm": 0.357421875, + "learning_rate": 0.001, + "loss": 2.4889, "step": 3630 }, { "epoch": 0.41, - "grad_norm": 19.25, - "learning_rate": 1.483962178218088e-05, - "loss": 4.5073, + "grad_norm": 0.3828125, + "learning_rate": 0.001, + "loss": 2.4571, "step": 3635 }, { "epoch": 0.41, - "grad_norm": 24.875, - "learning_rate": 1.4822583560429612e-05, - "loss": 4.4749, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.5282, "step": 3640 }, { "epoch": 0.41, - "grad_norm": 22.0, - "learning_rate": 1.4805527076311097e-05, - "loss": 4.4949, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.5391, "step": 3645 }, { "epoch": 0.41, - "grad_norm": 21.125, - "learning_rate": 1.4788452394415567e-05, - "loss": 4.4627, + "grad_norm": 0.388671875, + "learning_rate": 0.001, + "loss": 2.5301, "step": 3650 }, { "epoch": 0.41, - "grad_norm": 25.125, - "learning_rate": 1.4771359579402163e-05, - "loss": 4.5026, + "grad_norm": 0.369140625, + "learning_rate": 0.001, + "loss": 2.4556, "step": 3655 }, { "epoch": 0.41, - "grad_norm": 29.625, - "learning_rate": 1.4754248695998694e-05, - "loss": 4.4731, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.4623, "step": 3660 }, { "epoch": 0.41, - "grad_norm": 22.75, - "learning_rate": 1.473711980900139e-05, - "loss": 4.5352, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.5289, "step": 3665 }, { "epoch": 0.41, - "grad_norm": 23.5, - "learning_rate": 1.471997298327466e-05, - "loss": 4.5059, + "grad_norm": 0.373046875, + "learning_rate": 0.001, + "loss": 2.4913, "step": 3670 }, { "epoch": 0.41, - "grad_norm": 25.5, - "learning_rate": 1.470280828375084e-05, - "loss": 4.5068, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.4986, "step": 3675 }, { "epoch": 0.41, - "grad_norm": 18.25, - "learning_rate": 1.4685625775429953e-05, - "loss": 4.4399, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.5109, "step": 3680 }, { "epoch": 0.41, - "grad_norm": 25.75, - "learning_rate": 1.4668425523379466e-05, - "loss": 4.5205, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.5675, "step": 3685 }, { "epoch": 0.41, - "grad_norm": 19.875, - "learning_rate": 1.4651207592734031e-05, - "loss": 4.3715, + "grad_norm": 0.404296875, + "learning_rate": 0.001, + "loss": 2.5055, "step": 3690 }, { "epoch": 0.41, - "grad_norm": 24.375, - "learning_rate": 1.4633972048695253e-05, - "loss": 4.5203, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 2.4677, "step": 3695 }, { "epoch": 0.41, - "grad_norm": 21.625, - "learning_rate": 1.4616718956531428e-05, - "loss": 4.464, + "grad_norm": 0.357421875, + "learning_rate": 0.001, + "loss": 2.5347, "step": 3700 }, { "epoch": 0.41, - "grad_norm": 20.875, - "learning_rate": 1.4599448381577315e-05, - "loss": 4.4959, + "grad_norm": 0.4140625, + "learning_rate": 0.001, + "loss": 2.4168, "step": 3705 }, { "epoch": 0.41, - "grad_norm": 25.25, - "learning_rate": 1.4582160389233865e-05, - "loss": 4.5098, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.4156, "step": 3710 }, { "epoch": 0.41, - "grad_norm": 25.875, - "learning_rate": 1.4564855044968e-05, - "loss": 4.4954, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.5117, "step": 3715 }, { "epoch": 0.41, - "grad_norm": 23.25, - "learning_rate": 1.4547532414312338e-05, - "loss": 4.5187, + "grad_norm": 0.34765625, + "learning_rate": 0.001, + "loss": 2.5008, "step": 3720 }, { "epoch": 0.42, - "grad_norm": 23.25, - "learning_rate": 1.4530192562864964e-05, - "loss": 4.4988, + "grad_norm": 0.35546875, + "learning_rate": 0.001, + "loss": 2.5145, "step": 3725 }, { "epoch": 0.42, - "grad_norm": 20.0, - "learning_rate": 1.4512835556289176e-05, - "loss": 4.5878, + "grad_norm": 0.357421875, + "learning_rate": 0.001, + "loss": 2.4302, "step": 3730 }, { "epoch": 0.42, - "grad_norm": 24.375, - "learning_rate": 1.4495461460313232e-05, - "loss": 4.4954, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.5553, "step": 3735 }, { "epoch": 0.42, - "grad_norm": 24.75, - "learning_rate": 1.447807034073011e-05, - "loss": 4.4944, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.4049, "step": 3740 }, { "epoch": 0.42, - "grad_norm": 21.625, - "learning_rate": 1.446066226339725e-05, - "loss": 4.5226, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.4319, "step": 3745 }, { "epoch": 0.42, - "grad_norm": 23.25, - "learning_rate": 1.4443237294236312e-05, - "loss": 4.4849, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.5425, "step": 3750 }, { "epoch": 0.42, - "grad_norm": 23.125, - "learning_rate": 1.4425795499232913e-05, - "loss": 4.4273, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.5042, "step": 3755 }, { "epoch": 0.42, - "grad_norm": 22.875, - "learning_rate": 1.44083369444364e-05, - "loss": 4.5185, + "grad_norm": 0.34765625, + "learning_rate": 0.001, + "loss": 2.4445, "step": 3760 }, { "epoch": 0.42, - "grad_norm": 23.375, - "learning_rate": 1.4390861695959581e-05, - "loss": 4.5165, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.4256, "step": 3765 }, { "epoch": 0.42, - "grad_norm": 19.25, - "learning_rate": 1.4373369819978478e-05, - "loss": 4.4712, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.4933, "step": 3770 }, { "epoch": 0.42, - "grad_norm": 19.25, - "learning_rate": 1.4355861382732079e-05, - "loss": 4.5457, + "grad_norm": 0.37109375, + "learning_rate": 0.001, + "loss": 2.4679, "step": 3775 }, { "epoch": 0.42, - "grad_norm": 24.5, - "learning_rate": 1.433833645052209e-05, - "loss": 4.5127, + "grad_norm": 0.357421875, + "learning_rate": 0.001, + "loss": 2.4482, "step": 3780 }, { "epoch": 0.42, - "grad_norm": 21.375, - "learning_rate": 1.4320795089712683e-05, - "loss": 4.4277, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.4219, "step": 3785 }, { "epoch": 0.42, - "grad_norm": 23.125, - "learning_rate": 1.4303237366730231e-05, - "loss": 4.4787, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.5463, "step": 3790 }, { "epoch": 0.42, - "grad_norm": 21.5, - "learning_rate": 1.4285663348063085e-05, - "loss": 4.4767, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 2.4565, "step": 3795 }, { "epoch": 0.42, - "grad_norm": 17.375, - "learning_rate": 1.4268073100261292e-05, - "loss": 4.4226, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.4338, "step": 3800 }, { "epoch": 0.42, - "grad_norm": 21.375, - "learning_rate": 1.4250466689936363e-05, - "loss": 4.5254, + "grad_norm": 0.34375, + "learning_rate": 0.001, + "loss": 2.4545, "step": 3805 }, { "epoch": 0.42, - "grad_norm": 22.375, - "learning_rate": 1.4232844183761003e-05, - "loss": 4.436, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.4431, "step": 3810 }, { "epoch": 0.43, - "grad_norm": 24.5, - "learning_rate": 1.4215205648468888e-05, - "loss": 4.465, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.5105, "step": 3815 }, { "epoch": 0.43, - "grad_norm": 25.125, - "learning_rate": 1.4197551150854376e-05, - "loss": 4.4651, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 2.4994, "step": 3820 }, { "epoch": 0.43, - "grad_norm": 20.75, - "learning_rate": 1.4179880757772285e-05, - "loss": 4.4632, + "grad_norm": 0.373046875, + "learning_rate": 0.001, + "loss": 2.4036, "step": 3825 }, { "epoch": 0.43, - "grad_norm": 24.25, - "learning_rate": 1.4162194536137618e-05, - "loss": 4.4024, + "grad_norm": 0.365234375, + "learning_rate": 0.001, + "loss": 2.3943, "step": 3830 }, { "epoch": 0.43, - "grad_norm": 23.0, - "learning_rate": 1.4144492552925322e-05, - "loss": 4.5479, + "grad_norm": 0.34375, + "learning_rate": 0.001, + "loss": 2.4016, "step": 3835 }, { "epoch": 0.43, - "grad_norm": 21.0, - "learning_rate": 1.412677487517003e-05, - "loss": 4.5043, + "grad_norm": 0.3671875, + "learning_rate": 0.001, + "loss": 2.3733, "step": 3840 }, { "epoch": 0.43, - "grad_norm": 26.5, - "learning_rate": 1.4109041569965808e-05, - "loss": 4.504, + "grad_norm": 0.373046875, + "learning_rate": 0.001, + "loss": 2.4746, "step": 3845 }, { "epoch": 0.43, - "grad_norm": 26.5, - "learning_rate": 1.4091292704465898e-05, - "loss": 4.5351, + "grad_norm": 0.3671875, + "learning_rate": 0.001, + "loss": 2.4641, "step": 3850 }, { "epoch": 0.43, - "grad_norm": 21.375, - "learning_rate": 1.4073528345882468e-05, - "loss": 4.5544, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.4163, "step": 3855 }, { "epoch": 0.43, - "grad_norm": 20.25, - "learning_rate": 1.4055748561486361e-05, - "loss": 4.5277, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.5641, "step": 3860 }, { "epoch": 0.43, - "grad_norm": 21.0, - "learning_rate": 1.4037953418606832e-05, - "loss": 4.4184, + "grad_norm": 0.365234375, + "learning_rate": 0.001, + "loss": 2.4459, "step": 3865 }, { "epoch": 0.43, - "grad_norm": 22.0, - "learning_rate": 1.402014298463129e-05, - "loss": 4.5431, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.48, "step": 3870 }, { "epoch": 0.43, - "grad_norm": 27.375, - "learning_rate": 1.4002317327005051e-05, - "loss": 4.5264, + "grad_norm": 0.384765625, + "learning_rate": 0.001, + "loss": 2.3711, "step": 3875 }, { "epoch": 0.43, - "grad_norm": 18.375, - "learning_rate": 1.3984476513231089e-05, - "loss": 4.5365, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.4004, "step": 3880 }, { "epoch": 0.43, - "grad_norm": 22.375, - "learning_rate": 1.396662061086977e-05, - "loss": 4.5621, + "grad_norm": 0.34375, + "learning_rate": 0.001, + "loss": 2.3915, "step": 3885 }, { "epoch": 0.43, - "grad_norm": 23.75, - "learning_rate": 1.3948749687538585e-05, - "loss": 4.4408, + "grad_norm": 0.3828125, + "learning_rate": 0.001, + "loss": 2.4762, "step": 3890 }, { "epoch": 0.43, - "grad_norm": 26.625, - "learning_rate": 1.3930863810911925e-05, - "loss": 4.4805, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.4994, "step": 3895 }, { "epoch": 0.43, - "grad_norm": 20.75, - "learning_rate": 1.391296304872079e-05, - "loss": 4.4842, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 2.4486, "step": 3900 }, { "epoch": 0.44, - "grad_norm": 25.125, - "learning_rate": 1.389504746875257e-05, - "loss": 4.5349, + "grad_norm": 0.376953125, + "learning_rate": 0.001, + "loss": 2.4192, "step": 3905 }, { "epoch": 0.44, - "grad_norm": 26.375, - "learning_rate": 1.387711713885074e-05, - "loss": 4.4424, + "grad_norm": 0.357421875, + "learning_rate": 0.001, + "loss": 2.4359, "step": 3910 }, { "epoch": 0.44, - "grad_norm": 23.75, - "learning_rate": 1.3859172126914658e-05, - "loss": 4.4697, + "grad_norm": 0.353515625, + "learning_rate": 0.001, + "loss": 2.4987, "step": 3915 }, { "epoch": 0.44, - "grad_norm": 22.125, - "learning_rate": 1.384121250089926e-05, - "loss": 4.5127, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.494, "step": 3920 }, { "epoch": 0.44, - "grad_norm": 22.25, - "learning_rate": 1.3823238328814838e-05, - "loss": 4.5203, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.4081, "step": 3925 }, { "epoch": 0.44, - "grad_norm": 22.5, - "learning_rate": 1.3805249678726762e-05, - "loss": 4.5191, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 2.4449, "step": 3930 }, { "epoch": 0.44, - "grad_norm": 23.125, - "learning_rate": 1.3787246618755222e-05, - "loss": 4.4247, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.4008, "step": 3935 }, { "epoch": 0.44, - "grad_norm": 23.125, - "learning_rate": 1.3769229217074984e-05, - "loss": 4.4506, + "grad_norm": 0.357421875, + "learning_rate": 0.001, + "loss": 2.4596, "step": 3940 }, { "epoch": 0.44, - "grad_norm": 22.875, - "learning_rate": 1.375119754191512e-05, - "loss": 4.4483, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.4376, "step": 3945 }, { "epoch": 0.44, - "grad_norm": 19.625, - "learning_rate": 1.3733151661558756e-05, - "loss": 4.3467, + "grad_norm": 0.34375, + "learning_rate": 0.001, + "loss": 2.4765, "step": 3950 }, { "epoch": 0.44, - "grad_norm": 24.75, - "learning_rate": 1.3715091644342809e-05, - "loss": 4.4866, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.4726, "step": 3955 }, { "epoch": 0.44, - "grad_norm": 19.125, - "learning_rate": 1.3697017558657732e-05, - "loss": 4.4714, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.4462, "step": 3960 }, { "epoch": 0.44, - "grad_norm": 20.375, - "learning_rate": 1.3678929472947246e-05, - "loss": 4.428, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.3776, "step": 3965 }, { "epoch": 0.44, - "grad_norm": 22.25, - "learning_rate": 1.3660827455708098e-05, - "loss": 4.4559, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.3864, "step": 3970 }, { "epoch": 0.44, - "grad_norm": 25.375, - "learning_rate": 1.364271157548979e-05, - "loss": 4.5765, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.4396, "step": 3975 }, { "epoch": 0.44, - "grad_norm": 21.625, - "learning_rate": 1.362458190089431e-05, - "loss": 4.5804, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.5275, "step": 3980 }, { "epoch": 0.44, - "grad_norm": 24.625, - "learning_rate": 1.3606438500575899e-05, - "loss": 4.5222, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.5021, "step": 3985 }, { "epoch": 0.44, - "grad_norm": 20.375, - "learning_rate": 1.3588281443240759e-05, - "loss": 4.4897, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.5053, "step": 3990 }, { "epoch": 0.45, - "grad_norm": 21.375, - "learning_rate": 1.3570110797646825e-05, - "loss": 4.5199, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.4067, "step": 3995 }, { "epoch": 0.45, - "grad_norm": 24.0, - "learning_rate": 1.3551926632603474e-05, - "loss": 4.4303, + "grad_norm": 0.35546875, + "learning_rate": 0.001, + "loss": 2.3718, "step": 4000 }, { "epoch": 0.45, - "grad_norm": 21.875, - "learning_rate": 1.3533729016971291e-05, - "loss": 4.4726, + "grad_norm": 0.3671875, + "learning_rate": 0.001, + "loss": 2.396, "step": 4005 }, { "epoch": 0.45, - "grad_norm": 22.875, - "learning_rate": 1.3515518019661787e-05, - "loss": 4.4333, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.4531, "step": 4010 }, { "epoch": 0.45, - "grad_norm": 24.5, - "learning_rate": 1.3497293709637155e-05, - "loss": 4.4819, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.4581, "step": 4015 }, { "epoch": 0.45, - "grad_norm": 23.125, - "learning_rate": 1.3479056155909993e-05, - "loss": 4.4251, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.4616, "step": 4020 }, { "epoch": 0.45, - "grad_norm": 23.0, - "learning_rate": 1.3460805427543062e-05, - "loss": 4.4175, + "grad_norm": 0.369140625, + "learning_rate": 0.001, + "loss": 2.4446, "step": 4025 }, { "epoch": 0.45, - "grad_norm": 25.5, - "learning_rate": 1.3442541593648998e-05, - "loss": 4.5106, + "grad_norm": 0.359375, + "learning_rate": 0.001, + "loss": 2.3241, "step": 4030 }, { "epoch": 0.45, - "grad_norm": 19.75, - "learning_rate": 1.3424264723390085e-05, - "loss": 4.4186, + "grad_norm": 0.36328125, + "learning_rate": 0.001, + "loss": 2.5278, "step": 4035 }, { "epoch": 0.45, - "grad_norm": 22.5, - "learning_rate": 1.3405974885977954e-05, - "loss": 4.4796, + "grad_norm": 0.34765625, + "learning_rate": 0.001, + "loss": 2.4402, "step": 4040 }, { "epoch": 0.45, - "grad_norm": 22.0, - "learning_rate": 1.3387672150673352e-05, - "loss": 4.4905, + "grad_norm": 0.375, + "learning_rate": 0.001, + "loss": 2.526, "step": 4045 }, { "epoch": 0.45, - "grad_norm": 23.125, - "learning_rate": 1.3369356586785868e-05, - "loss": 4.4685, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.4471, "step": 4050 }, { "epoch": 0.45, - "grad_norm": 23.75, - "learning_rate": 1.3351028263673667e-05, - "loss": 4.5113, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.5027, "step": 4055 }, { "epoch": 0.45, - "grad_norm": 21.0, - "learning_rate": 1.3332687250743233e-05, - "loss": 4.5914, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.4513, "step": 4060 }, { "epoch": 0.45, - "grad_norm": 21.875, - "learning_rate": 1.3314333617449103e-05, - "loss": 4.4291, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.4194, "step": 4065 }, { "epoch": 0.45, - "grad_norm": 21.875, - "learning_rate": 1.3295967433293606e-05, - "loss": 4.4856, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.4691, "step": 4070 }, { "epoch": 0.45, - "grad_norm": 24.0, - "learning_rate": 1.3277588767826602e-05, - "loss": 4.4668, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.4777, "step": 4075 }, { "epoch": 0.45, - "grad_norm": 27.25, - "learning_rate": 1.3259197690645212e-05, - "loss": 4.4973, + "grad_norm": 0.376953125, + "learning_rate": 0.001, + "loss": 2.3957, "step": 4080 }, { "epoch": 0.46, - "grad_norm": 23.125, - "learning_rate": 1.3240794271393558e-05, - "loss": 4.4897, + "grad_norm": 0.38671875, + "learning_rate": 0.001, + "loss": 2.4746, "step": 4085 }, { "epoch": 0.46, - "grad_norm": 25.75, - "learning_rate": 1.3222378579762504e-05, - "loss": 4.5456, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.3968, "step": 4090 }, { "epoch": 0.46, - "grad_norm": 21.375, - "learning_rate": 1.3203950685489384e-05, - "loss": 4.4695, + "grad_norm": 0.359375, + "learning_rate": 0.001, + "loss": 2.4137, "step": 4095 }, { "epoch": 0.46, - "grad_norm": 27.125, - "learning_rate": 1.318551065835774e-05, - "loss": 4.5567, + "grad_norm": 0.376953125, + "learning_rate": 0.001, + "loss": 2.5123, "step": 4100 }, { "epoch": 0.46, - "grad_norm": 23.875, - "learning_rate": 1.3167058568197067e-05, - "loss": 4.4236, + "grad_norm": 0.34375, + "learning_rate": 0.001, + "loss": 2.4194, "step": 4105 }, { "epoch": 0.46, - "grad_norm": 21.25, - "learning_rate": 1.3148594484882528e-05, - "loss": 4.4317, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.3577, "step": 4110 }, { "epoch": 0.46, - "grad_norm": 23.375, - "learning_rate": 1.3130118478334715e-05, - "loss": 4.5094, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.4128, "step": 4115 }, { "epoch": 0.46, - "grad_norm": 24.625, - "learning_rate": 1.311163061851936e-05, - "loss": 4.4885, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.4063, "step": 4120 }, { "epoch": 0.46, - "grad_norm": 23.875, - "learning_rate": 1.3093130975447093e-05, - "loss": 4.4789, + "grad_norm": 0.37890625, + "learning_rate": 0.001, + "loss": 2.4535, "step": 4125 }, { "epoch": 0.46, - "grad_norm": 26.875, - "learning_rate": 1.3074619619173157e-05, - "loss": 4.526, + "grad_norm": 0.353515625, + "learning_rate": 0.001, + "loss": 2.3966, "step": 4130 }, { "epoch": 0.46, - "grad_norm": 22.0, - "learning_rate": 1.3056096619797151e-05, - "loss": 4.4829, + "grad_norm": 0.376953125, + "learning_rate": 0.001, + "loss": 2.4123, "step": 4135 }, { "epoch": 0.46, - "grad_norm": 22.125, - "learning_rate": 1.3037562047462773e-05, - "loss": 4.616, + "grad_norm": 0.45703125, + "learning_rate": 0.001, + "loss": 2.4634, "step": 4140 }, { "epoch": 0.46, - "grad_norm": 21.25, - "learning_rate": 1.3019015972357536e-05, - "loss": 4.4451, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.3939, "step": 4145 }, { "epoch": 0.46, - "grad_norm": 25.75, - "learning_rate": 1.3000458464712518e-05, - "loss": 4.3929, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.3884, "step": 4150 }, { "epoch": 0.46, - "grad_norm": 23.75, - "learning_rate": 1.2981889594802089e-05, - "loss": 4.4795, + "grad_norm": 0.34375, + "learning_rate": 0.001, + "loss": 2.4213, "step": 4155 }, { "epoch": 0.46, - "grad_norm": 23.875, - "learning_rate": 1.2963309432943643e-05, - "loss": 4.4679, + "grad_norm": 0.35546875, + "learning_rate": 0.001, + "loss": 2.3715, "step": 4160 }, { "epoch": 0.46, - "grad_norm": 20.5, - "learning_rate": 1.2944718049497342e-05, - "loss": 4.5112, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.4166, "step": 4165 }, { "epoch": 0.46, - "grad_norm": 35.5, - "learning_rate": 1.2926115514865835e-05, - "loss": 4.4882, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.4946, "step": 4170 }, { "epoch": 0.47, - "grad_norm": 24.125, - "learning_rate": 1.2907501899494e-05, - "loss": 4.4449, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.4918, "step": 4175 }, { "epoch": 0.47, - "grad_norm": 25.375, - "learning_rate": 1.2888877273868684e-05, - "loss": 4.5325, + "grad_norm": 0.359375, + "learning_rate": 0.001, + "loss": 2.381, "step": 4180 }, { "epoch": 0.47, - "grad_norm": 19.875, - "learning_rate": 1.2870241708518413e-05, - "loss": 4.4406, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.3991, "step": 4185 }, { "epoch": 0.47, - "grad_norm": 22.875, - "learning_rate": 1.2851595274013158e-05, - "loss": 4.4777, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.4139, "step": 4190 }, { "epoch": 0.47, - "grad_norm": 22.375, - "learning_rate": 1.2832938040964037e-05, - "loss": 4.584, + "grad_norm": 0.34375, + "learning_rate": 0.001, + "loss": 2.5107, "step": 4195 }, { "epoch": 0.47, - "grad_norm": 24.0, - "learning_rate": 1.2814270080023063e-05, - "loss": 4.4707, + "grad_norm": 0.353515625, + "learning_rate": 0.001, + "loss": 2.387, "step": 4200 }, { "epoch": 0.47, - "grad_norm": 21.25, - "learning_rate": 1.2795591461882877e-05, - "loss": 4.4909, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.447, "step": 4205 }, { "epoch": 0.47, - "grad_norm": 21.875, - "learning_rate": 1.2776902257276471e-05, - "loss": 4.5669, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.3473, "step": 4210 }, { "epoch": 0.47, - "grad_norm": 25.125, - "learning_rate": 1.2758202536976936e-05, - "loss": 4.5154, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.3907, "step": 4215 }, { "epoch": 0.47, - "grad_norm": 20.875, - "learning_rate": 1.2739492371797172e-05, - "loss": 4.5008, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 2.3656, "step": 4220 }, { "epoch": 0.47, - "grad_norm": 25.875, - "learning_rate": 1.2720771832589647e-05, - "loss": 4.4727, + "grad_norm": 0.357421875, + "learning_rate": 0.001, + "loss": 2.5121, "step": 4225 }, { "epoch": 0.47, - "grad_norm": 23.125, - "learning_rate": 1.2702040990246099e-05, - "loss": 4.437, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.4212, "step": 4230 }, { "epoch": 0.47, - "grad_norm": 24.25, - "learning_rate": 1.2683299915697292e-05, - "loss": 4.4722, + "grad_norm": 0.34375, + "learning_rate": 0.001, + "loss": 2.3811, "step": 4235 }, { "epoch": 0.47, - "grad_norm": 22.0, - "learning_rate": 1.2664548679912732e-05, - "loss": 4.5076, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.368, "step": 4240 }, { "epoch": 0.47, - "grad_norm": 19.125, - "learning_rate": 1.2645787353900409e-05, - "loss": 4.5031, + "grad_norm": 0.36328125, + "learning_rate": 0.001, + "loss": 2.4631, "step": 4245 }, { "epoch": 0.47, - "grad_norm": 20.375, - "learning_rate": 1.2627016008706523e-05, - "loss": 4.4988, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.4267, "step": 4250 }, { "epoch": 0.47, - "grad_norm": 19.625, - "learning_rate": 1.2608234715415208e-05, - "loss": 4.4595, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.4453, "step": 4255 }, { "epoch": 0.47, - "grad_norm": 21.0, - "learning_rate": 1.2589443545148278e-05, - "loss": 4.4619, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.3846, "step": 4260 }, { "epoch": 0.48, - "grad_norm": 24.75, - "learning_rate": 1.2570642569064946e-05, - "loss": 4.4257, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.3578, "step": 4265 }, { "epoch": 0.48, - "grad_norm": 23.375, - "learning_rate": 1.2551831858361556e-05, - "loss": 4.3744, + "grad_norm": 0.34375, + "learning_rate": 0.001, + "loss": 2.4811, "step": 4270 }, { "epoch": 0.48, - "grad_norm": 21.625, - "learning_rate": 1.253301148427132e-05, - "loss": 4.4514, + "grad_norm": 0.35546875, + "learning_rate": 0.001, + "loss": 2.4718, "step": 4275 }, { "epoch": 0.48, - "grad_norm": 21.75, - "learning_rate": 1.2514181518064044e-05, - "loss": 4.4791, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.4631, "step": 4280 }, { "epoch": 0.48, - "grad_norm": 30.25, - "learning_rate": 1.249534203104585e-05, - "loss": 4.5157, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.428, "step": 4285 }, { "epoch": 0.48, - "grad_norm": 20.625, - "learning_rate": 1.2476493094558922e-05, - "loss": 4.4425, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.4018, "step": 4290 }, { "epoch": 0.48, - "grad_norm": 25.625, - "learning_rate": 1.2457634779981227e-05, - "loss": 4.4951, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.3948, "step": 4295 }, { "epoch": 0.48, - "grad_norm": 28.75, - "learning_rate": 1.243876715872624e-05, - "loss": 4.4835, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.3939, "step": 4300 }, { "epoch": 0.48, - "grad_norm": 23.375, - "learning_rate": 1.2419890302242683e-05, - "loss": 4.4824, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.3406, "step": 4305 }, { "epoch": 0.48, - "grad_norm": 23.25, - "learning_rate": 1.2401004282014251e-05, - "loss": 4.5057, + "grad_norm": 0.384765625, + "learning_rate": 0.001, + "loss": 2.427, "step": 4310 }, { "epoch": 0.48, - "grad_norm": 22.875, - "learning_rate": 1.2382109169559337e-05, - "loss": 4.4626, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.4806, "step": 4315 }, { "epoch": 0.48, - "grad_norm": 22.625, - "learning_rate": 1.236320503643077e-05, - "loss": 4.5516, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.3565, "step": 4320 }, { "epoch": 0.48, - "grad_norm": 20.75, - "learning_rate": 1.2344291954215534e-05, - "loss": 4.5318, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.4658, "step": 4325 }, { "epoch": 0.48, - "grad_norm": 22.625, - "learning_rate": 1.2325369994534503e-05, - "loss": 4.5469, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.3383, "step": 4330 }, { "epoch": 0.48, - "grad_norm": 21.75, - "learning_rate": 1.2306439229042172e-05, - "loss": 4.5379, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.4433, "step": 4335 }, { "epoch": 0.48, - "grad_norm": 19.375, - "learning_rate": 1.2287499729426378e-05, - "loss": 4.5257, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.4428, "step": 4340 }, { "epoch": 0.48, - "grad_norm": 21.125, - "learning_rate": 1.2268551567408034e-05, - "loss": 4.4628, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.3914, "step": 4345 }, { "epoch": 0.49, - "grad_norm": 23.625, - "learning_rate": 1.2249594814740855e-05, - "loss": 4.5005, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.4283, "step": 4350 }, { "epoch": 0.49, - "grad_norm": 17.125, - "learning_rate": 1.2230629543211095e-05, - "loss": 4.5027, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.4238, "step": 4355 }, { "epoch": 0.49, - "grad_norm": 28.0, - "learning_rate": 1.2211655824637253e-05, - "loss": 4.4872, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.3536, "step": 4360 }, { "epoch": 0.49, - "grad_norm": 22.875, - "learning_rate": 1.2192673730869828e-05, - "loss": 4.5564, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.3306, "step": 4365 }, { "epoch": 0.49, - "grad_norm": 20.875, - "learning_rate": 1.2173683333791036e-05, - "loss": 4.4685, + "grad_norm": 0.369140625, + "learning_rate": 0.001, + "loss": 2.4716, "step": 4370 }, { "epoch": 0.49, - "grad_norm": 24.0, - "learning_rate": 1.2154684705314522e-05, - "loss": 4.4932, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.3698, "step": 4375 }, { "epoch": 0.49, - "grad_norm": 25.75, - "learning_rate": 1.2135677917385115e-05, - "loss": 4.5181, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.4516, "step": 4380 }, { "epoch": 0.49, - "grad_norm": 27.125, - "learning_rate": 1.2116663041978539e-05, - "loss": 4.5899, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.3544, "step": 4385 }, { "epoch": 0.49, - "grad_norm": 22.75, - "learning_rate": 1.2097640151101144e-05, - "loss": 4.5447, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.4579, "step": 4390 }, { "epoch": 0.49, - "grad_norm": 23.75, - "learning_rate": 1.2078609316789629e-05, - "loss": 4.4627, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.4471, "step": 4395 }, { "epoch": 0.49, - "grad_norm": 18.125, - "learning_rate": 1.2059570611110785e-05, - "loss": 4.476, + "grad_norm": 0.34765625, + "learning_rate": 0.001, + "loss": 2.4688, "step": 4400 }, { "epoch": 0.49, - "grad_norm": 18.125, - "learning_rate": 1.2040524106161196e-05, - "loss": 4.4871, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.4272, "step": 4405 }, { "epoch": 0.49, - "grad_norm": 20.875, - "learning_rate": 1.2021469874066988e-05, - "loss": 4.4557, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.3917, "step": 4410 }, { "epoch": 0.49, - "grad_norm": 20.125, - "learning_rate": 1.2002407986983556e-05, - "loss": 4.5595, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.4167, "step": 4415 }, { "epoch": 0.49, - "grad_norm": 23.25, - "learning_rate": 1.1983338517095266e-05, - "loss": 4.5685, + "grad_norm": 0.359375, + "learning_rate": 0.001, + "loss": 2.4342, "step": 4420 }, { "epoch": 0.49, - "grad_norm": 21.625, - "learning_rate": 1.1964261536615213e-05, - "loss": 4.5091, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.5076, "step": 4425 }, { "epoch": 0.49, - "grad_norm": 25.5, - "learning_rate": 1.194517711778493e-05, - "loss": 4.4676, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.4897, "step": 4430 }, { "epoch": 0.49, - "grad_norm": 23.5, - "learning_rate": 1.1926085332874114e-05, - "loss": 4.5009, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.4231, "step": 4435 }, { "epoch": 0.5, - "grad_norm": 25.625, - "learning_rate": 1.1906986254180357e-05, - "loss": 4.605, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.358, "step": 4440 }, { "epoch": 0.5, - "grad_norm": 24.125, - "learning_rate": 1.1887879954028878e-05, - "loss": 4.4869, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.3515, "step": 4445 }, { "epoch": 0.5, - "grad_norm": 17.5, - "learning_rate": 1.1868766504772233e-05, - "loss": 4.4555, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.4139, "step": 4450 }, { "epoch": 0.5, - "grad_norm": 20.5, - "learning_rate": 1.1849645978790061e-05, - "loss": 4.4505, + "grad_norm": 0.34375, + "learning_rate": 0.001, + "loss": 2.5031, "step": 4455 }, { "epoch": 0.5, - "grad_norm": 22.25, - "learning_rate": 1.1830518448488788e-05, - "loss": 4.5176, + "grad_norm": 0.34375, + "learning_rate": 0.001, + "loss": 2.3742, "step": 4460 }, { "epoch": 0.5, - "grad_norm": 21.625, - "learning_rate": 1.1811383986301373e-05, - "loss": 4.4826, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.467, "step": 4465 }, { "epoch": 0.5, - "grad_norm": 22.0, - "learning_rate": 1.1792242664687023e-05, - "loss": 4.5374, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.3063, "step": 4470 }, { "epoch": 0.5, - "grad_norm": 25.25, - "learning_rate": 1.1773094556130916e-05, - "loss": 4.4535, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.2742, "step": 4475 }, { "epoch": 0.5, - "grad_norm": 21.5, - "learning_rate": 1.1753939733143937e-05, - "loss": 4.467, + "grad_norm": 0.390625, + "learning_rate": 0.001, + "loss": 2.3993, "step": 4480 }, { "epoch": 0.5, - "grad_norm": 22.75, - "learning_rate": 1.1734778268262395e-05, - "loss": 4.6161, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.3711, "step": 4485 }, { "epoch": 0.5, - "grad_norm": 19.75, - "learning_rate": 1.1715610234047752e-05, - "loss": 4.4712, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.3366, "step": 4490 }, { "epoch": 0.5, - "grad_norm": 22.0, - "learning_rate": 1.1696435703086341e-05, - "loss": 4.5308, + "grad_norm": 0.359375, + "learning_rate": 0.001, + "loss": 2.3711, "step": 4495 }, { "epoch": 0.5, - "grad_norm": 20.625, - "learning_rate": 1.167725474798911e-05, - "loss": 4.4886, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.4112, "step": 4500 }, { "epoch": 0.5, - "grad_norm": 24.0, - "learning_rate": 1.1658067441391321e-05, - "loss": 4.4825, + "grad_norm": 0.34765625, + "learning_rate": 0.001, + "loss": 2.5269, "step": 4505 }, { "epoch": 0.5, - "grad_norm": 23.625, - "learning_rate": 1.1638873855952291e-05, - "loss": 4.5291, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.4256, "step": 4510 }, { "epoch": 0.5, - "grad_norm": 26.0, - "learning_rate": 1.1619674064355124e-05, - "loss": 4.4875, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.4714, "step": 4515 }, { "epoch": 0.5, - "grad_norm": 24.125, - "learning_rate": 1.160046813930641e-05, - "loss": 4.4646, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.4565, "step": 4520 }, { "epoch": 0.5, - "grad_norm": 18.125, - "learning_rate": 1.1581256153535984e-05, - "loss": 4.5294, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.4071, "step": 4525 }, { "epoch": 0.51, - "grad_norm": 22.0, - "learning_rate": 1.156203817979661e-05, - "loss": 4.4504, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.4281, "step": 4530 }, { "epoch": 0.51, - "grad_norm": 20.875, - "learning_rate": 1.1542814290863749e-05, - "loss": 4.5137, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.4384, "step": 4535 }, { "epoch": 0.51, - "grad_norm": 17.625, - "learning_rate": 1.1523584559535244e-05, - "loss": 4.4628, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.4587, "step": 4540 }, { "epoch": 0.51, - "grad_norm": 23.25, - "learning_rate": 1.1504349058631075e-05, - "loss": 4.5838, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.4184, "step": 4545 }, { "epoch": 0.51, - "grad_norm": 21.75, - "learning_rate": 1.1485107860993064e-05, - "loss": 4.4518, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.3867, "step": 4550 }, { "epoch": 0.51, - "grad_norm": 23.75, - "learning_rate": 1.1465861039484607e-05, - "loss": 4.4922, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.3673, "step": 4555 }, { "epoch": 0.51, - "grad_norm": 27.5, - "learning_rate": 1.1446608666990397e-05, - "loss": 4.4627, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.4, "step": 4560 }, { "epoch": 0.51, - "grad_norm": 20.25, - "learning_rate": 1.1427350816416152e-05, - "loss": 4.485, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.4027, "step": 4565 }, { "epoch": 0.51, - "grad_norm": 22.75, - "learning_rate": 1.1408087560688326e-05, - "loss": 4.5073, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.4282, "step": 4570 }, { "epoch": 0.51, - "grad_norm": 22.5, - "learning_rate": 1.1388818972753846e-05, - "loss": 4.3852, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.4324, "step": 4575 }, { "epoch": 0.51, - "grad_norm": 23.625, - "learning_rate": 1.1369545125579832e-05, - "loss": 4.4487, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.4159, "step": 4580 }, { "epoch": 0.51, - "grad_norm": 21.625, - "learning_rate": 1.135026609215332e-05, - "loss": 4.4202, + "grad_norm": 0.34765625, + "learning_rate": 0.001, + "loss": 2.3382, "step": 4585 }, { "epoch": 0.51, - "grad_norm": 19.875, - "learning_rate": 1.1330981945480985e-05, - "loss": 4.5208, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.3968, "step": 4590 }, { "epoch": 0.51, - "grad_norm": 21.25, - "learning_rate": 1.1311692758588862e-05, - "loss": 4.4729, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.3331, "step": 4595 }, { "epoch": 0.51, - "grad_norm": 19.5, - "learning_rate": 1.1292398604522076e-05, - "loss": 4.5609, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.4229, "step": 4600 }, { "epoch": 0.51, - "grad_norm": 21.625, - "learning_rate": 1.1273099556344563e-05, - "loss": 4.4376, + "grad_norm": 0.37109375, + "learning_rate": 0.001, + "loss": 2.3908, "step": 4605 }, { "epoch": 0.51, - "grad_norm": 22.625, - "learning_rate": 1.1253795687138787e-05, - "loss": 4.4811, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.4211, "step": 4610 }, { "epoch": 0.51, - "grad_norm": 24.125, - "learning_rate": 1.1234487070005476e-05, - "loss": 4.4778, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.4101, "step": 4615 }, { "epoch": 0.52, - "grad_norm": 23.625, - "learning_rate": 1.1215173778063329e-05, - "loss": 4.5821, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.3875, "step": 4620 }, { "epoch": 0.52, - "grad_norm": 23.125, - "learning_rate": 1.1195855884448757e-05, - "loss": 4.4502, + "grad_norm": 0.34765625, + "learning_rate": 0.001, + "loss": 2.3807, "step": 4625 }, { "epoch": 0.52, - "grad_norm": 22.375, - "learning_rate": 1.1176533462315584e-05, - "loss": 4.4743, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.3716, "step": 4630 }, { "epoch": 0.52, - "grad_norm": 28.0, - "learning_rate": 1.1157206584834802e-05, - "loss": 4.3541, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.4305, "step": 4635 }, { "epoch": 0.52, - "grad_norm": 24.375, - "learning_rate": 1.1137875325194253e-05, - "loss": 4.4734, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.4326, "step": 4640 }, { "epoch": 0.52, - "grad_norm": 24.875, - "learning_rate": 1.1118539756598393e-05, - "loss": 4.5791, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.4278, "step": 4645 }, { "epoch": 0.52, - "grad_norm": 23.625, - "learning_rate": 1.1099199952267976e-05, - "loss": 4.547, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.3428, "step": 4650 }, { "epoch": 0.52, - "grad_norm": 18.75, - "learning_rate": 1.1079855985439815e-05, - "loss": 4.4843, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.3823, "step": 4655 }, { "epoch": 0.52, - "grad_norm": 26.0, - "learning_rate": 1.1060507929366475e-05, - "loss": 4.4857, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.4197, "step": 4660 }, { "epoch": 0.52, - "grad_norm": 24.375, - "learning_rate": 1.104115585731601e-05, - "loss": 4.537, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.4177, "step": 4665 }, { "epoch": 0.52, - "grad_norm": 24.875, - "learning_rate": 1.1021799842571676e-05, - "loss": 4.4704, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.2897, "step": 4670 }, { "epoch": 0.52, - "grad_norm": 21.375, - "learning_rate": 1.1002439958431668e-05, - "loss": 4.5169, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.3867, "step": 4675 }, { "epoch": 0.52, - "grad_norm": 29.125, - "learning_rate": 1.0983076278208826e-05, - "loss": 4.5436, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.2943, "step": 4680 }, { "epoch": 0.52, - "grad_norm": 21.125, - "learning_rate": 1.096370887523037e-05, - "loss": 4.424, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.4413, "step": 4685 }, { "epoch": 0.52, - "grad_norm": 17.75, - "learning_rate": 1.0944337822837618e-05, - "loss": 4.5384, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.3828, "step": 4690 }, { "epoch": 0.52, - "grad_norm": 19.5, - "learning_rate": 1.0924963194385703e-05, - "loss": 4.5604, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.451, "step": 4695 }, { "epoch": 0.52, - "grad_norm": 22.375, - "learning_rate": 1.0905585063243307e-05, - "loss": 4.4633, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.3077, "step": 4700 }, { "epoch": 0.52, - "grad_norm": 24.25, - "learning_rate": 1.0886203502792365e-05, - "loss": 4.4982, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 2.4278, "step": 4705 }, { "epoch": 0.53, - "grad_norm": 25.5, - "learning_rate": 1.0866818586427814e-05, - "loss": 4.4854, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.371, "step": 4710 }, { "epoch": 0.53, - "grad_norm": 26.0, - "learning_rate": 1.0847430387557285e-05, - "loss": 4.5265, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.429, "step": 4715 }, { "epoch": 0.53, - "grad_norm": 25.0, - "learning_rate": 1.0828038979600848e-05, - "loss": 4.5484, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.3383, "step": 4720 }, { "epoch": 0.53, - "grad_norm": 22.5, - "learning_rate": 1.0808644435990718e-05, - "loss": 4.5287, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.3408, "step": 4725 }, { "epoch": 0.53, - "grad_norm": 25.875, - "learning_rate": 1.0789246830170992e-05, - "loss": 4.5603, + "grad_norm": 0.388671875, + "learning_rate": 0.001, + "loss": 2.4862, "step": 4730 }, { "epoch": 0.53, - "grad_norm": 22.75, - "learning_rate": 1.0769846235597361e-05, - "loss": 4.446, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.5457, "step": 4735 }, { "epoch": 0.53, - "grad_norm": 26.0, - "learning_rate": 1.0750442725736827e-05, - "loss": 4.5241, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.4219, "step": 4740 }, { "epoch": 0.53, - "grad_norm": 23.125, - "learning_rate": 1.0731036374067444e-05, - "loss": 4.4543, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.277, "step": 4745 }, { "epoch": 0.53, - "grad_norm": 20.25, - "learning_rate": 1.0711627254078013e-05, - "loss": 4.3908, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.3562, "step": 4750 }, { "epoch": 0.53, - "grad_norm": 20.0, - "learning_rate": 1.0692215439267834e-05, - "loss": 4.4717, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.4333, "step": 4755 }, { "epoch": 0.53, - "grad_norm": 19.875, - "learning_rate": 1.06728010031464e-05, - "loss": 4.4853, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.3555, "step": 4760 }, { "epoch": 0.53, - "grad_norm": 19.75, - "learning_rate": 1.0653384019233135e-05, - "loss": 4.4341, + "grad_norm": 0.365234375, + "learning_rate": 0.001, + "loss": 2.4844, "step": 4765 }, { "epoch": 0.53, - "grad_norm": 28.75, - "learning_rate": 1.063396456105711e-05, - "loss": 4.4389, + "grad_norm": 0.435546875, + "learning_rate": 0.001, + "loss": 2.4931, "step": 4770 }, { "epoch": 0.53, - "grad_norm": 19.875, - "learning_rate": 1.0614542702156765e-05, - "loss": 4.4922, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.3959, "step": 4775 }, { "epoch": 0.53, - "grad_norm": 20.875, - "learning_rate": 1.059511851607963e-05, - "loss": 4.4694, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.3608, "step": 4780 }, { "epoch": 0.53, - "grad_norm": 29.0, - "learning_rate": 1.0575692076382056e-05, - "loss": 4.4844, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.3661, "step": 4785 }, { "epoch": 0.53, - "grad_norm": 26.75, - "learning_rate": 1.055626345662892e-05, - "loss": 4.5797, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.4261, "step": 4790 }, { "epoch": 0.53, - "grad_norm": 28.625, - "learning_rate": 1.0536832730393354e-05, - "loss": 4.5481, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.3362, "step": 4795 }, { "epoch": 0.54, - "grad_norm": 21.375, - "learning_rate": 1.0517399971256474e-05, - "loss": 4.5397, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.4043, "step": 4800 }, { "epoch": 0.54, - "grad_norm": 22.375, - "learning_rate": 1.0497965252807082e-05, - "loss": 4.4869, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.3257, "step": 4805 }, { "epoch": 0.54, - "grad_norm": 21.75, - "learning_rate": 1.0478528648641417e-05, - "loss": 4.5376, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.4065, "step": 4810 }, { "epoch": 0.54, - "grad_norm": 30.5, - "learning_rate": 1.045909023236284e-05, - "loss": 4.5406, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.3205, "step": 4815 }, { "epoch": 0.54, - "grad_norm": 23.625, - "learning_rate": 1.0439650077581592e-05, - "loss": 4.4913, + "grad_norm": 0.35546875, + "learning_rate": 0.001, + "loss": 2.4849, "step": 4820 }, { "epoch": 0.54, - "grad_norm": 24.25, - "learning_rate": 1.0420208257914481e-05, - "loss": 4.4513, + "grad_norm": 0.373046875, + "learning_rate": 0.001, + "loss": 2.4492, "step": 4825 }, { "epoch": 0.54, - "grad_norm": 22.375, - "learning_rate": 1.0400764846984633e-05, - "loss": 4.494, + "grad_norm": 0.353515625, + "learning_rate": 0.001, + "loss": 2.3474, "step": 4830 }, { "epoch": 0.54, - "grad_norm": 23.875, - "learning_rate": 1.0381319918421191e-05, - "loss": 4.4977, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.3716, "step": 4835 }, { "epoch": 0.54, - "grad_norm": 19.875, - "learning_rate": 1.0361873545859049e-05, - "loss": 4.4747, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.3561, "step": 4840 }, { "epoch": 0.54, - "grad_norm": 23.25, - "learning_rate": 1.0342425802938573e-05, - "loss": 4.5203, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.3908, "step": 4845 }, { "epoch": 0.54, - "grad_norm": 25.25, - "learning_rate": 1.0322976763305308e-05, - "loss": 4.4646, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.4253, "step": 4850 }, { "epoch": 0.54, - "grad_norm": 22.0, - "learning_rate": 1.030352650060972e-05, - "loss": 4.5321, + "grad_norm": 0.357421875, + "learning_rate": 0.001, + "loss": 2.3876, "step": 4855 }, { "epoch": 0.54, - "grad_norm": 24.625, - "learning_rate": 1.0284075088506898e-05, - "loss": 4.4667, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.3442, "step": 4860 }, { "epoch": 0.54, - "grad_norm": 21.875, - "learning_rate": 1.026462260065629e-05, - "loss": 4.372, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.3322, "step": 4865 }, { "epoch": 0.54, - "grad_norm": 22.625, - "learning_rate": 1.0245169110721411e-05, - "loss": 4.4409, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.3931, "step": 4870 }, { "epoch": 0.54, - "grad_norm": 24.5, - "learning_rate": 1.0225714692369582e-05, - "loss": 4.4908, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.4097, "step": 4875 }, { "epoch": 0.54, - "grad_norm": 23.625, - "learning_rate": 1.020625941927163e-05, - "loss": 4.4568, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.3871, "step": 4880 }, { "epoch": 0.54, - "grad_norm": 23.0, - "learning_rate": 1.018680336510162e-05, - "loss": 4.5082, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.383, "step": 4885 }, { "epoch": 0.55, - "grad_norm": 21.375, - "learning_rate": 1.0167346603536577e-05, - "loss": 4.4756, + "grad_norm": 0.375, + "learning_rate": 0.001, + "loss": 2.3096, "step": 4890 }, { "epoch": 0.55, - "grad_norm": 34.5, - "learning_rate": 1.0147889208256203e-05, - "loss": 4.5365, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.3468, "step": 4895 }, { "epoch": 0.55, - "grad_norm": 23.125, - "learning_rate": 1.0128431252942603e-05, - "loss": 4.469, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.339, "step": 4900 }, { "epoch": 0.55, - "grad_norm": 26.0, - "learning_rate": 1.010897281128e-05, - "loss": 4.4557, + "grad_norm": 0.365234375, + "learning_rate": 0.001, + "loss": 2.3719, "step": 4905 }, { "epoch": 0.55, - "grad_norm": 20.125, - "learning_rate": 1.008951395695446e-05, - "loss": 4.432, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.3645, "step": 4910 }, { "epoch": 0.55, - "grad_norm": 22.625, - "learning_rate": 1.007005476365361e-05, - "loss": 4.5067, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.3295, "step": 4915 }, { "epoch": 0.55, - "grad_norm": 24.625, - "learning_rate": 1.0050595305066364e-05, - "loss": 4.4981, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.4452, "step": 4920 }, { "epoch": 0.55, - "grad_norm": 22.25, - "learning_rate": 1.0031135654882634e-05, - "loss": 4.4405, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.3908, "step": 4925 }, { "epoch": 0.55, - "grad_norm": 23.625, - "learning_rate": 1.0011675886793063e-05, - "loss": 4.4477, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.3639, "step": 4930 }, { "epoch": 0.55, - "grad_norm": 20.0, - "learning_rate": 9.992216074488744e-06, - "loss": 4.443, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.3453, "step": 4935 }, { "epoch": 0.55, - "grad_norm": 21.375, - "learning_rate": 9.972756291660928e-06, - "loss": 4.4854, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.4583, "step": 4940 }, { "epoch": 0.55, - "grad_norm": 25.25, - "learning_rate": 9.953296612000762e-06, - "loss": 4.4655, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.3326, "step": 4945 }, { "epoch": 0.55, - "grad_norm": 26.5, - "learning_rate": 9.933837109198995e-06, - "loss": 4.4724, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.3368, "step": 4950 }, { "epoch": 0.55, - "grad_norm": 22.25, - "learning_rate": 9.914377856945718e-06, - "loss": 4.6013, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.3238, "step": 4955 }, { "epoch": 0.55, - "grad_norm": 22.125, - "learning_rate": 9.894918928930059e-06, - "loss": 4.4671, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.3857, "step": 4960 }, { "epoch": 0.55, - "grad_norm": 21.25, - "learning_rate": 9.875460398839932e-06, - "loss": 4.4695, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.4269, "step": 4965 }, { "epoch": 0.55, - "grad_norm": 19.5, - "learning_rate": 9.85600234036173e-06, - "loss": 4.5066, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.4069, "step": 4970 }, { "epoch": 0.55, - "grad_norm": 23.625, - "learning_rate": 9.83654482718007e-06, - "loss": 4.4804, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.4184, "step": 4975 }, { "epoch": 0.56, - "grad_norm": 26.25, - "learning_rate": 9.817087932977505e-06, - "loss": 4.4701, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.3375, "step": 4980 }, { "epoch": 0.56, - "grad_norm": 20.5, - "learning_rate": 9.797631731434237e-06, - "loss": 4.4796, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.3276, "step": 4985 }, { "epoch": 0.56, - "grad_norm": 19.0, - "learning_rate": 9.778176296227849e-06, - "loss": 4.4486, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.387, "step": 4990 }, { "epoch": 0.56, - "grad_norm": 17.875, - "learning_rate": 9.75872170103302e-06, - "loss": 4.4735, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.4152, "step": 4995 }, { "epoch": 0.56, - "grad_norm": 23.875, - "learning_rate": 9.739268019521252e-06, - "loss": 4.4576, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.4344, "step": 5000 }, { "epoch": 0.56, - "grad_norm": 22.75, - "learning_rate": 9.719815325360582e-06, - "loss": 4.4523, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.3746, "step": 5005 }, { "epoch": 0.56, - "grad_norm": 19.875, - "learning_rate": 9.700363692215313e-06, - "loss": 4.4976, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.3692, "step": 5010 }, { "epoch": 0.56, - "grad_norm": 18.375, - "learning_rate": 9.680913193745725e-06, - "loss": 4.4731, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.3626, "step": 5015 }, { "epoch": 0.56, - "grad_norm": 22.625, - "learning_rate": 9.661463903607809e-06, - "loss": 4.4024, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.4002, "step": 5020 }, { "epoch": 0.56, - "grad_norm": 24.625, - "learning_rate": 9.642015895452967e-06, - "loss": 4.5334, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.4398, "step": 5025 }, { "epoch": 0.56, - "grad_norm": 18.875, - "learning_rate": 9.622569242927759e-06, - "loss": 4.5363, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.2824, "step": 5030 }, { "epoch": 0.56, - "grad_norm": 22.0, - "learning_rate": 9.60312401967361e-06, - "loss": 4.4672, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.3933, "step": 5035 }, { "epoch": 0.56, - "grad_norm": 28.625, - "learning_rate": 9.58368029932652e-06, - "loss": 4.4229, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.3513, "step": 5040 }, { "epoch": 0.56, - "grad_norm": 23.125, - "learning_rate": 9.564238155516817e-06, - "loss": 4.3939, + "grad_norm": 0.34375, + "learning_rate": 0.001, + "loss": 2.3421, "step": 5045 }, { "epoch": 0.56, - "grad_norm": 21.25, - "learning_rate": 9.544797661868844e-06, - "loss": 4.5521, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.3887, "step": 5050 }, { "epoch": 0.56, - "grad_norm": 19.0, - "learning_rate": 9.5253588920007e-06, - "loss": 4.4645, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.3859, "step": 5055 }, { "epoch": 0.56, - "grad_norm": 26.375, - "learning_rate": 9.505921919523959e-06, - "loss": 4.5131, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.3026, "step": 5060 }, { "epoch": 0.56, - "grad_norm": 18.625, - "learning_rate": 9.486486818043383e-06, - "loss": 4.4888, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.326, "step": 5065 }, { "epoch": 0.57, - "grad_norm": 22.25, - "learning_rate": 9.467053661156654e-06, - "loss": 4.4655, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.3422, "step": 5070 }, { "epoch": 0.57, - "grad_norm": 22.75, - "learning_rate": 9.447622522454083e-06, - "loss": 4.5144, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.3424, "step": 5075 }, { "epoch": 0.57, - "grad_norm": 19.625, - "learning_rate": 9.428193475518354e-06, - "loss": 4.4091, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.339, "step": 5080 }, { "epoch": 0.57, - "grad_norm": 18.5, - "learning_rate": 9.408766593924206e-06, - "loss": 4.5175, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.3462, "step": 5085 }, { "epoch": 0.57, - "grad_norm": 20.625, - "learning_rate": 9.389341951238204e-06, - "loss": 4.4197, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.3583, "step": 5090 }, { "epoch": 0.57, - "grad_norm": 19.375, - "learning_rate": 9.369919621018411e-06, - "loss": 4.5675, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.4476, "step": 5095 }, { "epoch": 0.57, - "grad_norm": 21.875, - "learning_rate": 9.350499676814152e-06, - "loss": 4.4866, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.3964, "step": 5100 }, { "epoch": 0.57, - "grad_norm": 23.625, - "learning_rate": 9.331082192165704e-06, - "loss": 4.4966, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.3881, "step": 5105 }, { "epoch": 0.57, - "grad_norm": 23.0, - "learning_rate": 9.311667240604038e-06, - "loss": 4.4743, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.3989, "step": 5110 }, { "epoch": 0.57, - "grad_norm": 20.0, - "learning_rate": 9.292254895650524e-06, - "loss": 4.5309, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.3513, "step": 5115 }, { "epoch": 0.57, - "grad_norm": 24.625, - "learning_rate": 9.272845230816673e-06, - "loss": 4.4766, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.4226, "step": 5120 }, { "epoch": 0.57, - "grad_norm": 19.75, - "learning_rate": 9.25343831960383e-06, - "loss": 4.4562, + "grad_norm": 0.384765625, + "learning_rate": 0.001, + "loss": 2.318, "step": 5125 }, { "epoch": 0.57, - "grad_norm": 25.25, - "learning_rate": 9.234034235502935e-06, - "loss": 4.4636, + "grad_norm": 0.55078125, + "learning_rate": 0.001, + "loss": 2.3471, "step": 5130 }, { "epoch": 0.57, - "grad_norm": 24.25, - "learning_rate": 9.214633051994204e-06, - "loss": 4.4302, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.3504, "step": 5135 }, { "epoch": 0.57, - "grad_norm": 21.5, - "learning_rate": 9.195234842546877e-06, - "loss": 4.4909, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.3703, "step": 5140 }, { "epoch": 0.57, - "grad_norm": 25.75, - "learning_rate": 9.175839680618925e-06, - "loss": 4.5535, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.356, "step": 5145 }, { "epoch": 0.57, - "grad_norm": 25.25, - "learning_rate": 9.15644763965679e-06, - "loss": 4.4893, + "grad_norm": 0.34765625, + "learning_rate": 0.001, + "loss": 2.3881, "step": 5150 }, { "epoch": 0.57, - "grad_norm": 26.25, - "learning_rate": 9.13705879309508e-06, - "loss": 4.4217, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.3372, "step": 5155 }, { "epoch": 0.58, - "grad_norm": 30.125, - "learning_rate": 9.11767321435632e-06, - "loss": 4.4868, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2871, "step": 5160 }, { "epoch": 0.58, - "grad_norm": 22.375, - "learning_rate": 9.098290976850652e-06, - "loss": 4.459, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.4192, "step": 5165 }, { "epoch": 0.58, - "grad_norm": 23.75, - "learning_rate": 9.07891215397557e-06, - "loss": 4.5241, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.3266, "step": 5170 }, { "epoch": 0.58, - "grad_norm": 23.375, - "learning_rate": 9.059536819115638e-06, - "loss": 4.4932, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.3311, "step": 5175 }, { "epoch": 0.58, - "grad_norm": 21.5, - "learning_rate": 9.040165045642204e-06, - "loss": 4.4991, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.3426, "step": 5180 }, { "epoch": 0.58, - "grad_norm": 21.625, - "learning_rate": 9.02079690691314e-06, - "loss": 4.4207, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.3464, "step": 5185 }, { "epoch": 0.58, - "grad_norm": 24.625, - "learning_rate": 9.001432476272541e-06, - "loss": 4.4409, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.4051, "step": 5190 }, { "epoch": 0.58, - "grad_norm": 21.875, - "learning_rate": 8.982071827050475e-06, - "loss": 4.5309, + "grad_norm": 0.34765625, + "learning_rate": 0.001, + "loss": 2.3669, "step": 5195 }, { "epoch": 0.58, - "grad_norm": 22.125, - "learning_rate": 8.962715032562678e-06, - "loss": 4.4744, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.4088, "step": 5200 }, { "epoch": 0.58, - "grad_norm": 22.875, - "learning_rate": 8.943362166110297e-06, - "loss": 4.4669, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.3947, "step": 5205 }, { "epoch": 0.58, - "grad_norm": 28.5, - "learning_rate": 8.924013300979595e-06, - "loss": 4.4515, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.3573, "step": 5210 }, { "epoch": 0.58, - "grad_norm": 27.125, - "learning_rate": 8.904668510441694e-06, - "loss": 4.4235, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.4141, "step": 5215 }, { "epoch": 0.58, - "grad_norm": 20.875, - "learning_rate": 8.885327867752278e-06, - "loss": 4.5492, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.3164, "step": 5220 }, { "epoch": 0.58, - "grad_norm": 19.625, - "learning_rate": 8.865991446151327e-06, - "loss": 4.4744, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.3358, "step": 5225 }, { "epoch": 0.58, - "grad_norm": 20.125, - "learning_rate": 8.846659318862836e-06, - "loss": 4.5155, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.324, "step": 5230 }, { "epoch": 0.58, - "grad_norm": 19.125, - "learning_rate": 8.827331559094535e-06, - "loss": 4.5716, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.4203, "step": 5235 }, { "epoch": 0.58, - "grad_norm": 36.25, - "learning_rate": 8.80800824003762e-06, - "loss": 4.5285, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.3093, "step": 5240 }, { "epoch": 0.58, - "grad_norm": 19.25, - "learning_rate": 8.788689434866467e-06, - "loss": 4.5596, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.3323, "step": 5245 }, { "epoch": 0.59, - "grad_norm": 22.625, - "learning_rate": 8.769375216738361e-06, - "loss": 4.4601, + "grad_norm": 0.357421875, + "learning_rate": 0.001, + "loss": 2.3769, "step": 5250 }, { "epoch": 0.59, - "grad_norm": 22.375, - "learning_rate": 8.750065658793216e-06, - "loss": 4.4177, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.3383, "step": 5255 }, { "epoch": 0.59, - "grad_norm": 23.5, - "learning_rate": 8.730760834153296e-06, - "loss": 4.5101, + "grad_norm": 0.298828125, + "learning_rate": 0.001, + "loss": 2.2656, "step": 5260 }, { "epoch": 0.59, - "grad_norm": 24.375, - "learning_rate": 8.711460815922943e-06, - "loss": 4.4751, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.2729, "step": 5265 }, { "epoch": 0.59, - "grad_norm": 18.625, - "learning_rate": 8.6921656771883e-06, - "loss": 4.4307, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.3279, "step": 5270 }, { "epoch": 0.59, - "grad_norm": 21.625, - "learning_rate": 8.67287549101703e-06, - "loss": 4.5472, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.4237, "step": 5275 }, { "epoch": 0.59, - "grad_norm": 20.5, - "learning_rate": 8.653590330458038e-06, - "loss": 4.4656, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.3641, "step": 5280 }, { "epoch": 0.59, - "grad_norm": 20.625, - "learning_rate": 8.634310268541205e-06, - "loss": 4.5035, + "grad_norm": 0.36328125, + "learning_rate": 0.001, + "loss": 2.3602, "step": 5285 }, { "epoch": 0.59, - "grad_norm": 19.5, - "learning_rate": 8.615035378277095e-06, - "loss": 4.47, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.385, "step": 5290 }, { "epoch": 0.59, - "grad_norm": 18.125, - "learning_rate": 8.5957657326567e-06, - "loss": 4.4663, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.3023, "step": 5295 }, { "epoch": 0.59, - "grad_norm": 22.875, - "learning_rate": 8.576501404651138e-06, - "loss": 4.4334, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 2.3724, "step": 5300 }, { "epoch": 0.59, - "grad_norm": 22.0, - "learning_rate": 8.5572424672114e-06, - "loss": 4.4131, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.3638, "step": 5305 }, { "epoch": 0.59, - "grad_norm": 22.5, - "learning_rate": 8.53798899326806e-06, - "loss": 4.4516, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.3663, "step": 5310 }, { "epoch": 0.59, - "grad_norm": 21.0, - "learning_rate": 8.518741055731e-06, - "loss": 4.4504, + "grad_norm": 0.365234375, + "learning_rate": 0.001, + "loss": 2.4862, "step": 5315 }, { "epoch": 0.59, - "grad_norm": 20.5, - "learning_rate": 8.499498727489144e-06, - "loss": 4.5071, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.3922, "step": 5320 }, { "epoch": 0.59, - "grad_norm": 19.0, - "learning_rate": 8.480262081410165e-06, - "loss": 4.4416, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.3677, "step": 5325 }, { "epoch": 0.59, - "grad_norm": 21.5, - "learning_rate": 8.461031190340228e-06, - "loss": 4.475, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.3606, "step": 5330 }, { "epoch": 0.59, - "grad_norm": 23.875, - "learning_rate": 8.441806127103695e-06, - "loss": 4.5658, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.3088, "step": 5335 }, { "epoch": 0.6, - "grad_norm": 17.75, - "learning_rate": 8.422586964502868e-06, - "loss": 4.5125, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.4073, "step": 5340 }, { "epoch": 0.6, - "grad_norm": 22.0, - "learning_rate": 8.403373775317698e-06, - "loss": 4.4962, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.3641, "step": 5345 }, { "epoch": 0.6, - "grad_norm": 20.125, - "learning_rate": 8.38416663230552e-06, - "loss": 4.5726, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.3953, "step": 5350 }, { "epoch": 0.6, - "grad_norm": 21.0, - "learning_rate": 8.364965608200767e-06, - "loss": 4.5521, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.3133, "step": 5355 }, { "epoch": 0.6, - "grad_norm": 19.0, - "learning_rate": 8.34577077571471e-06, - "loss": 4.4084, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.3236, "step": 5360 }, { "epoch": 0.6, - "grad_norm": 21.875, - "learning_rate": 8.32658220753516e-06, - "loss": 4.5043, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.3984, "step": 5365 }, { "epoch": 0.6, - "grad_norm": 19.625, - "learning_rate": 8.30739997632622e-06, - "loss": 4.4022, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.3896, "step": 5370 }, { "epoch": 0.6, - "grad_norm": 20.25, - "learning_rate": 8.28822415472799e-06, - "loss": 4.5108, + "grad_norm": 0.34765625, + "learning_rate": 0.001, + "loss": 2.3215, "step": 5375 }, { "epoch": 0.6, - "grad_norm": 20.25, - "learning_rate": 8.269054815356293e-06, - "loss": 4.477, + "grad_norm": 0.376953125, + "learning_rate": 0.001, + "loss": 2.4614, "step": 5380 }, { "epoch": 0.6, - "grad_norm": 23.875, - "learning_rate": 8.249892030802416e-06, - "loss": 4.4787, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.3136, "step": 5385 }, { "epoch": 0.6, - "grad_norm": 21.5, - "learning_rate": 8.230735873632809e-06, - "loss": 4.484, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.3548, "step": 5390 }, { "epoch": 0.6, - "grad_norm": 25.625, - "learning_rate": 8.211586416388842e-06, - "loss": 4.4651, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.3609, "step": 5395 }, { "epoch": 0.6, - "grad_norm": 25.75, - "learning_rate": 8.192443731586498e-06, - "loss": 4.4805, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.3043, "step": 5400 }, { "epoch": 0.6, - "grad_norm": 21.875, - "learning_rate": 8.173307891716125e-06, - "loss": 4.4718, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.3837, "step": 5405 }, { "epoch": 0.6, - "grad_norm": 22.875, - "learning_rate": 8.154178969242143e-06, - "loss": 4.4185, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.4236, "step": 5410 }, { "epoch": 0.6, - "grad_norm": 22.375, - "learning_rate": 8.13505703660278e-06, - "loss": 4.5002, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.3492, "step": 5415 }, { "epoch": 0.6, - "grad_norm": 22.375, - "learning_rate": 8.115942166209802e-06, - "loss": 4.4753, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.3594, "step": 5420 }, { "epoch": 0.6, - "grad_norm": 22.0, - "learning_rate": 8.096834430448213e-06, - "loss": 4.4393, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.4392, "step": 5425 }, { "epoch": 0.61, - "grad_norm": 25.0, - "learning_rate": 8.077733901676018e-06, - "loss": 4.5197, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.3757, "step": 5430 }, { "epoch": 0.61, - "grad_norm": 19.875, - "learning_rate": 8.058640652223916e-06, - "loss": 4.5328, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.3445, "step": 5435 }, { "epoch": 0.61, - "grad_norm": 26.5, - "learning_rate": 8.039554754395054e-06, - "loss": 4.4491, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.4049, "step": 5440 }, { "epoch": 0.61, - "grad_norm": 19.625, - "learning_rate": 8.020476280464726e-06, - "loss": 4.4588, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.4535, "step": 5445 }, { "epoch": 0.61, - "grad_norm": 20.875, - "learning_rate": 8.001405302680124e-06, - "loss": 4.5287, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.3116, "step": 5450 }, { "epoch": 0.61, - "grad_norm": 21.25, - "learning_rate": 7.982341893260045e-06, - "loss": 4.5035, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.3645, "step": 5455 }, { "epoch": 0.61, - "grad_norm": 20.25, - "learning_rate": 7.963286124394632e-06, - "loss": 4.5071, + "grad_norm": 0.359375, + "learning_rate": 0.001, + "loss": 2.3229, "step": 5460 }, { "epoch": 0.61, - "grad_norm": 25.125, - "learning_rate": 7.944238068245089e-06, - "loss": 4.3897, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.3648, "step": 5465 }, { "epoch": 0.61, - "grad_norm": 23.0, - "learning_rate": 7.925197796943419e-06, - "loss": 4.4547, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.4006, "step": 5470 }, { "epoch": 0.61, - "grad_norm": 24.25, - "learning_rate": 7.90616538259214e-06, - "loss": 4.4572, + "grad_norm": 0.357421875, + "learning_rate": 0.001, + "loss": 2.3397, "step": 5475 }, { "epoch": 0.61, - "grad_norm": 23.125, - "learning_rate": 7.887140897264016e-06, - "loss": 4.5621, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.3543, "step": 5480 }, { "epoch": 0.61, - "grad_norm": 22.875, - "learning_rate": 7.868124413001794e-06, - "loss": 4.4864, + "grad_norm": 0.34765625, + "learning_rate": 0.001, + "loss": 2.3028, "step": 5485 }, { "epoch": 0.61, - "grad_norm": 24.375, - "learning_rate": 7.849116001817912e-06, - "loss": 4.425, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.3388, "step": 5490 }, { "epoch": 0.61, - "grad_norm": 19.75, - "learning_rate": 7.830115735694242e-06, - "loss": 4.4854, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.3664, "step": 5495 }, { "epoch": 0.61, - "grad_norm": 21.75, - "learning_rate": 7.811123686581807e-06, - "loss": 4.4898, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2125, "step": 5500 }, { "epoch": 0.61, - "grad_norm": 16.5, - "learning_rate": 7.792139926400525e-06, - "loss": 4.4735, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.4234, "step": 5505 }, { "epoch": 0.61, - "grad_norm": 23.375, - "learning_rate": 7.773164527038908e-06, - "loss": 4.4574, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.3316, "step": 5510 }, { "epoch": 0.61, - "grad_norm": 23.75, - "learning_rate": 7.75419756035382e-06, - "loss": 4.4736, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.367, "step": 5515 }, { "epoch": 0.62, - "grad_norm": 23.625, - "learning_rate": 7.735239098170192e-06, - "loss": 4.427, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.4614, "step": 5520 }, { "epoch": 0.62, - "grad_norm": 20.125, - "learning_rate": 7.716289212280737e-06, - "loss": 4.3837, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.2681, "step": 5525 }, { "epoch": 0.62, - "grad_norm": 23.375, - "learning_rate": 7.697347974445708e-06, - "loss": 4.5295, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.3821, "step": 5530 }, { "epoch": 0.62, - "grad_norm": 21.0, - "learning_rate": 7.678415456392594e-06, - "loss": 4.4218, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.3998, "step": 5535 }, { "epoch": 0.62, - "grad_norm": 19.0, - "learning_rate": 7.659491729815875e-06, - "loss": 4.4223, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.3163, "step": 5540 }, { "epoch": 0.62, - "grad_norm": 24.25, - "learning_rate": 7.640576866376729e-06, - "loss": 4.4424, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.3876, "step": 5545 }, { "epoch": 0.62, - "grad_norm": 18.375, - "learning_rate": 7.621670937702782e-06, - "loss": 4.4212, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.3965, "step": 5550 }, { "epoch": 0.62, - "grad_norm": 26.0, - "learning_rate": 7.602774015387814e-06, - "loss": 4.6063, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.382, "step": 5555 }, { "epoch": 0.62, - "grad_norm": 18.5, - "learning_rate": 7.583886170991509e-06, - "loss": 4.5128, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.2978, "step": 5560 }, { "epoch": 0.62, - "grad_norm": 27.5, - "learning_rate": 7.565007476039167e-06, - "loss": 4.448, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.3291, "step": 5565 }, { "epoch": 0.62, - "grad_norm": 22.0, - "learning_rate": 7.546138002021444e-06, - "loss": 4.5346, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.3339, "step": 5570 }, { "epoch": 0.62, - "grad_norm": 25.625, - "learning_rate": 7.527277820394081e-06, - "loss": 4.5118, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.3565, "step": 5575 }, { "epoch": 0.62, - "grad_norm": 21.625, - "learning_rate": 7.508427002577622e-06, - "loss": 4.4902, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.3237, "step": 5580 }, { "epoch": 0.62, - "grad_norm": 23.375, - "learning_rate": 7.489585619957161e-06, - "loss": 4.427, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.4315, "step": 5585 }, { "epoch": 0.62, - "grad_norm": 21.125, - "learning_rate": 7.47075374388205e-06, - "loss": 4.5392, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.231, "step": 5590 }, { "epoch": 0.62, - "grad_norm": 26.0, - "learning_rate": 7.451931445665656e-06, - "loss": 4.454, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.3903, "step": 5595 }, { "epoch": 0.62, - "grad_norm": 21.625, - "learning_rate": 7.433118796585066e-06, - "loss": 4.5177, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.2526, "step": 5600 }, { "epoch": 0.62, - "grad_norm": 18.375, - "learning_rate": 7.414315867880832e-06, - "loss": 4.4704, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.3466, "step": 5605 }, { "epoch": 0.63, - "grad_norm": 23.25, - "learning_rate": 7.39552273075669e-06, - "loss": 4.4935, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.3012, "step": 5610 }, { "epoch": 0.63, - "grad_norm": 22.0, - "learning_rate": 7.376739456379307e-06, - "loss": 4.4556, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.4253, "step": 5615 }, { "epoch": 0.63, - "grad_norm": 26.0, - "learning_rate": 7.357966115877995e-06, - "loss": 4.4683, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.2762, "step": 5620 }, { "epoch": 0.63, - "grad_norm": 24.375, - "learning_rate": 7.3392027803444446e-06, - "loss": 4.5329, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2793, "step": 5625 }, { "epoch": 0.63, - "grad_norm": 22.125, - "learning_rate": 7.3204495208324685e-06, - "loss": 4.5018, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.3468, "step": 5630 }, { "epoch": 0.63, - "grad_norm": 26.5, - "learning_rate": 7.301706408357714e-06, - "loss": 4.5101, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.4084, "step": 5635 }, { "epoch": 0.63, - "grad_norm": 22.625, - "learning_rate": 7.2829735138974114e-06, - "loss": 4.5234, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.347, "step": 5640 }, { "epoch": 0.63, - "grad_norm": 26.0, - "learning_rate": 7.264250908390087e-06, - "loss": 4.533, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.3839, "step": 5645 }, { "epoch": 0.63, - "grad_norm": 21.125, - "learning_rate": 7.245538662735316e-06, - "loss": 4.6303, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.3001, "step": 5650 }, { "epoch": 0.63, - "grad_norm": 24.375, - "learning_rate": 7.2268368477934305e-06, - "loss": 4.4931, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2755, "step": 5655 }, { "epoch": 0.63, - "grad_norm": 22.0, - "learning_rate": 7.208145534385275e-06, - "loss": 4.5871, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.2986, "step": 5660 }, { "epoch": 0.63, - "grad_norm": 25.5, - "learning_rate": 7.189464793291918e-06, - "loss": 4.5874, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.3897, "step": 5665 }, { "epoch": 0.63, - "grad_norm": 20.375, - "learning_rate": 7.170794695254394e-06, - "loss": 4.4837, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.3523, "step": 5670 }, { "epoch": 0.63, - "grad_norm": 22.25, - "learning_rate": 7.152135310973438e-06, - "loss": 4.5024, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.3523, "step": 5675 }, { "epoch": 0.63, - "grad_norm": 20.375, - "learning_rate": 7.1334867111092056e-06, - "loss": 4.4262, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.3354, "step": 5680 }, { "epoch": 0.63, - "grad_norm": 21.875, - "learning_rate": 7.114848966281023e-06, - "loss": 4.4852, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.3529, "step": 5685 }, { "epoch": 0.63, - "grad_norm": 25.25, - "learning_rate": 7.096222147067102e-06, - "loss": 4.5084, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.357, "step": 5690 }, { "epoch": 0.63, - "grad_norm": 18.75, - "learning_rate": 7.077606324004288e-06, - "loss": 4.4285, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.368, "step": 5695 }, { "epoch": 0.64, - "grad_norm": 25.25, - "learning_rate": 7.059001567587777e-06, - "loss": 4.4432, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.3112, "step": 5700 }, { "epoch": 0.64, - "grad_norm": 23.375, - "learning_rate": 7.0404079482708684e-06, - "loss": 4.5151, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.3848, "step": 5705 }, { "epoch": 0.64, - "grad_norm": 23.5, - "learning_rate": 7.021825536464675e-06, - "loss": 4.4839, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.3793, "step": 5710 }, { "epoch": 0.64, - "grad_norm": 22.25, - "learning_rate": 7.003254402537877e-06, - "loss": 4.5195, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.3272, "step": 5715 }, { "epoch": 0.64, - "grad_norm": 25.125, - "learning_rate": 6.984694616816448e-06, - "loss": 4.5127, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.4029, "step": 5720 }, { "epoch": 0.64, - "grad_norm": 20.0, - "learning_rate": 6.966146249583378e-06, - "loss": 4.4502, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.3996, "step": 5725 }, { "epoch": 0.64, - "grad_norm": 18.875, - "learning_rate": 6.94760937107843e-06, - "loss": 4.4745, + "grad_norm": 0.34765625, + "learning_rate": 0.001, + "loss": 2.329, "step": 5730 }, { "epoch": 0.64, - "grad_norm": 19.5, - "learning_rate": 6.929084051497847e-06, - "loss": 4.5018, + "grad_norm": 0.353515625, + "learning_rate": 0.001, + "loss": 2.3392, "step": 5735 }, { "epoch": 0.64, - "grad_norm": 22.25, - "learning_rate": 6.910570360994116e-06, - "loss": 4.4945, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.3056, "step": 5740 }, { "epoch": 0.64, - "grad_norm": 25.25, - "learning_rate": 6.892068369675671e-06, - "loss": 4.5662, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.3413, "step": 5745 }, { "epoch": 0.64, - "grad_norm": 21.125, - "learning_rate": 6.8735781476066545e-06, - "loss": 4.4972, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.3949, "step": 5750 }, { "epoch": 0.64, - "grad_norm": 25.75, - "learning_rate": 6.855099764806627e-06, - "loss": 4.5228, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.438, "step": 5755 }, { "epoch": 0.64, - "grad_norm": 21.625, - "learning_rate": 6.8366332912503385e-06, - "loss": 4.4508, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.4453, "step": 5760 }, { "epoch": 0.64, - "grad_norm": 21.75, - "learning_rate": 6.818178796867419e-06, - "loss": 4.4484, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.3045, "step": 5765 }, { "epoch": 0.64, - "grad_norm": 21.875, - "learning_rate": 6.799736351542147e-06, - "loss": 4.5147, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2906, "step": 5770 }, { "epoch": 0.64, - "grad_norm": 18.5, - "learning_rate": 6.7813060251131635e-06, - "loss": 4.5427, + "grad_norm": 0.294921875, + "learning_rate": 0.001, + "loss": 2.2219, "step": 5775 }, { "epoch": 0.64, - "grad_norm": 21.125, - "learning_rate": 6.762887887373229e-06, - "loss": 4.5083, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.3613, "step": 5780 }, { "epoch": 0.64, - "grad_norm": 19.875, - "learning_rate": 6.744482008068935e-06, - "loss": 4.4537, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.3199, "step": 5785 }, { "epoch": 0.65, - "grad_norm": 21.625, - "learning_rate": 6.726088456900465e-06, - "loss": 4.531, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.3443, "step": 5790 }, { "epoch": 0.65, - "grad_norm": 21.5, - "learning_rate": 6.707707303521305e-06, - "loss": 4.4304, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.2965, "step": 5795 }, { "epoch": 0.65, - "grad_norm": 19.75, - "learning_rate": 6.6893386175380026e-06, - "loss": 4.5292, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2862, "step": 5800 }, { "epoch": 0.65, - "grad_norm": 19.625, - "learning_rate": 6.670982468509888e-06, - "loss": 4.4071, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.3733, "step": 5805 }, { "epoch": 0.65, - "grad_norm": 22.875, - "learning_rate": 6.652638925948816e-06, - "loss": 4.4536, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.2991, "step": 5810 }, { "epoch": 0.65, - "grad_norm": 21.625, - "learning_rate": 6.634308059318908e-06, - "loss": 4.434, + "grad_norm": 0.35546875, + "learning_rate": 0.001, + "loss": 2.3591, "step": 5815 }, { "epoch": 0.65, - "grad_norm": 21.125, - "learning_rate": 6.6159899380362754e-06, - "loss": 4.4139, + "grad_norm": 0.34375, + "learning_rate": 0.001, + "loss": 2.3919, "step": 5820 }, { "epoch": 0.65, - "grad_norm": 22.25, - "learning_rate": 6.5976846314687745e-06, - "loss": 4.4419, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.2326, "step": 5825 }, { "epoch": 0.65, - "grad_norm": 23.625, - "learning_rate": 6.579392208935722e-06, - "loss": 4.4561, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.3035, "step": 5830 }, { "epoch": 0.65, - "grad_norm": 20.5, - "learning_rate": 6.561112739707659e-06, - "loss": 4.584, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.3228, "step": 5835 }, { "epoch": 0.65, - "grad_norm": 22.375, - "learning_rate": 6.542846293006062e-06, - "loss": 4.415, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.3327, "step": 5840 }, { "epoch": 0.65, - "grad_norm": 24.5, - "learning_rate": 6.524592938003102e-06, - "loss": 4.5266, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.265, "step": 5845 }, { "epoch": 0.65, - "grad_norm": 24.375, - "learning_rate": 6.506352743821365e-06, - "loss": 4.5679, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.3723, "step": 5850 }, { "epoch": 0.65, - "grad_norm": 24.75, - "learning_rate": 6.48812577953361e-06, - "loss": 4.4841, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.2181, "step": 5855 }, { "epoch": 0.65, - "grad_norm": 19.75, - "learning_rate": 6.46991211416249e-06, - "loss": 4.4658, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.314, "step": 5860 }, { "epoch": 0.65, - "grad_norm": 22.875, - "learning_rate": 6.451711816680295e-06, - "loss": 4.5259, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.2494, "step": 5865 }, { "epoch": 0.65, - "grad_norm": 19.5, - "learning_rate": 6.433524956008699e-06, - "loss": 4.4923, + "grad_norm": 0.38671875, + "learning_rate": 0.001, + "loss": 2.2576, "step": 5870 }, { "epoch": 0.66, - "grad_norm": 21.625, - "learning_rate": 6.415351601018487e-06, - "loss": 4.4493, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.3208, "step": 5875 }, { "epoch": 0.66, - "grad_norm": 24.375, - "learning_rate": 6.397191820529306e-06, - "loss": 4.4815, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.3072, "step": 5880 }, { "epoch": 0.66, - "grad_norm": 23.625, - "learning_rate": 6.379045683309391e-06, - "loss": 4.4839, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.3374, "step": 5885 }, { "epoch": 0.66, - "grad_norm": 27.125, - "learning_rate": 6.360913258075321e-06, - "loss": 4.4346, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.3156, "step": 5890 }, { "epoch": 0.66, - "grad_norm": 26.5, - "learning_rate": 6.34279461349174e-06, - "loss": 4.5383, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.3659, "step": 5895 }, { "epoch": 0.66, - "grad_norm": 21.875, - "learning_rate": 6.324689818171119e-06, - "loss": 4.4979, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.3847, "step": 5900 }, { "epoch": 0.66, - "grad_norm": 19.0, - "learning_rate": 6.306598940673471e-06, - "loss": 4.4508, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.29, "step": 5905 }, { "epoch": 0.66, - "grad_norm": 17.5, - "learning_rate": 6.288522049506111e-06, - "loss": 4.4946, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.282, "step": 5910 }, { "epoch": 0.66, - "grad_norm": 21.875, - "learning_rate": 6.270459213123395e-06, - "loss": 4.4874, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.3172, "step": 5915 }, { "epoch": 0.66, - "grad_norm": 19.25, - "learning_rate": 6.252410499926442e-06, - "loss": 4.5908, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.2918, "step": 5920 }, { "epoch": 0.66, - "grad_norm": 20.5, - "learning_rate": 6.234375978262905e-06, - "loss": 4.5826, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.3004, "step": 5925 }, { "epoch": 0.66, - "grad_norm": 22.75, - "learning_rate": 6.216355716426683e-06, - "loss": 4.5211, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.3571, "step": 5930 }, { "epoch": 0.66, - "grad_norm": 17.125, - "learning_rate": 6.198349782657681e-06, - "loss": 4.5585, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.3381, "step": 5935 }, { "epoch": 0.66, - "grad_norm": 18.125, - "learning_rate": 6.1803582451415445e-06, - "loss": 4.5106, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.2733, "step": 5940 }, { "epoch": 0.66, - "grad_norm": 21.0, - "learning_rate": 6.162381172009404e-06, - "loss": 4.5323, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.3277, "step": 5945 }, { "epoch": 0.66, - "grad_norm": 20.875, - "learning_rate": 6.144418631337614e-06, - "loss": 4.5863, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.4188, "step": 5950 }, { "epoch": 0.66, - "grad_norm": 23.5, - "learning_rate": 6.126470691147497e-06, - "loss": 4.5104, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.2578, "step": 5955 }, { "epoch": 0.66, - "grad_norm": 17.5, - "learning_rate": 6.10853741940509e-06, - "loss": 4.4218, + "grad_norm": 0.392578125, + "learning_rate": 0.001, + "loss": 2.2164, "step": 5960 }, { "epoch": 0.67, - "grad_norm": 21.625, - "learning_rate": 6.090618884020873e-06, - "loss": 4.4925, + "grad_norm": 0.361328125, + "learning_rate": 0.001, + "loss": 2.3124, "step": 5965 }, { "epoch": 0.67, - "grad_norm": 20.625, - "learning_rate": 6.072715152849533e-06, - "loss": 4.4499, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.2956, "step": 5970 }, { "epoch": 0.67, - "grad_norm": 18.5, - "learning_rate": 6.0548262936896885e-06, - "loss": 4.4662, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2811, "step": 5975 }, { "epoch": 0.67, - "grad_norm": 18.625, - "learning_rate": 6.0369523742836444e-06, - "loss": 4.4995, + "grad_norm": 0.34375, + "learning_rate": 0.001, + "loss": 2.3955, "step": 5980 }, { "epoch": 0.67, - "grad_norm": 20.875, - "learning_rate": 6.019093462317125e-06, - "loss": 4.5146, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.3116, "step": 5985 }, { "epoch": 0.67, - "grad_norm": 23.5, - "learning_rate": 6.0012496254190324e-06, - "loss": 4.5435, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.3184, "step": 5990 }, { "epoch": 0.67, - "grad_norm": 20.875, - "learning_rate": 5.983420931161173e-06, - "loss": 4.4317, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.3773, "step": 5995 }, { "epoch": 0.67, - "grad_norm": 19.25, - "learning_rate": 5.96560744705802e-06, - "loss": 4.447, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.3402, "step": 6000 }, { "epoch": 0.67, - "grad_norm": 25.375, - "learning_rate": 5.9478092405664335e-06, - "loss": 4.4866, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.3409, "step": 6005 }, { "epoch": 0.67, - "grad_norm": 28.125, - "learning_rate": 5.930026379085436e-06, - "loss": 4.4889, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.3015, "step": 6010 }, { "epoch": 0.67, - "grad_norm": 27.25, - "learning_rate": 5.912258929955934e-06, - "loss": 4.4795, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.2731, "step": 6015 }, { "epoch": 0.67, - "grad_norm": 22.125, - "learning_rate": 5.894506960460464e-06, - "loss": 4.4754, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.352, "step": 6020 }, { "epoch": 0.67, - "grad_norm": 23.625, - "learning_rate": 5.876770537822956e-06, - "loss": 4.5156, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.3109, "step": 6025 }, { "epoch": 0.67, - "grad_norm": 19.625, - "learning_rate": 5.859049729208454e-06, - "loss": 4.4548, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.3099, "step": 6030 }, { "epoch": 0.67, - "grad_norm": 20.75, - "learning_rate": 5.841344601722884e-06, - "loss": 4.4397, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2662, "step": 6035 }, { "epoch": 0.67, - "grad_norm": 19.875, - "learning_rate": 5.8236552224127806e-06, - "loss": 4.4769, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.3723, "step": 6040 }, { "epoch": 0.67, - "grad_norm": 20.875, - "learning_rate": 5.805981658265053e-06, - "loss": 4.4571, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.3657, "step": 6045 }, { "epoch": 0.67, - "grad_norm": 22.75, - "learning_rate": 5.788323976206709e-06, - "loss": 4.4971, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.3337, "step": 6050 }, { "epoch": 0.68, - "grad_norm": 22.875, - "learning_rate": 5.770682243104627e-06, - "loss": 4.4765, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.2631, "step": 6055 }, { "epoch": 0.68, - "grad_norm": 19.75, - "learning_rate": 5.7530565257652805e-06, - "loss": 4.4715, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.3135, "step": 6060 }, { "epoch": 0.68, - "grad_norm": 19.375, - "learning_rate": 5.735446890934493e-06, - "loss": 4.5654, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.3438, "step": 6065 }, { "epoch": 0.68, - "grad_norm": 26.875, - "learning_rate": 5.717853405297187e-06, - "loss": 4.4541, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.2664, "step": 6070 }, { "epoch": 0.68, - "grad_norm": 19.25, - "learning_rate": 5.700276135477136e-06, - "loss": 4.3978, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.3946, "step": 6075 }, { "epoch": 0.68, - "grad_norm": 24.625, - "learning_rate": 5.682715148036704e-06, - "loss": 4.5655, + "grad_norm": 0.296875, + "learning_rate": 0.001, + "loss": 2.2588, "step": 6080 }, { "epoch": 0.68, - "grad_norm": 25.25, - "learning_rate": 5.665170509476592e-06, - "loss": 4.4319, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.2975, "step": 6085 }, { "epoch": 0.68, - "grad_norm": 24.0, - "learning_rate": 5.647642286235588e-06, - "loss": 4.4944, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.3101, "step": 6090 }, { "epoch": 0.68, - "grad_norm": 21.875, - "learning_rate": 5.630130544690336e-06, - "loss": 4.4746, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.3642, "step": 6095 }, { "epoch": 0.68, - "grad_norm": 23.0, - "learning_rate": 5.612635351155042e-06, - "loss": 4.4549, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.3494, "step": 6100 }, { "epoch": 0.68, - "grad_norm": 21.125, - "learning_rate": 5.595156771881263e-06, - "loss": 4.4282, + "grad_norm": 0.298828125, + "learning_rate": 0.001, + "loss": 2.2839, "step": 6105 }, { "epoch": 0.68, - "grad_norm": 21.75, - "learning_rate": 5.577694873057638e-06, - "loss": 4.5157, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.3428, "step": 6110 }, { "epoch": 0.68, - "grad_norm": 24.375, - "learning_rate": 5.560249720809639e-06, - "loss": 4.5201, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.2826, "step": 6115 }, { "epoch": 0.68, - "grad_norm": 21.375, - "learning_rate": 5.5428213811993145e-06, - "loss": 4.4931, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.3481, "step": 6120 }, { "epoch": 0.68, - "grad_norm": 20.0, - "learning_rate": 5.525409920225053e-06, - "loss": 4.487, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.3043, "step": 6125 }, { "epoch": 0.68, - "grad_norm": 22.25, - "learning_rate": 5.508015403821331e-06, - "loss": 4.4575, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.2965, "step": 6130 }, { "epoch": 0.68, - "grad_norm": 18.75, - "learning_rate": 5.4906378978584486e-06, - "loss": 4.491, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2599, "step": 6135 }, { "epoch": 0.68, - "grad_norm": 19.375, - "learning_rate": 5.473277468142295e-06, - "loss": 4.4687, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.2636, "step": 6140 }, { "epoch": 0.69, - "grad_norm": 22.75, - "learning_rate": 5.455934180414089e-06, - "loss": 4.4652, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.3423, "step": 6145 }, { "epoch": 0.69, - "grad_norm": 26.5, - "learning_rate": 5.438608100350149e-06, - "loss": 4.501, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2692, "step": 6150 }, { "epoch": 0.69, - "grad_norm": 22.875, - "learning_rate": 5.421299293561616e-06, - "loss": 4.3324, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2963, "step": 6155 }, { "epoch": 0.69, - "grad_norm": 25.25, - "learning_rate": 5.404007825594222e-06, - "loss": 4.4516, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2212, "step": 6160 }, { "epoch": 0.69, - "grad_norm": 25.625, - "learning_rate": 5.386733761928052e-06, - "loss": 4.4855, + "grad_norm": 0.296875, + "learning_rate": 0.001, + "loss": 2.3136, "step": 6165 }, { "epoch": 0.69, - "grad_norm": 23.125, - "learning_rate": 5.369477167977271e-06, - "loss": 4.4456, + "grad_norm": 0.34765625, + "learning_rate": 0.001, + "loss": 2.3659, "step": 6170 }, { "epoch": 0.69, - "grad_norm": 20.375, - "learning_rate": 5.352238109089889e-06, - "loss": 4.5596, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.2902, "step": 6175 }, { "epoch": 0.69, - "grad_norm": 22.875, - "learning_rate": 5.335016650547518e-06, - "loss": 4.4768, + "grad_norm": 0.294921875, + "learning_rate": 0.001, + "loss": 2.3175, "step": 6180 }, { "epoch": 0.69, - "grad_norm": 24.0, - "learning_rate": 5.317812857565123e-06, - "loss": 4.5208, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.3527, "step": 6185 }, { "epoch": 0.69, - "grad_norm": 19.0, - "learning_rate": 5.300626795290767e-06, - "loss": 4.5265, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.3352, "step": 6190 }, { "epoch": 0.69, - "grad_norm": 25.125, - "learning_rate": 5.283458528805372e-06, - "loss": 4.5313, + "grad_norm": 0.458984375, + "learning_rate": 0.001, + "loss": 2.3264, "step": 6195 }, { "epoch": 0.69, - "grad_norm": 25.125, - "learning_rate": 5.266308123122463e-06, - "loss": 4.473, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.3021, "step": 6200 }, { "epoch": 0.69, - "grad_norm": 22.125, - "learning_rate": 5.249175643187945e-06, - "loss": 4.4303, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.3128, "step": 6205 }, { "epoch": 0.69, - "grad_norm": 24.25, - "learning_rate": 5.232061153879826e-06, - "loss": 4.5105, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.3982, "step": 6210 }, { "epoch": 0.69, - "grad_norm": 21.125, - "learning_rate": 5.214964720007989e-06, - "loss": 4.3693, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.3052, "step": 6215 }, { "epoch": 0.69, - "grad_norm": 21.75, - "learning_rate": 5.197886406313954e-06, - "loss": 4.5768, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.269, "step": 6220 }, { "epoch": 0.69, - "grad_norm": 22.375, - "learning_rate": 5.18082627747061e-06, - "loss": 4.5081, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.357, "step": 6225 }, { "epoch": 0.69, - "grad_norm": 21.125, - "learning_rate": 5.16378439808199e-06, - "loss": 4.4849, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.3996, "step": 6230 }, { "epoch": 0.7, - "grad_norm": 21.625, - "learning_rate": 5.146760832683013e-06, - "loss": 4.442, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.3695, "step": 6235 }, { "epoch": 0.7, - "grad_norm": 19.75, - "learning_rate": 5.129755645739256e-06, - "loss": 4.5532, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.3206, "step": 6240 }, { "epoch": 0.7, - "grad_norm": 20.625, - "learning_rate": 5.112768901646692e-06, - "loss": 4.401, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.2325, "step": 6245 }, { "epoch": 0.7, - "grad_norm": 24.25, - "learning_rate": 5.095800664731451e-06, - "loss": 4.553, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.334, "step": 6250 }, { "epoch": 0.7, - "grad_norm": 21.875, - "learning_rate": 5.078850999249591e-06, - "loss": 4.5065, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.2761, "step": 6255 }, { "epoch": 0.7, - "grad_norm": 21.125, - "learning_rate": 5.061919969386833e-06, - "loss": 4.491, + "grad_norm": 0.34765625, + "learning_rate": 0.001, + "loss": 2.3011, "step": 6260 }, { "epoch": 0.7, - "grad_norm": 19.0, - "learning_rate": 5.045007639258329e-06, - "loss": 4.5695, + "grad_norm": 0.34765625, + "learning_rate": 0.001, + "loss": 2.3958, "step": 6265 }, { "epoch": 0.7, - "grad_norm": 20.0, - "learning_rate": 5.028114072908417e-06, - "loss": 4.5523, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.3143, "step": 6270 }, { "epoch": 0.7, - "grad_norm": 19.5, - "learning_rate": 5.011239334310387e-06, - "loss": 4.4133, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.3211, "step": 6275 }, { "epoch": 0.7, - "grad_norm": 24.125, - "learning_rate": 4.9943834873662265e-06, - "loss": 4.4795, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.2919, "step": 6280 }, { "epoch": 0.7, - "grad_norm": 22.75, - "learning_rate": 4.977546595906381e-06, - "loss": 4.5098, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.3387, "step": 6285 }, { "epoch": 0.7, - "grad_norm": 20.625, - "learning_rate": 4.960728723689511e-06, - "loss": 4.4876, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.3049, "step": 6290 }, { "epoch": 0.7, - "grad_norm": 22.75, - "learning_rate": 4.943929934402271e-06, - "loss": 4.5146, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.3557, "step": 6295 }, { "epoch": 0.7, - "grad_norm": 21.875, - "learning_rate": 4.927150291659034e-06, - "loss": 4.3766, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.3384, "step": 6300 }, { "epoch": 0.7, - "grad_norm": 21.75, - "learning_rate": 4.9103898590016705e-06, - "loss": 4.5283, + "grad_norm": 0.30078125, + "learning_rate": 0.001, + "loss": 2.3372, "step": 6305 }, { "epoch": 0.7, - "grad_norm": 20.0, - "learning_rate": 4.893648699899318e-06, - "loss": 4.4735, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.2197, "step": 6310 }, { "epoch": 0.7, - "grad_norm": 23.625, - "learning_rate": 4.876926877748114e-06, - "loss": 4.4419, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.2533, "step": 6315 }, { "epoch": 0.7, - "grad_norm": 25.125, - "learning_rate": 4.860224455870977e-06, - "loss": 4.4927, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.2072, "step": 6320 }, { "epoch": 0.71, - "grad_norm": 24.25, - "learning_rate": 4.843541497517351e-06, - "loss": 4.4993, + "grad_norm": 0.376953125, + "learning_rate": 0.001, + "loss": 2.2987, "step": 6325 }, { "epoch": 0.71, - "grad_norm": 24.125, - "learning_rate": 4.8268780658629934e-06, - "loss": 4.4471, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.2634, "step": 6330 }, { "epoch": 0.71, - "grad_norm": 28.125, - "learning_rate": 4.8102342240096974e-06, - "loss": 4.5003, + "grad_norm": 0.353515625, + "learning_rate": 0.001, + "loss": 2.3597, "step": 6335 }, { "epoch": 0.71, - "grad_norm": 20.75, - "learning_rate": 4.793610034985083e-06, - "loss": 4.5518, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2872, "step": 6340 }, { "epoch": 0.71, - "grad_norm": 21.125, - "learning_rate": 4.777005561742342e-06, - "loss": 4.4247, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.3196, "step": 6345 }, { "epoch": 0.71, - "grad_norm": 26.75, - "learning_rate": 4.760420867160015e-06, - "loss": 4.4752, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.2671, "step": 6350 }, { "epoch": 0.71, - "grad_norm": 25.0, - "learning_rate": 4.743856014041735e-06, - "loss": 4.4177, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2474, "step": 6355 }, { "epoch": 0.71, - "grad_norm": 20.625, - "learning_rate": 4.727311065115999e-06, - "loss": 4.4572, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.334, "step": 6360 }, { "epoch": 0.71, - "grad_norm": 26.375, - "learning_rate": 4.71078608303594e-06, - "loss": 4.4303, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.3749, "step": 6365 }, { "epoch": 0.71, - "grad_norm": 19.25, - "learning_rate": 4.694281130379067e-06, - "loss": 4.4585, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2718, "step": 6370 }, { "epoch": 0.71, - "grad_norm": 23.875, - "learning_rate": 4.677796269647046e-06, - "loss": 4.4785, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.3395, "step": 6375 }, { "epoch": 0.71, - "grad_norm": 27.125, - "learning_rate": 4.661331563265455e-06, - "loss": 4.4844, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.3243, "step": 6380 }, { "epoch": 0.71, - "grad_norm": 25.625, - "learning_rate": 4.6448870735835585e-06, - "loss": 4.4885, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.3901, "step": 6385 }, { "epoch": 0.71, - "grad_norm": 27.625, - "learning_rate": 4.62846286287406e-06, - "loss": 4.4087, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.2454, "step": 6390 }, { "epoch": 0.71, - "grad_norm": 19.0, - "learning_rate": 4.612058993332866e-06, - "loss": 4.5189, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.287, "step": 6395 }, { "epoch": 0.71, - "grad_norm": 22.375, - "learning_rate": 4.595675527078853e-06, - "loss": 4.5308, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.283, "step": 6400 }, { "epoch": 0.71, - "grad_norm": 22.25, - "learning_rate": 4.579312526153644e-06, - "loss": 4.5792, + "grad_norm": 0.298828125, + "learning_rate": 0.001, + "loss": 2.3251, "step": 6405 }, { "epoch": 0.71, - "grad_norm": 23.5, - "learning_rate": 4.5629700525213526e-06, - "loss": 4.5423, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.3293, "step": 6410 }, { "epoch": 0.72, - "grad_norm": 22.0, - "learning_rate": 4.5466481680683615e-06, - "loss": 4.5317, + "grad_norm": 0.353515625, + "learning_rate": 0.001, + "loss": 2.2905, "step": 6415 }, { "epoch": 0.72, - "grad_norm": 19.75, - "learning_rate": 4.530346934603084e-06, - "loss": 4.4421, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.2483, "step": 6420 }, { "epoch": 0.72, - "grad_norm": 18.75, - "learning_rate": 4.5140664138557385e-06, - "loss": 4.4592, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.3314, "step": 6425 }, { "epoch": 0.72, - "grad_norm": 21.875, - "learning_rate": 4.497806667478102e-06, - "loss": 4.4956, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.3804, "step": 6430 }, { "epoch": 0.72, - "grad_norm": 20.75, - "learning_rate": 4.481567757043279e-06, - "loss": 4.5115, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.3254, "step": 6435 }, { "epoch": 0.72, - "grad_norm": 22.5, - "learning_rate": 4.465349744045474e-06, - "loss": 4.4632, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.3487, "step": 6440 }, { "epoch": 0.72, - "grad_norm": 22.375, - "learning_rate": 4.449152689899765e-06, - "loss": 4.513, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.3102, "step": 6445 }, { "epoch": 0.72, - "grad_norm": 26.125, - "learning_rate": 4.432976655941849e-06, - "loss": 4.5246, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.2659, "step": 6450 }, { "epoch": 0.72, - "grad_norm": 21.5, - "learning_rate": 4.416821703427828e-06, - "loss": 4.483, + "grad_norm": 0.30078125, + "learning_rate": 0.001, + "loss": 2.3119, "step": 6455 }, { "epoch": 0.72, - "grad_norm": 24.125, - "learning_rate": 4.400687893533977e-06, - "loss": 4.5556, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.3058, "step": 6460 }, { "epoch": 0.72, - "grad_norm": 20.25, - "learning_rate": 4.384575287356499e-06, - "loss": 4.5503, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2881, "step": 6465 }, { "epoch": 0.72, - "grad_norm": 23.25, - "learning_rate": 4.368483945911307e-06, - "loss": 4.4172, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.2166, "step": 6470 }, { "epoch": 0.72, - "grad_norm": 24.375, - "learning_rate": 4.352413930133781e-06, - "loss": 4.4983, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.3242, "step": 6475 }, { "epoch": 0.72, - "grad_norm": 21.0, - "learning_rate": 4.3363653008785566e-06, - "loss": 4.5063, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.3057, "step": 6480 }, { "epoch": 0.72, - "grad_norm": 20.75, - "learning_rate": 4.3203381189192725e-06, - "loss": 4.4832, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.2805, "step": 6485 }, { "epoch": 0.72, - "grad_norm": 25.375, - "learning_rate": 4.304332444948347e-06, - "loss": 4.5694, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.2795, "step": 6490 }, { "epoch": 0.72, - "grad_norm": 27.75, - "learning_rate": 4.2883483395767644e-06, - "loss": 4.5368, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.2733, "step": 6495 }, { "epoch": 0.72, - "grad_norm": 19.375, - "learning_rate": 4.272385863333819e-06, - "loss": 4.482, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.2482, "step": 6500 }, { "epoch": 0.73, - "grad_norm": 18.75, - "learning_rate": 4.256445076666903e-06, - "loss": 4.4403, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2718, "step": 6505 }, { "epoch": 0.73, - "grad_norm": 29.0, - "learning_rate": 4.2405260399412726e-06, - "loss": 4.4659, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2996, "step": 6510 }, { "epoch": 0.73, - "grad_norm": 21.25, - "learning_rate": 4.224628813439828e-06, - "loss": 4.5126, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.2118, "step": 6515 }, { "epoch": 0.73, - "grad_norm": 19.125, - "learning_rate": 4.20875345736287e-06, - "loss": 4.5624, + "grad_norm": 0.28515625, + "learning_rate": 0.001, + "loss": 2.3748, "step": 6520 }, { "epoch": 0.73, - "grad_norm": 23.5, - "learning_rate": 4.192900031827878e-06, - "loss": 4.5243, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.3675, "step": 6525 }, { "epoch": 0.73, - "grad_norm": 22.625, - "learning_rate": 4.1770685968692856e-06, - "loss": 4.4891, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.2712, "step": 6530 }, { "epoch": 0.73, - "grad_norm": 23.5, - "learning_rate": 4.161259212438259e-06, - "loss": 4.5518, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.2544, "step": 6535 }, { "epoch": 0.73, - "grad_norm": 22.625, - "learning_rate": 4.145471938402453e-06, - "loss": 4.4735, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.3075, "step": 6540 }, { "epoch": 0.73, - "grad_norm": 20.625, - "learning_rate": 4.129706834545795e-06, - "loss": 4.4265, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.299, "step": 6545 }, { "epoch": 0.73, - "grad_norm": 23.375, - "learning_rate": 4.113963960568265e-06, - "loss": 4.5174, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.327, "step": 6550 }, { "epoch": 0.73, - "grad_norm": 22.875, - "learning_rate": 4.0982433760856535e-06, - "loss": 4.4684, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.3941, "step": 6555 }, { "epoch": 0.73, - "grad_norm": 21.625, - "learning_rate": 4.082545140629347e-06, - "loss": 4.48, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.2842, "step": 6560 }, { "epoch": 0.73, - "grad_norm": 21.125, - "learning_rate": 4.066869313646097e-06, - "loss": 4.4201, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.3262, "step": 6565 }, { "epoch": 0.73, - "grad_norm": 24.375, - "learning_rate": 4.051215954497808e-06, - "loss": 4.4189, + "grad_norm": 0.359375, + "learning_rate": 0.001, + "loss": 2.3961, "step": 6570 }, { "epoch": 0.73, - "grad_norm": 23.125, - "learning_rate": 4.035585122461292e-06, - "loss": 4.4098, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.3889, "step": 6575 }, { "epoch": 0.73, - "grad_norm": 22.5, - "learning_rate": 4.019976876728055e-06, - "loss": 4.5393, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.3397, "step": 6580 }, { "epoch": 0.73, - "grad_norm": 20.625, - "learning_rate": 4.004391276404072e-06, - "loss": 4.5088, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.2399, "step": 6585 }, { "epoch": 0.73, - "grad_norm": 19.25, - "learning_rate": 3.988828380509575e-06, - "loss": 4.4184, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.3182, "step": 6590 }, { "epoch": 0.74, - "grad_norm": 20.25, - "learning_rate": 3.9732882479788024e-06, - "loss": 4.5726, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.3264, "step": 6595 }, { "epoch": 0.74, - "grad_norm": 21.25, - "learning_rate": 3.9577709376597975e-06, - "loss": 4.4135, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.3071, "step": 6600 }, { "epoch": 0.74, - "grad_norm": 22.5, - "learning_rate": 3.9422765083141826e-06, - "loss": 4.4714, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.4024, "step": 6605 }, { "epoch": 0.74, - "grad_norm": 24.625, - "learning_rate": 3.926805018616929e-06, - "loss": 4.4191, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.2123, "step": 6610 }, { "epoch": 0.74, - "grad_norm": 16.0, - "learning_rate": 3.911356527156142e-06, - "loss": 4.4731, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.2309, "step": 6615 }, { "epoch": 0.74, - "grad_norm": 20.875, - "learning_rate": 3.895931092432829e-06, - "loss": 4.4611, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.1942, "step": 6620 }, { "epoch": 0.74, - "grad_norm": 18.5, - "learning_rate": 3.880528772860699e-06, - "loss": 4.5291, + "grad_norm": 0.298828125, + "learning_rate": 0.001, + "loss": 2.2871, "step": 6625 }, { "epoch": 0.74, - "grad_norm": 23.875, - "learning_rate": 3.865149626765916e-06, - "loss": 4.4931, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.3449, "step": 6630 }, { "epoch": 0.74, - "grad_norm": 20.875, - "learning_rate": 3.849793712386893e-06, - "loss": 4.5322, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.3352, "step": 6635 }, { "epoch": 0.74, - "grad_norm": 24.25, - "learning_rate": 3.8344610878740654e-06, - "loss": 4.4064, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.3952, "step": 6640 }, { "epoch": 0.74, - "grad_norm": 17.875, - "learning_rate": 3.819151811289682e-06, - "loss": 4.535, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.2695, "step": 6645 }, { "epoch": 0.74, - "grad_norm": 22.5, - "learning_rate": 3.8038659406075685e-06, - "loss": 4.5857, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.3078, "step": 6650 }, { "epoch": 0.74, - "grad_norm": 26.875, - "learning_rate": 3.7886035337129146e-06, - "loss": 4.5199, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.3105, "step": 6655 }, { "epoch": 0.74, - "grad_norm": 23.75, - "learning_rate": 3.773364648402068e-06, - "loss": 4.4622, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.3317, "step": 6660 }, { "epoch": 0.74, - "grad_norm": 20.125, - "learning_rate": 3.7581493423822936e-06, - "loss": 4.604, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.3116, "step": 6665 }, { "epoch": 0.74, - "grad_norm": 22.625, - "learning_rate": 3.7429576732715665e-06, - "loss": 4.5318, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.2339, "step": 6670 }, { "epoch": 0.74, - "grad_norm": 21.625, - "learning_rate": 3.7277896985983507e-06, - "loss": 4.5349, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.3095, "step": 6675 }, { "epoch": 0.74, - "grad_norm": 19.875, - "learning_rate": 3.712645475801394e-06, - "loss": 4.4545, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.239, "step": 6680 }, { "epoch": 0.75, - "grad_norm": 18.625, - "learning_rate": 3.6975250622294877e-06, - "loss": 4.4684, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.4118, "step": 6685 }, { "epoch": 0.75, - "grad_norm": 23.25, - "learning_rate": 3.682428515141263e-06, - "loss": 4.4629, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.2235, "step": 6690 }, { "epoch": 0.75, - "grad_norm": 23.75, - "learning_rate": 3.66735589170498e-06, - "loss": 4.4676, + "grad_norm": 0.333984375, + "learning_rate": 0.001, + "loss": 2.3407, "step": 6695 }, { "epoch": 0.75, - "grad_norm": 22.75, - "learning_rate": 3.6523072489982967e-06, - "loss": 4.4907, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.2263, "step": 6700 }, { "epoch": 0.75, - "grad_norm": 20.5, - "learning_rate": 3.637282644008062e-06, - "loss": 4.484, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.2005, "step": 6705 }, { "epoch": 0.75, - "grad_norm": 18.875, - "learning_rate": 3.622282133630094e-06, - "loss": 4.4688, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.3457, "step": 6710 }, { "epoch": 0.75, - "grad_norm": 24.375, - "learning_rate": 3.607305774668978e-06, - "loss": 4.5047, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.1686, "step": 6715 }, { "epoch": 0.75, - "grad_norm": 20.0, - "learning_rate": 3.592353623837831e-06, - "loss": 4.4861, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.2701, "step": 6720 }, { "epoch": 0.75, - "grad_norm": 17.875, - "learning_rate": 3.5774257377581046e-06, - "loss": 4.4906, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.2268, "step": 6725 }, { "epoch": 0.75, - "grad_norm": 18.125, - "learning_rate": 3.562522172959355e-06, - "loss": 4.4364, + "grad_norm": 0.369140625, + "learning_rate": 0.001, + "loss": 2.232, "step": 6730 }, { "epoch": 0.75, - "grad_norm": 27.875, - "learning_rate": 3.547642985879053e-06, - "loss": 4.4879, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.2332, "step": 6735 }, { "epoch": 0.75, - "grad_norm": 20.75, - "learning_rate": 3.532788232862341e-06, - "loss": 4.4439, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.3156, "step": 6740 }, { "epoch": 0.75, - "grad_norm": 24.875, - "learning_rate": 3.517957970161834e-06, - "loss": 4.4923, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.2292, "step": 6745 }, { "epoch": 0.75, - "grad_norm": 20.25, - "learning_rate": 3.5031522539374174e-06, - "loss": 4.4357, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.267, "step": 6750 }, { "epoch": 0.75, - "grad_norm": 21.5, - "learning_rate": 3.4883711402560105e-06, - "loss": 4.4888, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.3458, "step": 6755 }, { "epoch": 0.75, - "grad_norm": 20.875, - "learning_rate": 3.4736146850913744e-06, - "loss": 4.5935, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.2835, "step": 6760 }, { "epoch": 0.75, - "grad_norm": 26.75, - "learning_rate": 3.4588829443238825e-06, - "loss": 4.4665, + "grad_norm": 0.296875, + "learning_rate": 0.001, + "loss": 2.2314, "step": 6765 }, { "epoch": 0.75, - "grad_norm": 23.625, - "learning_rate": 3.4441759737403356e-06, - "loss": 4.4758, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.2215, "step": 6770 }, { "epoch": 0.76, - "grad_norm": 24.0, - "learning_rate": 3.4294938290337177e-06, - "loss": 4.4964, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.3393, "step": 6775 }, { "epoch": 0.76, - "grad_norm": 20.0, - "learning_rate": 3.414836565803009e-06, - "loss": 4.4898, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.2511, "step": 6780 }, { "epoch": 0.76, - "grad_norm": 21.125, - "learning_rate": 3.400204239552961e-06, - "loss": 4.4643, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.2021, "step": 6785 }, { "epoch": 0.76, - "grad_norm": 20.625, - "learning_rate": 3.385596905693904e-06, - "loss": 4.41, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2558, "step": 6790 }, { "epoch": 0.76, - "grad_norm": 23.875, - "learning_rate": 3.3710146195415173e-06, - "loss": 4.425, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2729, "step": 6795 }, { "epoch": 0.76, - "grad_norm": 19.125, - "learning_rate": 3.356457436316625e-06, - "loss": 4.4517, + "grad_norm": 0.291015625, + "learning_rate": 0.001, + "loss": 2.2082, "step": 6800 }, { "epoch": 0.76, - "grad_norm": 22.875, - "learning_rate": 3.341925411145004e-06, - "loss": 4.4127, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.2801, "step": 6805 }, { "epoch": 0.76, - "grad_norm": 24.125, - "learning_rate": 3.3274185990571505e-06, - "loss": 4.5345, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.312, "step": 6810 }, { "epoch": 0.76, - "grad_norm": 22.5, - "learning_rate": 3.3129370549880834e-06, - "loss": 4.4351, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.2518, "step": 6815 }, { "epoch": 0.76, - "grad_norm": 18.625, - "learning_rate": 3.298480833777138e-06, - "loss": 4.4944, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.2459, "step": 6820 }, { "epoch": 0.76, - "grad_norm": 22.375, - "learning_rate": 3.284049990167759e-06, - "loss": 4.5076, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.1997, "step": 6825 }, { "epoch": 0.76, - "grad_norm": 22.375, - "learning_rate": 3.269644578807286e-06, - "loss": 4.3947, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.3033, "step": 6830 }, { "epoch": 0.76, - "grad_norm": 18.125, - "learning_rate": 3.2552646542467457e-06, - "loss": 4.4459, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.2142, "step": 6835 }, { "epoch": 0.76, - "grad_norm": 21.125, - "learning_rate": 3.240910270940666e-06, - "loss": 4.4681, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.3164, "step": 6840 }, { "epoch": 0.76, - "grad_norm": 18.0, - "learning_rate": 3.2265814832468377e-06, - "loss": 4.5539, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.2225, "step": 6845 }, { "epoch": 0.76, - "grad_norm": 19.5, - "learning_rate": 3.2122783454261332e-06, - "loss": 4.4743, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.3232, "step": 6850 }, { "epoch": 0.76, - "grad_norm": 19.375, - "learning_rate": 3.198000911642287e-06, - "loss": 4.5413, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.2207, "step": 6855 }, { "epoch": 0.76, - "grad_norm": 17.875, - "learning_rate": 3.183749235961708e-06, - "loss": 4.5165, + "grad_norm": 0.34375, + "learning_rate": 0.001, + "loss": 2.3717, "step": 6860 }, { "epoch": 0.77, - "grad_norm": 20.25, - "learning_rate": 3.1695233723532525e-06, - "loss": 4.5506, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.2952, "step": 6865 }, { "epoch": 0.77, - "grad_norm": 19.125, - "learning_rate": 3.1553233746880342e-06, - "loss": 4.3732, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.2302, "step": 6870 }, { "epoch": 0.77, - "grad_norm": 23.125, - "learning_rate": 3.141149296739211e-06, - "loss": 4.5, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.3487, "step": 6875 }, { "epoch": 0.77, - "grad_norm": 21.625, - "learning_rate": 3.127001192181801e-06, - "loss": 4.5359, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.2465, "step": 6880 }, { "epoch": 0.77, - "grad_norm": 23.125, - "learning_rate": 3.112879114592451e-06, - "loss": 4.4301, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.3268, "step": 6885 }, { "epoch": 0.77, - "grad_norm": 27.125, - "learning_rate": 3.09878311744925e-06, - "loss": 4.458, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2749, "step": 6890 }, { "epoch": 0.77, - "grad_norm": 28.0, - "learning_rate": 3.0847132541315348e-06, - "loss": 4.5116, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.3893, "step": 6895 }, { "epoch": 0.77, - "grad_norm": 25.125, - "learning_rate": 3.070669577919667e-06, - "loss": 4.5045, + "grad_norm": 0.298828125, + "learning_rate": 0.001, + "loss": 2.3373, "step": 6900 }, { "epoch": 0.77, - "grad_norm": 20.625, - "learning_rate": 3.0566521419948447e-06, - "loss": 4.4879, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.2797, "step": 6905 }, { "epoch": 0.77, - "grad_norm": 24.125, - "learning_rate": 3.0426609994388945e-06, - "loss": 4.5643, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.3495, "step": 6910 }, { "epoch": 0.77, - "grad_norm": 21.25, - "learning_rate": 3.0286962032340837e-06, - "loss": 4.5064, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.297, "step": 6915 }, { "epoch": 0.77, - "grad_norm": 19.625, - "learning_rate": 3.0147578062629025e-06, - "loss": 4.4771, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.2722, "step": 6920 }, { "epoch": 0.77, - "grad_norm": 20.75, - "learning_rate": 3.0008458613078727e-06, - "loss": 4.4053, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.2851, "step": 6925 }, { "epoch": 0.77, - "grad_norm": 19.75, - "learning_rate": 2.986960421051345e-06, - "loss": 4.5311, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2873, "step": 6930 }, { "epoch": 0.77, - "grad_norm": 24.75, - "learning_rate": 2.973101538075307e-06, - "loss": 4.4804, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.3203, "step": 6935 }, { "epoch": 0.77, - "grad_norm": 20.625, - "learning_rate": 2.959269264861172e-06, - "loss": 4.5412, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.3418, "step": 6940 }, { "epoch": 0.77, - "grad_norm": 20.375, - "learning_rate": 2.9454636537895852e-06, - "loss": 4.4301, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.2656, "step": 6945 }, { "epoch": 0.77, - "grad_norm": 19.25, - "learning_rate": 2.931684757140235e-06, - "loss": 4.5211, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.2917, "step": 6950 }, { "epoch": 0.78, - "grad_norm": 19.75, - "learning_rate": 2.9179326270916396e-06, - "loss": 4.4479, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.3157, "step": 6955 }, { "epoch": 0.78, - "grad_norm": 20.875, - "learning_rate": 2.904207315720956e-06, - "loss": 4.4608, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.3485, "step": 6960 }, { "epoch": 0.78, - "grad_norm": 18.625, - "learning_rate": 2.8905088750037822e-06, - "loss": 4.4637, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.3457, "step": 6965 }, { "epoch": 0.78, - "grad_norm": 17.25, - "learning_rate": 2.876837356813971e-06, - "loss": 4.5406, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.3021, "step": 6970 }, { "epoch": 0.78, - "grad_norm": 21.875, - "learning_rate": 2.863192812923411e-06, - "loss": 4.5247, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.3554, "step": 6975 }, { "epoch": 0.78, - "grad_norm": 24.125, - "learning_rate": 2.8495752950018517e-06, - "loss": 4.5231, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.214, "step": 6980 }, { "epoch": 0.78, - "grad_norm": 21.25, - "learning_rate": 2.8359848546166924e-06, - "loss": 4.5005, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.2834, "step": 6985 }, { "epoch": 0.78, - "grad_norm": 26.875, - "learning_rate": 2.822421543232805e-06, - "loss": 4.5678, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.2508, "step": 6990 }, { "epoch": 0.78, - "grad_norm": 23.25, - "learning_rate": 2.8088854122123176e-06, - "loss": 4.4723, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.3555, "step": 6995 }, { "epoch": 0.78, - "grad_norm": 19.375, - "learning_rate": 2.795376512814432e-06, - "loss": 4.4824, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.2949, "step": 7000 }, { "epoch": 0.78, - "grad_norm": 21.0, - "learning_rate": 2.781894896195236e-06, - "loss": 4.4805, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.3022, "step": 7005 }, { "epoch": 0.78, - "grad_norm": 19.75, - "learning_rate": 2.7684406134074947e-06, - "loss": 4.4576, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2868, "step": 7010 }, { "epoch": 0.78, - "grad_norm": 24.625, - "learning_rate": 2.755013715400463e-06, - "loss": 4.4967, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.3297, "step": 7015 }, { "epoch": 0.78, - "grad_norm": 21.75, - "learning_rate": 2.7416142530197008e-06, - "loss": 4.4742, + "grad_norm": 0.34765625, + "learning_rate": 0.001, + "loss": 2.257, "step": 7020 }, { "epoch": 0.78, - "grad_norm": 23.25, - "learning_rate": 2.728242277006866e-06, - "loss": 4.5676, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.2424, "step": 7025 }, { "epoch": 0.78, - "grad_norm": 22.625, - "learning_rate": 2.714897837999537e-06, - "loss": 4.4431, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2686, "step": 7030 }, { "epoch": 0.78, - "grad_norm": 21.875, - "learning_rate": 2.7015809865310104e-06, - "loss": 4.4931, + "grad_norm": 0.294921875, + "learning_rate": 0.001, + "loss": 2.2936, "step": 7035 }, { "epoch": 0.78, - "grad_norm": 25.0, - "learning_rate": 2.6882917730301085e-06, - "loss": 4.5309, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.3123, "step": 7040 }, { "epoch": 0.79, - "grad_norm": 22.625, - "learning_rate": 2.6750302478210044e-06, - "loss": 4.5005, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.2571, "step": 7045 }, { "epoch": 0.79, - "grad_norm": 22.25, - "learning_rate": 2.6617964611230098e-06, - "loss": 4.4308, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.2647, "step": 7050 }, { "epoch": 0.79, - "grad_norm": 20.625, - "learning_rate": 2.6485904630503978e-06, - "loss": 4.4298, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.304, "step": 7055 }, { "epoch": 0.79, - "grad_norm": 20.25, - "learning_rate": 2.635412303612208e-06, - "loss": 4.5509, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.2541, "step": 7060 }, { "epoch": 0.79, - "grad_norm": 19.75, - "learning_rate": 2.6222620327120707e-06, - "loss": 4.5078, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.354, "step": 7065 }, { "epoch": 0.79, - "grad_norm": 21.0, - "learning_rate": 2.6091397001479925e-06, - "loss": 4.4769, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.336, "step": 7070 }, { "epoch": 0.79, - "grad_norm": 21.0, - "learning_rate": 2.59604535561219e-06, - "loss": 4.4793, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.3124, "step": 7075 }, { "epoch": 0.79, - "grad_norm": 22.0, - "learning_rate": 2.582979048690887e-06, - "loss": 4.4936, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.3281, "step": 7080 }, { "epoch": 0.79, - "grad_norm": 22.375, - "learning_rate": 2.5699408288641435e-06, - "loss": 4.4553, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.3212, "step": 7085 }, { "epoch": 0.79, - "grad_norm": 20.25, - "learning_rate": 2.556930745505651e-06, - "loss": 4.4753, + "grad_norm": 0.3671875, + "learning_rate": 0.001, + "loss": 2.3689, "step": 7090 }, { "epoch": 0.79, - "grad_norm": 24.125, - "learning_rate": 2.543948847882549e-06, - "loss": 4.4753, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.2572, "step": 7095 }, { "epoch": 0.79, - "grad_norm": 20.75, - "learning_rate": 2.5309951851552537e-06, - "loss": 4.4836, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.335, "step": 7100 }, { "epoch": 0.79, - "grad_norm": 29.75, - "learning_rate": 2.5180698063772503e-06, - "loss": 4.5001, + "grad_norm": 0.298828125, + "learning_rate": 0.001, + "loss": 2.2974, "step": 7105 }, { "epoch": 0.79, - "grad_norm": 21.125, - "learning_rate": 2.5051727604949193e-06, - "loss": 4.4608, + "grad_norm": 0.35546875, + "learning_rate": 0.001, + "loss": 2.3526, "step": 7110 }, { "epoch": 0.79, - "grad_norm": 21.375, - "learning_rate": 2.4923040963473478e-06, - "loss": 4.4715, + "grad_norm": 0.2890625, + "learning_rate": 0.001, + "loss": 2.3439, "step": 7115 }, { "epoch": 0.79, - "grad_norm": 20.0, - "learning_rate": 2.4794638626661526e-06, - "loss": 4.4963, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.3026, "step": 7120 }, { "epoch": 0.79, - "grad_norm": 22.25, - "learning_rate": 2.4666521080752825e-06, - "loss": 4.4948, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.3288, "step": 7125 }, { "epoch": 0.79, - "grad_norm": 22.125, - "learning_rate": 2.4538688810908384e-06, - "loss": 4.6327, + "grad_norm": 0.29296875, + "learning_rate": 0.001, + "loss": 2.2952, "step": 7130 }, { "epoch": 0.8, - "grad_norm": 25.5, - "learning_rate": 2.4411142301209e-06, - "loss": 4.4742, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.2942, "step": 7135 }, { "epoch": 0.8, - "grad_norm": 19.875, - "learning_rate": 2.4283882034653273e-06, - "loss": 4.461, + "grad_norm": 0.30078125, + "learning_rate": 0.001, + "loss": 2.2384, "step": 7140 }, { "epoch": 0.8, - "grad_norm": 24.875, - "learning_rate": 2.415690849315586e-06, - "loss": 4.5774, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.2436, "step": 7145 }, { "epoch": 0.8, - "grad_norm": 19.125, - "learning_rate": 2.4030222157545614e-06, - "loss": 4.4999, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.3166, "step": 7150 }, { "epoch": 0.8, - "grad_norm": 23.375, - "learning_rate": 2.390382350756386e-06, - "loss": 4.517, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.2552, "step": 7155 }, { "epoch": 0.8, - "grad_norm": 21.25, - "learning_rate": 2.3777713021862424e-06, - "loss": 4.5016, + "grad_norm": 0.294921875, + "learning_rate": 0.001, + "loss": 2.2395, "step": 7160 }, { "epoch": 0.8, - "grad_norm": 19.875, - "learning_rate": 2.365189117800195e-06, - "loss": 4.434, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.2334, "step": 7165 }, { "epoch": 0.8, - "grad_norm": 19.625, - "learning_rate": 2.352635845244996e-06, - "loss": 4.4786, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.3261, "step": 7170 }, { "epoch": 0.8, - "grad_norm": 19.625, - "learning_rate": 2.3401115320579258e-06, - "loss": 4.5094, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.2233, "step": 7175 }, { "epoch": 0.8, - "grad_norm": 31.0, - "learning_rate": 2.3276162256665915e-06, - "loss": 4.5346, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.3239, "step": 7180 }, { "epoch": 0.8, - "grad_norm": 19.25, - "learning_rate": 2.315149973388754e-06, - "loss": 4.3837, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.208, "step": 7185 }, { "epoch": 0.8, - "grad_norm": 24.5, - "learning_rate": 2.3027128224321593e-06, - "loss": 4.5139, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.2938, "step": 7190 }, { "epoch": 0.8, - "grad_norm": 19.75, - "learning_rate": 2.2903048198943445e-06, - "loss": 4.5321, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.3055, "step": 7195 }, { "epoch": 0.8, - "grad_norm": 21.75, - "learning_rate": 2.277926012762469e-06, - "loss": 4.5049, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.2853, "step": 7200 }, { "epoch": 0.8, - "grad_norm": 21.375, - "learning_rate": 2.2655764479131305e-06, - "loss": 4.5154, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.2827, "step": 7205 }, { "epoch": 0.8, - "grad_norm": 23.375, - "learning_rate": 2.253256172112197e-06, - "loss": 4.5115, + "grad_norm": 0.390625, + "learning_rate": 0.001, + "loss": 2.234, "step": 7210 }, { "epoch": 0.8, - "grad_norm": 24.125, - "learning_rate": 2.2409652320146193e-06, - "loss": 4.526, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2621, "step": 7215 }, { "epoch": 0.8, - "grad_norm": 21.125, - "learning_rate": 2.2287036741642566e-06, - "loss": 4.5109, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.2557, "step": 7220 }, { "epoch": 0.81, - "grad_norm": 28.5, - "learning_rate": 2.2164715449937047e-06, - "loss": 4.4908, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.1625, "step": 7225 }, { "epoch": 0.81, - "grad_norm": 22.125, - "learning_rate": 2.2042688908241205e-06, - "loss": 4.479, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.2853, "step": 7230 }, { "epoch": 0.81, - "grad_norm": 25.875, - "learning_rate": 2.192095757865039e-06, - "loss": 4.5369, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.1697, "step": 7235 }, { "epoch": 0.81, - "grad_norm": 20.875, - "learning_rate": 2.179952192214202e-06, - "loss": 4.653, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.3439, "step": 7240 }, { "epoch": 0.81, - "grad_norm": 21.5, - "learning_rate": 2.1678382398573915e-06, - "loss": 4.3876, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.2568, "step": 7245 }, { "epoch": 0.81, - "grad_norm": 24.875, - "learning_rate": 2.155753946668243e-06, - "loss": 4.4264, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.2698, "step": 7250 }, { "epoch": 0.81, - "grad_norm": 25.375, - "learning_rate": 2.1436993584080788e-06, - "loss": 4.5224, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.3522, "step": 7255 }, { "epoch": 0.81, - "grad_norm": 23.75, - "learning_rate": 2.1316745207257295e-06, - "loss": 4.4777, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2786, "step": 7260 }, { "epoch": 0.81, - "grad_norm": 24.25, - "learning_rate": 2.1196794791573736e-06, - "loss": 4.5013, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.2758, "step": 7265 }, { "epoch": 0.81, - "grad_norm": 24.625, - "learning_rate": 2.1077142791263506e-06, - "loss": 4.4738, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2267, "step": 7270 }, { "epoch": 0.81, - "grad_norm": 20.5, - "learning_rate": 2.0957789659429918e-06, - "loss": 4.5301, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.2619, "step": 7275 }, { "epoch": 0.81, - "grad_norm": 25.0, - "learning_rate": 2.08387358480446e-06, - "loss": 4.4673, + "grad_norm": 0.296875, + "learning_rate": 0.001, + "loss": 2.235, "step": 7280 }, { "epoch": 0.81, - "grad_norm": 20.75, - "learning_rate": 2.0719981807945643e-06, - "loss": 4.5077, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.1946, "step": 7285 }, { "epoch": 0.81, - "grad_norm": 20.25, - "learning_rate": 2.060152798883596e-06, - "loss": 4.4822, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.3306, "step": 7290 }, { "epoch": 0.81, - "grad_norm": 22.875, - "learning_rate": 2.0483374839281534e-06, - "loss": 4.4776, + "grad_norm": 0.3515625, + "learning_rate": 0.001, + "loss": 2.3197, "step": 7295 }, { "epoch": 0.81, - "grad_norm": 21.625, - "learning_rate": 2.036552280670988e-06, - "loss": 4.5647, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.3484, "step": 7300 }, { "epoch": 0.81, - "grad_norm": 23.0, - "learning_rate": 2.0247972337408117e-06, - "loss": 4.5082, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.2644, "step": 7305 }, { "epoch": 0.82, - "grad_norm": 35.75, - "learning_rate": 2.013072387652143e-06, - "loss": 4.5021, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.3429, "step": 7310 }, { "epoch": 0.82, - "grad_norm": 24.125, - "learning_rate": 2.001377786805132e-06, - "loss": 4.4512, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.248, "step": 7315 }, { "epoch": 0.82, - "grad_norm": 22.875, - "learning_rate": 1.9897134754854008e-06, - "loss": 4.4372, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.2096, "step": 7320 }, { "epoch": 0.82, - "grad_norm": 20.875, - "learning_rate": 1.9780794978638663e-06, - "loss": 4.5109, + "grad_norm": 0.376953125, + "learning_rate": 0.001, + "loss": 2.1851, "step": 7325 }, { "epoch": 0.82, - "grad_norm": 20.5, - "learning_rate": 1.9664758979965705e-06, - "loss": 4.3441, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2935, "step": 7330 }, { "epoch": 0.82, - "grad_norm": 18.75, - "learning_rate": 1.954902719824533e-06, - "loss": 4.5016, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.3234, "step": 7335 }, { "epoch": 0.82, - "grad_norm": 21.375, - "learning_rate": 1.94336000717356e-06, - "loss": 4.4558, + "grad_norm": 0.298828125, + "learning_rate": 0.001, + "loss": 2.2555, "step": 7340 }, { "epoch": 0.82, - "grad_norm": 19.25, - "learning_rate": 1.9318478037540933e-06, - "loss": 4.3644, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.2168, "step": 7345 }, { "epoch": 0.82, - "grad_norm": 22.375, - "learning_rate": 1.9203661531610385e-06, - "loss": 4.4378, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.3627, "step": 7350 }, { "epoch": 0.82, - "grad_norm": 20.25, - "learning_rate": 1.9089150988736094e-06, - "loss": 4.5137, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2858, "step": 7355 }, { "epoch": 0.82, - "grad_norm": 18.625, - "learning_rate": 1.8974946842551511e-06, - "loss": 4.4813, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.3002, "step": 7360 }, { "epoch": 0.82, - "grad_norm": 20.875, - "learning_rate": 1.8861049525529795e-06, - "loss": 4.4115, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.2817, "step": 7365 }, { "epoch": 0.82, - "grad_norm": 19.0, - "learning_rate": 1.8747459468982188e-06, - "loss": 4.4163, + "grad_norm": 0.291015625, + "learning_rate": 0.001, + "loss": 2.2995, "step": 7370 }, { "epoch": 0.82, - "grad_norm": 21.375, - "learning_rate": 1.8634177103056483e-06, - "loss": 4.485, + "grad_norm": 0.30078125, + "learning_rate": 0.001, + "loss": 2.2896, "step": 7375 }, { "epoch": 0.82, - "grad_norm": 21.5, - "learning_rate": 1.8521202856735187e-06, - "loss": 4.4831, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.2691, "step": 7380 }, { "epoch": 0.82, - "grad_norm": 20.375, - "learning_rate": 1.8408537157834027e-06, - "loss": 4.4458, + "grad_norm": 0.298828125, + "learning_rate": 0.001, + "loss": 2.3571, "step": 7385 }, { "epoch": 0.82, - "grad_norm": 17.875, - "learning_rate": 1.8296180433000366e-06, - "loss": 4.4921, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.2753, "step": 7390 }, { "epoch": 0.82, - "grad_norm": 24.375, - "learning_rate": 1.8184133107711467e-06, - "loss": 4.5356, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.1712, "step": 7395 }, { "epoch": 0.83, - "grad_norm": 25.375, - "learning_rate": 1.8072395606272986e-06, - "loss": 4.5258, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.3137, "step": 7400 }, { "epoch": 0.83, - "grad_norm": 18.5, - "learning_rate": 1.7960968351817253e-06, - "loss": 4.4608, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.3335, "step": 7405 }, { "epoch": 0.83, - "grad_norm": 18.625, - "learning_rate": 1.7849851766301873e-06, - "loss": 4.4611, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.1866, "step": 7410 }, { "epoch": 0.83, - "grad_norm": 21.75, - "learning_rate": 1.7739046270507898e-06, - "loss": 4.496, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.219, "step": 7415 }, { "epoch": 0.83, - "grad_norm": 19.375, - "learning_rate": 1.7628552284038336e-06, - "loss": 4.4563, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.3086, "step": 7420 }, { "epoch": 0.83, - "grad_norm": 23.75, - "learning_rate": 1.751837022531655e-06, - "loss": 4.4397, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.2989, "step": 7425 }, { "epoch": 0.83, - "grad_norm": 18.75, - "learning_rate": 1.740850051158478e-06, - "loss": 4.4917, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.2209, "step": 7430 }, { "epoch": 0.83, - "grad_norm": 19.625, - "learning_rate": 1.7298943558902347e-06, - "loss": 4.5372, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.2487, "step": 7435 }, { "epoch": 0.83, - "grad_norm": 26.375, - "learning_rate": 1.7189699782144231e-06, - "loss": 4.4642, + "grad_norm": 0.294921875, + "learning_rate": 0.001, + "loss": 2.2269, "step": 7440 }, { "epoch": 0.83, - "grad_norm": 19.625, - "learning_rate": 1.7080769594999514e-06, - "loss": 4.4863, + "grad_norm": 0.287109375, + "learning_rate": 0.001, + "loss": 2.2807, "step": 7445 }, { "epoch": 0.83, - "grad_norm": 24.625, - "learning_rate": 1.6972153409969694e-06, - "loss": 4.4777, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.3287, "step": 7450 }, { "epoch": 0.83, - "grad_norm": 26.5, - "learning_rate": 1.6863851638367224e-06, - "loss": 4.4801, + "grad_norm": 0.30078125, + "learning_rate": 0.001, + "loss": 2.2568, "step": 7455 }, { "epoch": 0.83, - "grad_norm": 20.625, - "learning_rate": 1.6755864690313904e-06, - "loss": 4.5398, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.2662, "step": 7460 }, { "epoch": 0.83, - "grad_norm": 19.5, - "learning_rate": 1.6648192974739407e-06, - "loss": 4.5178, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.2783, "step": 7465 }, { "epoch": 0.83, - "grad_norm": 21.125, - "learning_rate": 1.6540836899379598e-06, - "loss": 4.4163, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.239, "step": 7470 }, { "epoch": 0.83, - "grad_norm": 17.375, - "learning_rate": 1.6433796870775066e-06, - "loss": 4.5243, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2978, "step": 7475 }, { "epoch": 0.83, - "grad_norm": 21.25, - "learning_rate": 1.6327073294269646e-06, - "loss": 4.5062, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.3329, "step": 7480 }, { "epoch": 0.83, - "grad_norm": 23.25, - "learning_rate": 1.622066657400877e-06, - "loss": 4.3784, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.1576, "step": 7485 }, { "epoch": 0.84, - "grad_norm": 21.25, - "learning_rate": 1.6114577112937967e-06, - "loss": 4.4241, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2521, "step": 7490 }, { "epoch": 0.84, - "grad_norm": 23.25, - "learning_rate": 1.600880531280138e-06, - "loss": 4.5093, + "grad_norm": 0.298828125, + "learning_rate": 0.001, + "loss": 2.2536, "step": 7495 }, { "epoch": 0.84, - "grad_norm": 19.375, - "learning_rate": 1.5903351574140246e-06, - "loss": 4.4151, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.2902, "step": 7500 }, { "epoch": 0.84, - "grad_norm": 21.5, - "learning_rate": 1.5798216296291323e-06, - "loss": 4.4884, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2477, "step": 7505 }, { "epoch": 0.84, - "grad_norm": 24.5, - "learning_rate": 1.5693399877385408e-06, - "loss": 4.5383, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.2292, "step": 7510 }, { "epoch": 0.84, - "grad_norm": 24.25, - "learning_rate": 1.558890271434579e-06, - "loss": 4.5612, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.2579, "step": 7515 }, { "epoch": 0.84, - "grad_norm": 32.75, - "learning_rate": 1.5484725202886897e-06, - "loss": 4.5016, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.2531, "step": 7520 }, { "epoch": 0.84, - "grad_norm": 23.875, - "learning_rate": 1.5380867737512595e-06, - "loss": 4.4858, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.3611, "step": 7525 }, { "epoch": 0.84, - "grad_norm": 19.0, - "learning_rate": 1.527733071151477e-06, - "loss": 4.5164, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2527, "step": 7530 }, { "epoch": 0.84, - "grad_norm": 24.125, - "learning_rate": 1.517411451697195e-06, - "loss": 4.425, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.3296, "step": 7535 }, { "epoch": 0.84, - "grad_norm": 20.625, - "learning_rate": 1.507121954474765e-06, - "loss": 4.3715, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.2596, "step": 7540 }, { "epoch": 0.84, - "grad_norm": 22.25, - "learning_rate": 1.4968646184488978e-06, - "loss": 4.4647, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.2396, "step": 7545 }, { "epoch": 0.84, - "grad_norm": 22.5, - "learning_rate": 1.486639482462513e-06, - "loss": 4.4822, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.2573, "step": 7550 }, { "epoch": 0.84, - "grad_norm": 20.75, - "learning_rate": 1.4764465852366016e-06, - "loss": 4.5358, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.3043, "step": 7555 }, { "epoch": 0.84, - "grad_norm": 23.5, - "learning_rate": 1.4662859653700622e-06, - "loss": 4.4391, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2121, "step": 7560 }, { "epoch": 0.84, - "grad_norm": 20.5, - "learning_rate": 1.45615766133957e-06, - "loss": 4.4732, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.3462, "step": 7565 }, { "epoch": 0.84, - "grad_norm": 19.25, - "learning_rate": 1.4460617114994203e-06, - "loss": 4.4062, + "grad_norm": 0.287109375, + "learning_rate": 0.001, + "loss": 2.3078, "step": 7570 }, { "epoch": 0.84, - "grad_norm": 19.5, - "learning_rate": 1.435998154081394e-06, - "loss": 4.4574, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.316, "step": 7575 }, { "epoch": 0.85, - "grad_norm": 22.875, - "learning_rate": 1.4259670271946036e-06, - "loss": 4.5023, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.292, "step": 7580 }, { "epoch": 0.85, - "grad_norm": 19.25, - "learning_rate": 1.415968368825349e-06, - "loss": 4.4914, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.3084, "step": 7585 }, { "epoch": 0.85, - "grad_norm": 22.625, - "learning_rate": 1.4060022168369869e-06, - "loss": 4.5358, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.235, "step": 7590 }, { "epoch": 0.85, - "grad_norm": 24.75, - "learning_rate": 1.3960686089697673e-06, - "loss": 4.5022, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2572, "step": 7595 }, { "epoch": 0.85, - "grad_norm": 25.875, - "learning_rate": 1.386167582840704e-06, - "loss": 4.4937, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2612, "step": 7600 }, { "epoch": 0.85, - "grad_norm": 20.875, - "learning_rate": 1.3762991759434297e-06, - "loss": 4.4661, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.2995, "step": 7605 }, { "epoch": 0.85, - "grad_norm": 17.875, - "learning_rate": 1.3664634256480547e-06, - "loss": 4.5035, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2694, "step": 7610 }, { "epoch": 0.85, - "grad_norm": 22.875, - "learning_rate": 1.3566603692010217e-06, - "loss": 4.5133, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.2282, "step": 7615 }, { "epoch": 0.85, - "grad_norm": 24.5, - "learning_rate": 1.3468900437249666e-06, - "loss": 4.4752, + "grad_norm": 0.296875, + "learning_rate": 0.001, + "loss": 2.2096, "step": 7620 }, { "epoch": 0.85, - "grad_norm": 27.5, - "learning_rate": 1.337152486218576e-06, - "loss": 4.445, + "grad_norm": 0.30078125, + "learning_rate": 0.001, + "loss": 2.2345, "step": 7625 }, { "epoch": 0.85, - "grad_norm": 21.25, - "learning_rate": 1.3274477335564573e-06, - "loss": 4.4212, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.2535, "step": 7630 }, { "epoch": 0.85, - "grad_norm": 23.125, - "learning_rate": 1.3177758224889848e-06, - "loss": 4.5492, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.2774, "step": 7635 }, { "epoch": 0.85, - "grad_norm": 23.5, - "learning_rate": 1.3081367896421649e-06, - "loss": 4.4769, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2084, "step": 7640 }, { "epoch": 0.85, - "grad_norm": 23.375, - "learning_rate": 1.2985306715175038e-06, - "loss": 4.457, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2479, "step": 7645 }, { "epoch": 0.85, - "grad_norm": 20.0, - "learning_rate": 1.2889575044918679e-06, - "loss": 4.4974, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2153, "step": 7650 }, { "epoch": 0.85, - "grad_norm": 23.5, - "learning_rate": 1.279417324817336e-06, - "loss": 4.4553, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.2901, "step": 7655 }, { "epoch": 0.85, - "grad_norm": 23.25, - "learning_rate": 1.2699101686210701e-06, - "loss": 4.4338, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.2653, "step": 7660 }, { "epoch": 0.85, - "grad_norm": 19.125, - "learning_rate": 1.2604360719051779e-06, - "loss": 4.4529, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2731, "step": 7665 }, { "epoch": 0.86, - "grad_norm": 23.375, - "learning_rate": 1.2509950705465811e-06, - "loss": 4.5013, + "grad_norm": 0.294921875, + "learning_rate": 0.001, + "loss": 2.1943, "step": 7670 }, { "epoch": 0.86, - "grad_norm": 22.25, - "learning_rate": 1.2415872002968688e-06, - "loss": 4.4916, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.1499, "step": 7675 }, { "epoch": 0.86, - "grad_norm": 22.75, - "learning_rate": 1.232212496782167e-06, - "loss": 4.4833, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2693, "step": 7680 }, { "epoch": 0.86, - "grad_norm": 24.5, - "learning_rate": 1.2228709955030104e-06, - "loss": 4.5291, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.2261, "step": 7685 }, { "epoch": 0.86, - "grad_norm": 24.0, - "learning_rate": 1.213562731834198e-06, - "loss": 4.4749, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.1544, "step": 7690 }, { "epoch": 0.86, - "grad_norm": 19.5, - "learning_rate": 1.2042877410246634e-06, - "loss": 4.5482, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2456, "step": 7695 }, { "epoch": 0.86, - "grad_norm": 28.875, - "learning_rate": 1.195046058197339e-06, - "loss": 4.5779, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.2612, "step": 7700 }, { "epoch": 0.86, - "grad_norm": 25.0, - "learning_rate": 1.1858377183490311e-06, - "loss": 4.4704, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2848, "step": 7705 }, { "epoch": 0.86, - "grad_norm": 24.375, - "learning_rate": 1.1766627563502774e-06, - "loss": 4.4368, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.2257, "step": 7710 }, { "epoch": 0.86, - "grad_norm": 20.375, - "learning_rate": 1.1675212069452158e-06, - "loss": 4.4507, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.2658, "step": 7715 }, { "epoch": 0.86, - "grad_norm": 22.0, - "learning_rate": 1.1584131047514646e-06, - "loss": 4.5014, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2202, "step": 7720 }, { "epoch": 0.86, - "grad_norm": 24.25, - "learning_rate": 1.1493384842599754e-06, - "loss": 4.5378, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.2599, "step": 7725 }, { "epoch": 0.86, - "grad_norm": 27.0, - "learning_rate": 1.1402973798349115e-06, - "loss": 4.5173, + "grad_norm": 0.369140625, + "learning_rate": 0.001, + "loss": 2.3019, "step": 7730 }, { "epoch": 0.86, - "grad_norm": 21.5, - "learning_rate": 1.1312898257135153e-06, - "loss": 4.4887, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.3055, "step": 7735 }, { "epoch": 0.86, - "grad_norm": 21.25, - "learning_rate": 1.1223158560059845e-06, - "loss": 4.4745, + "grad_norm": 0.296875, + "learning_rate": 0.001, + "loss": 2.1914, "step": 7740 }, { "epoch": 0.86, - "grad_norm": 21.75, - "learning_rate": 1.1133755046953332e-06, - "loss": 4.4192, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.2936, "step": 7745 }, { "epoch": 0.86, - "grad_norm": 27.375, - "learning_rate": 1.1044688056372676e-06, - "loss": 4.5045, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2759, "step": 7750 }, { "epoch": 0.86, - "grad_norm": 22.375, - "learning_rate": 1.0955957925600592e-06, - "loss": 4.5381, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.2421, "step": 7755 }, { "epoch": 0.87, - "grad_norm": 21.25, - "learning_rate": 1.0867564990644198e-06, - "loss": 4.4128, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.3079, "step": 7760 }, { "epoch": 0.87, - "grad_norm": 23.125, - "learning_rate": 1.0779509586233638e-06, - "loss": 4.5706, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2364, "step": 7765 }, { "epoch": 0.87, - "grad_norm": 20.375, - "learning_rate": 1.0691792045820904e-06, - "loss": 4.4544, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.2714, "step": 7770 }, { "epoch": 0.87, - "grad_norm": 26.5, - "learning_rate": 1.0604412701578582e-06, - "loss": 4.5622, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.2086, "step": 7775 }, { "epoch": 0.87, - "grad_norm": 25.25, - "learning_rate": 1.051737188439854e-06, - "loss": 4.499, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.2275, "step": 7780 }, { "epoch": 0.87, - "grad_norm": 22.5, - "learning_rate": 1.043066992389068e-06, - "loss": 4.5117, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.3385, "step": 7785 }, { "epoch": 0.87, - "grad_norm": 23.125, - "learning_rate": 1.0344307148381715e-06, - "loss": 4.5017, + "grad_norm": 0.291015625, + "learning_rate": 0.001, + "loss": 2.2574, "step": 7790 }, { "epoch": 0.87, - "grad_norm": 20.125, - "learning_rate": 1.0258283884913956e-06, - "loss": 4.4225, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.2743, "step": 7795 }, { "epoch": 0.87, - "grad_norm": 21.5, - "learning_rate": 1.0172600459243975e-06, - "loss": 4.4356, + "grad_norm": 0.294921875, + "learning_rate": 0.001, + "loss": 2.2457, "step": 7800 }, { "epoch": 0.87, - "grad_norm": 21.0, - "learning_rate": 1.0087257195841482e-06, - "loss": 4.5252, + "grad_norm": 0.298828125, + "learning_rate": 0.001, + "loss": 2.3624, "step": 7805 }, { "epoch": 0.87, - "grad_norm": 25.5, - "learning_rate": 1.000225441788798e-06, - "loss": 4.4089, + "grad_norm": 0.291015625, + "learning_rate": 0.001, + "loss": 2.2603, "step": 7810 }, { "epoch": 0.87, - "grad_norm": 20.5, - "learning_rate": 9.917592447275714e-07, - "loss": 4.5347, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.2317, "step": 7815 }, { "epoch": 0.87, - "grad_norm": 23.0, - "learning_rate": 9.833271604606254e-07, - "loss": 4.5475, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.2466, "step": 7820 }, { "epoch": 0.87, - "grad_norm": 21.25, - "learning_rate": 9.749292209189377e-07, - "loss": 4.5855, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.3946, "step": 7825 }, { "epoch": 0.87, - "grad_norm": 21.25, - "learning_rate": 9.665654579041917e-07, - "loss": 4.4815, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.2815, "step": 7830 }, { "epoch": 0.87, - "grad_norm": 18.5, - "learning_rate": 9.582359030886446e-07, - "loss": 4.458, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.2996, "step": 7835 }, { "epoch": 0.87, - "grad_norm": 23.5, - "learning_rate": 9.499405880150126e-07, - "loss": 4.4648, + "grad_norm": 0.30078125, + "learning_rate": 0.001, + "loss": 2.2776, "step": 7840 }, { "epoch": 0.87, - "grad_norm": 19.75, - "learning_rate": 9.4167954409635e-07, - "loss": 4.4798, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.184, "step": 7845 }, { "epoch": 0.88, - "grad_norm": 23.25, - "learning_rate": 9.334528026159384e-07, - "loss": 4.5238, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.2714, "step": 7850 }, { "epoch": 0.88, - "grad_norm": 21.125, - "learning_rate": 9.252603947271554e-07, - "loss": 4.4987, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.2512, "step": 7855 }, { "epoch": 0.88, - "grad_norm": 22.0, - "learning_rate": 9.171023514533628e-07, - "loss": 4.5459, + "grad_norm": 0.291015625, + "learning_rate": 0.001, + "loss": 2.2536, "step": 7860 }, { "epoch": 0.88, - "grad_norm": 20.25, - "learning_rate": 9.089787036877906e-07, - "loss": 4.459, + "grad_norm": 0.35546875, + "learning_rate": 0.001, + "loss": 2.1873, "step": 7865 }, { "epoch": 0.88, - "grad_norm": 22.25, - "learning_rate": 9.0088948219342e-07, - "loss": 4.4777, + "grad_norm": 0.337890625, + "learning_rate": 0.001, + "loss": 2.2324, "step": 7870 }, { "epoch": 0.88, - "grad_norm": 22.875, - "learning_rate": 8.928347176028629e-07, - "loss": 4.5559, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.3213, "step": 7875 }, { "epoch": 0.88, - "grad_norm": 24.75, - "learning_rate": 8.848144404182468e-07, - "loss": 4.4299, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2541, "step": 7880 }, { "epoch": 0.88, - "grad_norm": 21.125, - "learning_rate": 8.76828681011106e-07, - "loss": 4.4263, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.2901, "step": 7885 }, { "epoch": 0.88, - "grad_norm": 22.25, - "learning_rate": 8.688774696222579e-07, - "loss": 4.4683, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.3655, "step": 7890 }, { "epoch": 0.88, - "grad_norm": 21.875, - "learning_rate": 8.609608363616917e-07, - "loss": 4.4105, + "grad_norm": 0.291015625, + "learning_rate": 0.001, + "loss": 2.1354, "step": 7895 }, { "epoch": 0.88, - "grad_norm": 23.875, - "learning_rate": 8.53078811208452e-07, - "loss": 4.5012, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.2691, "step": 7900 }, { "epoch": 0.88, - "grad_norm": 23.125, - "learning_rate": 8.452314240105374e-07, - "loss": 4.4723, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.159, "step": 7905 }, { "epoch": 0.88, - "grad_norm": 20.75, - "learning_rate": 8.374187044847692e-07, - "loss": 4.5058, + "grad_norm": 0.291015625, + "learning_rate": 0.001, + "loss": 2.1469, "step": 7910 }, { "epoch": 0.88, - "grad_norm": 28.25, - "learning_rate": 8.296406822166892e-07, - "loss": 4.5083, + "grad_norm": 0.30078125, + "learning_rate": 0.001, + "loss": 2.2344, "step": 7915 }, { "epoch": 0.88, - "grad_norm": 23.5, - "learning_rate": 8.218973866604485e-07, - "loss": 4.4547, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.2182, "step": 7920 }, { "epoch": 0.88, - "grad_norm": 21.5, - "learning_rate": 8.141888471386939e-07, - "loss": 4.5838, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.3306, "step": 7925 }, { "epoch": 0.88, - "grad_norm": 23.75, - "learning_rate": 8.065150928424525e-07, - "loss": 4.5333, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.1549, "step": 7930 }, { "epoch": 0.88, - "grad_norm": 20.75, - "learning_rate": 7.988761528310262e-07, - "loss": 4.5734, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.3377, "step": 7935 }, { "epoch": 0.89, - "grad_norm": 25.5, - "learning_rate": 7.912720560318876e-07, - "loss": 4.4832, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2853, "step": 7940 }, { "epoch": 0.89, - "grad_norm": 18.625, - "learning_rate": 7.837028312405548e-07, - "loss": 4.425, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.2024, "step": 7945 }, { "epoch": 0.89, - "grad_norm": 26.5, - "learning_rate": 7.76168507120495e-07, - "loss": 4.4884, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.2813, "step": 7950 }, { "epoch": 0.89, - "grad_norm": 18.125, - "learning_rate": 7.686691122030087e-07, - "loss": 4.5416, + "grad_norm": 0.298828125, + "learning_rate": 0.001, + "loss": 2.3001, "step": 7955 }, { "epoch": 0.89, - "grad_norm": 24.5, - "learning_rate": 7.612046748871327e-07, - "loss": 4.5013, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.2503, "step": 7960 }, { "epoch": 0.89, - "grad_norm": 26.125, - "learning_rate": 7.537752234395179e-07, - "loss": 4.4242, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.1995, "step": 7965 }, { "epoch": 0.89, - "grad_norm": 21.625, - "learning_rate": 7.463807859943284e-07, - "loss": 4.5317, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.2804, "step": 7970 }, { "epoch": 0.89, - "grad_norm": 20.75, - "learning_rate": 7.390213905531429e-07, - "loss": 4.5134, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2591, "step": 7975 }, { "epoch": 0.89, - "grad_norm": 21.25, - "learning_rate": 7.316970649848354e-07, - "loss": 4.4657, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.2143, "step": 7980 }, { "epoch": 0.89, - "grad_norm": 18.25, - "learning_rate": 7.24407837025477e-07, - "loss": 4.4879, + "grad_norm": 0.298828125, + "learning_rate": 0.001, + "loss": 2.2275, "step": 7985 }, { "epoch": 0.89, - "grad_norm": 25.875, - "learning_rate": 7.171537342782298e-07, - "loss": 4.5792, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.3919, "step": 7990 }, { "epoch": 0.89, - "grad_norm": 20.0, - "learning_rate": 7.099347842132454e-07, - "loss": 4.4961, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.309, "step": 7995 }, { "epoch": 0.89, - "grad_norm": 17.875, - "learning_rate": 7.027510141675519e-07, - "loss": 4.5659, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.2363, "step": 8000 }, { "epoch": 0.89, - "grad_norm": 23.5, - "learning_rate": 6.956024513449611e-07, - "loss": 4.4595, + "grad_norm": 0.298828125, + "learning_rate": 0.001, + "loss": 2.3152, "step": 8005 }, { "epoch": 0.89, - "grad_norm": 22.75, - "learning_rate": 6.884891228159574e-07, - "loss": 4.4404, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.2732, "step": 8010 }, { "epoch": 0.89, - "grad_norm": 25.25, - "learning_rate": 6.814110555176012e-07, - "loss": 4.4572, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.2561, "step": 8015 }, { "epoch": 0.89, - "grad_norm": 18.0, - "learning_rate": 6.743682762534221e-07, - "loss": 4.5458, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.2615, "step": 8020 }, { "epoch": 0.89, - "grad_norm": 19.5, - "learning_rate": 6.673608116933194e-07, - "loss": 4.4693, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.2131, "step": 8025 }, { "epoch": 0.9, - "grad_norm": 20.625, - "learning_rate": 6.603886883734634e-07, - "loss": 4.5108, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.2894, "step": 8030 }, { "epoch": 0.9, - "grad_norm": 18.125, - "learning_rate": 6.534519326961908e-07, - "loss": 4.5203, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.3007, "step": 8035 }, { "epoch": 0.9, - "grad_norm": 18.875, - "learning_rate": 6.465505709299058e-07, - "loss": 4.4751, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2376, "step": 8040 }, { "epoch": 0.9, - "grad_norm": 22.375, - "learning_rate": 6.396846292089831e-07, - "loss": 4.5218, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.3124, "step": 8045 }, { "epoch": 0.9, - "grad_norm": 20.5, - "learning_rate": 6.328541335336668e-07, - "loss": 4.504, + "grad_norm": 0.30078125, + "learning_rate": 0.001, + "loss": 2.1713, "step": 8050 }, { "epoch": 0.9, - "grad_norm": 22.375, - "learning_rate": 6.260591097699731e-07, - "loss": 4.4888, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.3618, "step": 8055 }, { "epoch": 0.9, - "grad_norm": 21.625, - "learning_rate": 6.192995836495885e-07, - "loss": 4.3751, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.3386, "step": 8060 }, { "epoch": 0.9, - "grad_norm": 24.25, - "learning_rate": 6.125755807697809e-07, - "loss": 4.5255, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.2925, "step": 8065 }, { "epoch": 0.9, - "grad_norm": 22.0, - "learning_rate": 6.058871265932919e-07, - "loss": 4.4923, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.2123, "step": 8070 }, { "epoch": 0.9, - "grad_norm": 22.125, - "learning_rate": 5.992342464482481e-07, - "loss": 4.4805, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.2016, "step": 8075 }, { "epoch": 0.9, - "grad_norm": 23.625, - "learning_rate": 5.926169655280634e-07, - "loss": 4.4458, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.2361, "step": 8080 }, { "epoch": 0.9, - "grad_norm": 21.5, - "learning_rate": 5.860353088913429e-07, - "loss": 4.4807, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.2677, "step": 8085 }, { "epoch": 0.9, - "grad_norm": 20.75, - "learning_rate": 5.794893014617864e-07, - "loss": 4.5536, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.2364, "step": 8090 }, { "epoch": 0.9, - "grad_norm": 22.75, - "learning_rate": 5.729789680280972e-07, - "loss": 4.4121, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.2845, "step": 8095 }, { "epoch": 0.9, - "grad_norm": 23.75, - "learning_rate": 5.665043332438869e-07, - "loss": 4.4611, + "grad_norm": 0.291015625, + "learning_rate": 0.001, + "loss": 2.2399, "step": 8100 }, { "epoch": 0.9, - "grad_norm": 21.125, - "learning_rate": 5.600654216275814e-07, - "loss": 4.394, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.2638, "step": 8105 }, { "epoch": 0.9, - "grad_norm": 20.5, - "learning_rate": 5.536622575623285e-07, - "loss": 4.4729, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.2932, "step": 8110 }, { "epoch": 0.9, - "grad_norm": 25.25, - "learning_rate": 5.472948652959042e-07, - "loss": 4.4995, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.2401, "step": 8115 }, { "epoch": 0.91, - "grad_norm": 23.5, - "learning_rate": 5.409632689406285e-07, - "loss": 4.5305, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.2824, "step": 8120 }, { "epoch": 0.91, - "grad_norm": 23.75, - "learning_rate": 5.346674924732587e-07, - "loss": 4.4765, + "grad_norm": 0.35546875, + "learning_rate": 0.001, + "loss": 2.2077, "step": 8125 }, { "epoch": 0.91, - "grad_norm": 22.5, - "learning_rate": 5.284075597349148e-07, - "loss": 4.4889, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.2046, "step": 8130 }, { "epoch": 0.91, - "grad_norm": 24.125, - "learning_rate": 5.221834944309778e-07, - "loss": 4.508, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.1893, "step": 8135 }, { "epoch": 0.91, - "grad_norm": 19.125, - "learning_rate": 5.159953201310097e-07, - "loss": 4.4668, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2769, "step": 8140 }, { "epoch": 0.91, - "grad_norm": 22.875, - "learning_rate": 5.098430602686532e-07, - "loss": 4.4681, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.217, "step": 8145 }, { "epoch": 0.91, - "grad_norm": 25.625, - "learning_rate": 5.037267381415523e-07, - "loss": 4.4499, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.2389, "step": 8150 }, { "epoch": 0.91, - "grad_norm": 20.0, - "learning_rate": 4.97646376911256e-07, - "loss": 4.4883, + "grad_norm": 0.291015625, + "learning_rate": 0.001, + "loss": 2.266, "step": 8155 }, { "epoch": 0.91, - "grad_norm": 20.375, - "learning_rate": 4.91601999603143e-07, - "loss": 4.457, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.2453, "step": 8160 }, { "epoch": 0.91, - "grad_norm": 19.375, - "learning_rate": 4.85593629106319e-07, - "loss": 4.5321, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.281, "step": 8165 }, { "epoch": 0.91, - "grad_norm": 22.125, - "learning_rate": 4.796212881735395e-07, - "loss": 4.5107, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.2076, "step": 8170 }, { "epoch": 0.91, - "grad_norm": 21.375, - "learning_rate": 4.736849994211246e-07, - "loss": 4.4256, + "grad_norm": 0.298828125, + "learning_rate": 0.001, + "loss": 2.1804, "step": 8175 }, { "epoch": 0.91, - "grad_norm": 29.125, - "learning_rate": 4.677847853288675e-07, - "loss": 4.4432, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.2309, "step": 8180 }, { "epoch": 0.91, - "grad_norm": 26.125, - "learning_rate": 4.619206682399546e-07, - "loss": 4.5233, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.2703, "step": 8185 }, { "epoch": 0.91, - "grad_norm": 23.125, - "learning_rate": 4.560926703608737e-07, - "loss": 4.4465, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.3598, "step": 8190 }, { "epoch": 0.91, - "grad_norm": 20.875, - "learning_rate": 4.5030081376134073e-07, - "loss": 4.4984, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.2038, "step": 8195 }, { "epoch": 0.91, - "grad_norm": 23.125, - "learning_rate": 4.4454512037420907e-07, - "loss": 4.4737, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.2529, "step": 8200 }, { "epoch": 0.91, - "grad_norm": 23.0, - "learning_rate": 4.3882561199538376e-07, - "loss": 4.5292, + "grad_norm": 0.30078125, + "learning_rate": 0.001, + "loss": 2.2816, "step": 8205 }, { "epoch": 0.92, - "grad_norm": 27.0, - "learning_rate": 4.331423102837462e-07, - "loss": 4.4744, + "grad_norm": 0.30078125, + "learning_rate": 0.001, + "loss": 2.238, "step": 8210 }, { "epoch": 0.92, - "grad_norm": 17.625, - "learning_rate": 4.2749523676106965e-07, - "loss": 4.4291, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.2579, "step": 8215 }, { "epoch": 0.92, - "grad_norm": 20.125, - "learning_rate": 4.218844128119348e-07, - "loss": 4.5029, + "grad_norm": 0.294921875, + "learning_rate": 0.001, + "loss": 2.1992, "step": 8220 }, { "epoch": 0.92, - "grad_norm": 20.75, - "learning_rate": 4.163098596836501e-07, - "loss": 4.5178, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.1818, "step": 8225 }, { "epoch": 0.92, - "grad_norm": 20.5, - "learning_rate": 4.10771598486176e-07, - "loss": 4.5474, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.1506, "step": 8230 }, { "epoch": 0.92, - "grad_norm": 22.125, - "learning_rate": 4.052696501920372e-07, - "loss": 4.5362, + "grad_norm": 0.298828125, + "learning_rate": 0.001, + "loss": 2.2157, "step": 8235 }, { "epoch": 0.92, - "grad_norm": 20.25, - "learning_rate": 3.998040356362498e-07, - "loss": 4.4329, + "grad_norm": 0.296875, + "learning_rate": 0.001, + "loss": 2.2456, "step": 8240 }, { "epoch": 0.92, - "grad_norm": 23.5, - "learning_rate": 3.9437477551623637e-07, - "loss": 4.4357, + "grad_norm": 0.30078125, + "learning_rate": 0.001, + "loss": 2.2597, "step": 8245 }, { "epoch": 0.92, - "grad_norm": 26.75, - "learning_rate": 3.889818903917542e-07, - "loss": 4.5242, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.2447, "step": 8250 }, { "epoch": 0.92, - "grad_norm": 19.125, - "learning_rate": 3.836254006848139e-07, - "loss": 4.4828, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.1631, "step": 8255 }, { "epoch": 0.92, - "grad_norm": 30.0, - "learning_rate": 3.7830532667959993e-07, - "loss": 4.4982, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2394, "step": 8260 }, { "epoch": 0.92, - "grad_norm": 20.75, - "learning_rate": 3.730216885224003e-07, - "loss": 4.4922, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.2786, "step": 8265 }, { "epoch": 0.92, - "grad_norm": 21.375, - "learning_rate": 3.67774506221521e-07, - "loss": 4.4983, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.2592, "step": 8270 }, { "epoch": 0.92, - "grad_norm": 21.25, - "learning_rate": 3.625637996472198e-07, - "loss": 4.4771, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2017, "step": 8275 }, { "epoch": 0.92, - "grad_norm": 24.75, - "learning_rate": 3.5738958853162253e-07, - "loss": 4.5612, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.2654, "step": 8280 }, { "epoch": 0.92, - "grad_norm": 22.75, - "learning_rate": 3.522518924686569e-07, - "loss": 4.3921, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2704, "step": 8285 }, { "epoch": 0.92, - "grad_norm": 22.875, - "learning_rate": 3.471507309139732e-07, - "loss": 4.5872, + "grad_norm": 0.34375, + "learning_rate": 0.001, + "loss": 2.3055, "step": 8290 }, { "epoch": 0.92, - "grad_norm": 22.75, - "learning_rate": 3.420861231848671e-07, - "loss": 4.4813, + "grad_norm": 0.298828125, + "learning_rate": 0.001, + "loss": 2.1985, "step": 8295 }, { "epoch": 0.93, - "grad_norm": 21.25, - "learning_rate": 3.3705808846021373e-07, - "loss": 4.4333, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.259, "step": 8300 }, { "epoch": 0.93, - "grad_norm": 24.75, - "learning_rate": 3.3206664578039225e-07, - "loss": 4.4817, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.276, "step": 8305 }, { "epoch": 0.93, - "grad_norm": 18.5, - "learning_rate": 3.27111814047214e-07, - "loss": 4.4239, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.1859, "step": 8310 }, { "epoch": 0.93, - "grad_norm": 19.0, - "learning_rate": 3.221936120238478e-07, - "loss": 4.4671, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2696, "step": 8315 }, { "epoch": 0.93, - "grad_norm": 21.25, - "learning_rate": 3.173120583347522e-07, - "loss": 4.4562, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2577, "step": 8320 }, { "epoch": 0.93, - "grad_norm": 23.25, - "learning_rate": 3.1246717146560803e-07, - "loss": 4.5081, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.2533, "step": 8325 }, { "epoch": 0.93, - "grad_norm": 19.125, - "learning_rate": 3.0765896976323927e-07, - "loss": 4.4779, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.3143, "step": 8330 }, { "epoch": 0.93, - "grad_norm": 21.0, - "learning_rate": 3.028874714355512e-07, - "loss": 4.5741, + "grad_norm": 0.298828125, + "learning_rate": 0.001, + "loss": 2.2516, "step": 8335 }, { "epoch": 0.93, - "grad_norm": 22.875, - "learning_rate": 2.981526945514601e-07, - "loss": 4.486, + "grad_norm": 0.30078125, + "learning_rate": 0.001, + "loss": 2.2624, "step": 8340 }, { "epoch": 0.93, - "grad_norm": 19.375, - "learning_rate": 2.9345465704082363e-07, - "loss": 4.5119, + "grad_norm": 0.2890625, + "learning_rate": 0.001, + "loss": 2.1764, "step": 8345 }, { "epoch": 0.93, - "grad_norm": 22.5, - "learning_rate": 2.887933766943729e-07, - "loss": 4.4288, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2136, "step": 8350 }, { "epoch": 0.93, - "grad_norm": 21.125, - "learning_rate": 2.8416887116364255e-07, - "loss": 4.4234, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2297, "step": 8355 }, { "epoch": 0.93, - "grad_norm": 22.875, - "learning_rate": 2.79581157960912e-07, - "loss": 4.549, + "grad_norm": 0.30078125, + "learning_rate": 0.001, + "loss": 2.2867, "step": 8360 }, { "epoch": 0.93, - "grad_norm": 30.0, - "learning_rate": 2.75030254459131e-07, - "loss": 4.3869, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2896, "step": 8365 }, { "epoch": 0.93, - "grad_norm": 18.625, - "learning_rate": 2.7051617789185636e-07, - "loss": 4.4901, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.2175, "step": 8370 }, { "epoch": 0.93, - "grad_norm": 23.0, - "learning_rate": 2.6603894535318754e-07, - "loss": 4.4702, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.19, "step": 8375 }, { "epoch": 0.93, - "grad_norm": 19.375, - "learning_rate": 2.6159857379770336e-07, - "loss": 4.4127, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.134, "step": 8380 }, { "epoch": 0.93, - "grad_norm": 19.5, - "learning_rate": 2.571950800403922e-07, - "loss": 4.5212, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.2358, "step": 8385 }, { "epoch": 0.94, - "grad_norm": 23.125, - "learning_rate": 2.5282848075659504e-07, - "loss": 4.5268, + "grad_norm": 0.287109375, + "learning_rate": 0.001, + "loss": 2.1986, "step": 8390 }, { "epoch": 0.94, - "grad_norm": 22.875, - "learning_rate": 2.4849879248193596e-07, - "loss": 4.5049, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.203, "step": 8395 }, { "epoch": 0.94, - "grad_norm": 22.375, - "learning_rate": 2.4420603161226966e-07, - "loss": 4.4839, + "grad_norm": 0.287109375, + "learning_rate": 0.001, + "loss": 2.2181, "step": 8400 }, { "epoch": 0.94, - "grad_norm": 22.625, - "learning_rate": 2.3995021440360364e-07, - "loss": 4.4563, + "grad_norm": 0.294921875, + "learning_rate": 0.001, + "loss": 2.2655, "step": 8405 }, { "epoch": 0.94, - "grad_norm": 20.375, - "learning_rate": 2.3573135697205207e-07, - "loss": 4.4799, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2862, "step": 8410 }, { "epoch": 0.94, - "grad_norm": 19.75, - "learning_rate": 2.3154947529376525e-07, - "loss": 4.4489, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.3058, "step": 8415 }, { "epoch": 0.94, - "grad_norm": 18.75, - "learning_rate": 2.2740458520487453e-07, - "loss": 4.5091, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.2694, "step": 8420 }, { "epoch": 0.94, - "grad_norm": 20.5, - "learning_rate": 2.2329670240142543e-07, - "loss": 4.3693, + "grad_norm": 0.30078125, + "learning_rate": 0.001, + "loss": 2.2549, "step": 8425 }, { "epoch": 0.94, - "grad_norm": 20.875, - "learning_rate": 2.1922584243932787e-07, - "loss": 4.4582, + "grad_norm": 0.34765625, + "learning_rate": 0.001, + "loss": 2.2976, "step": 8430 }, { "epoch": 0.94, - "grad_norm": 20.875, - "learning_rate": 2.1519202073428945e-07, - "loss": 4.4446, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2406, "step": 8435 }, { "epoch": 0.94, - "grad_norm": 20.0, - "learning_rate": 2.1119525256176e-07, - "loss": 4.4928, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2485, "step": 8440 }, { "epoch": 0.94, - "grad_norm": 24.0, - "learning_rate": 2.072355530568726e-07, - "loss": 4.4579, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.1618, "step": 8445 }, { "epoch": 0.94, - "grad_norm": 22.0, - "learning_rate": 2.0331293721438716e-07, - "loss": 4.4809, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.2759, "step": 8450 }, { "epoch": 0.94, - "grad_norm": 24.5, - "learning_rate": 1.994274198886381e-07, - "loss": 4.4465, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.2618, "step": 8455 }, { "epoch": 0.94, - "grad_norm": 22.75, - "learning_rate": 1.9557901579346672e-07, - "loss": 4.5267, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.203, "step": 8460 }, { "epoch": 0.94, - "grad_norm": 29.25, - "learning_rate": 1.9176773950217553e-07, - "loss": 4.5381, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.1972, "step": 8465 }, { "epoch": 0.94, - "grad_norm": 25.875, - "learning_rate": 1.879936054474707e-07, - "loss": 4.5617, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.1704, "step": 8470 }, { "epoch": 0.94, - "grad_norm": 24.25, - "learning_rate": 1.8425662792140641e-07, - "loss": 4.432, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2045, "step": 8475 }, { "epoch": 0.95, - "grad_norm": 27.0, - "learning_rate": 1.8055682107532945e-07, - "loss": 4.499, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.188, "step": 8480 }, { "epoch": 0.95, - "grad_norm": 20.75, - "learning_rate": 1.7689419891982585e-07, - "loss": 4.4516, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2592, "step": 8485 }, { "epoch": 0.95, - "grad_norm": 25.125, - "learning_rate": 1.7326877532467534e-07, - "loss": 4.5226, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2913, "step": 8490 }, { "epoch": 0.95, - "grad_norm": 22.375, - "learning_rate": 1.6968056401878486e-07, - "loss": 4.4667, + "grad_norm": 0.294921875, + "learning_rate": 0.001, + "loss": 2.2592, "step": 8495 }, { "epoch": 0.95, - "grad_norm": 24.5, - "learning_rate": 1.6612957859014733e-07, - "loss": 4.4464, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.2068, "step": 8500 }, { "epoch": 0.95, - "grad_norm": 19.125, - "learning_rate": 1.626158324857907e-07, - "loss": 4.4693, + "grad_norm": 0.30078125, + "learning_rate": 0.001, + "loss": 2.2432, "step": 8505 }, { "epoch": 0.95, - "grad_norm": 24.625, - "learning_rate": 1.5913933901171687e-07, - "loss": 4.4457, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.1883, "step": 8510 }, { "epoch": 0.95, - "grad_norm": 24.0, - "learning_rate": 1.5570011133286267e-07, - "loss": 4.596, + "grad_norm": 0.291015625, + "learning_rate": 0.001, + "loss": 2.2438, "step": 8515 }, { "epoch": 0.95, - "grad_norm": 20.375, - "learning_rate": 1.5229816247304242e-07, - "loss": 4.4503, + "grad_norm": 0.30078125, + "learning_rate": 0.001, + "loss": 2.1702, "step": 8520 }, { "epoch": 0.95, - "grad_norm": 24.125, - "learning_rate": 1.4893350531490436e-07, - "loss": 4.4432, + "grad_norm": 0.296875, + "learning_rate": 0.001, + "loss": 2.2469, "step": 8525 }, { "epoch": 0.95, - "grad_norm": 22.25, - "learning_rate": 1.4560615259987642e-07, - "loss": 4.4331, + "grad_norm": 0.3203125, + "learning_rate": 0.001, + "loss": 2.1156, "step": 8530 }, { "epoch": 0.95, - "grad_norm": 19.875, - "learning_rate": 1.4231611692812175e-07, - "loss": 4.5166, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.1934, "step": 8535 }, { "epoch": 0.95, - "grad_norm": 21.25, - "learning_rate": 1.3906341075848652e-07, - "loss": 4.4565, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.3777, "step": 8540 }, { "epoch": 0.95, - "grad_norm": 23.125, - "learning_rate": 1.3584804640846326e-07, - "loss": 4.518, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2576, "step": 8545 }, { "epoch": 0.95, - "grad_norm": 24.0, - "learning_rate": 1.3267003605412888e-07, - "loss": 4.4478, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.2462, "step": 8550 }, { "epoch": 0.95, - "grad_norm": 21.875, - "learning_rate": 1.2952939173010993e-07, - "loss": 4.412, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.2936, "step": 8555 }, { "epoch": 0.95, - "grad_norm": 25.875, - "learning_rate": 1.2642612532953512e-07, - "loss": 4.5687, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.3116, "step": 8560 }, { "epoch": 0.95, - "grad_norm": 28.875, - "learning_rate": 1.2336024860398755e-07, - "loss": 4.5121, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.2689, "step": 8565 }, { "epoch": 0.96, - "grad_norm": 20.875, - "learning_rate": 1.2033177316346012e-07, - "loss": 4.449, + "grad_norm": 0.36328125, + "learning_rate": 0.001, + "loss": 2.3067, "step": 8570 }, { "epoch": 0.96, - "grad_norm": 19.375, - "learning_rate": 1.1734071047631468e-07, - "loss": 4.5158, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.3342, "step": 8575 }, { "epoch": 0.96, - "grad_norm": 20.5, - "learning_rate": 1.1438707186923636e-07, - "loss": 4.513, + "grad_norm": 0.30078125, + "learning_rate": 0.001, + "loss": 2.2079, "step": 8580 }, { "epoch": 0.96, - "grad_norm": 23.375, - "learning_rate": 1.1147086852719036e-07, - "loss": 4.565, + "grad_norm": 0.33984375, + "learning_rate": 0.001, + "loss": 2.243, "step": 8585 }, { "epoch": 0.96, - "grad_norm": 22.5, - "learning_rate": 1.08592111493383e-07, - "loss": 4.4658, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.2, "step": 8590 }, { "epoch": 0.96, - "grad_norm": 21.25, - "learning_rate": 1.0575081166921297e-07, - "loss": 4.5096, + "grad_norm": 0.2890625, + "learning_rate": 0.001, + "loss": 2.2437, "step": 8595 }, { "epoch": 0.96, - "grad_norm": 21.125, - "learning_rate": 1.0294697981423907e-07, - "loss": 4.4186, + "grad_norm": 0.298828125, + "learning_rate": 0.001, + "loss": 2.1937, "step": 8600 }, { "epoch": 0.96, - "grad_norm": 25.5, - "learning_rate": 1.0018062654613137e-07, - "loss": 4.4645, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.2336, "step": 8605 }, { "epoch": 0.96, - "grad_norm": 20.375, - "learning_rate": 9.745176234063569e-08, - "loss": 4.4634, + "grad_norm": 0.314453125, + "learning_rate": 0.001, + "loss": 2.2758, "step": 8610 }, { "epoch": 0.96, - "grad_norm": 33.5, - "learning_rate": 9.476039753153366e-08, - "loss": 4.4904, + "grad_norm": 0.345703125, + "learning_rate": 0.001, + "loss": 2.2071, "step": 8615 }, { "epoch": 0.96, - "grad_norm": 19.125, - "learning_rate": 9.210654231060046e-08, - "loss": 4.4314, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.149, "step": 8620 }, { "epoch": 0.96, - "grad_norm": 23.125, - "learning_rate": 8.949020672756936e-08, - "loss": 4.4352, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.257, "step": 8625 }, { "epoch": 0.96, - "grad_norm": 23.125, - "learning_rate": 8.691140069009285e-08, - "loss": 4.4103, + "grad_norm": 0.296875, + "learning_rate": 0.001, + "loss": 2.207, "step": 8630 }, { "epoch": 0.96, - "grad_norm": 22.125, - "learning_rate": 8.437013396370485e-08, - "loss": 4.4056, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.2858, "step": 8635 }, { "epoch": 0.96, - "grad_norm": 22.5, - "learning_rate": 8.186641617178526e-08, - "loss": 4.4983, + "grad_norm": 0.328125, + "learning_rate": 0.001, + "loss": 2.2318, "step": 8640 }, { "epoch": 0.96, - "grad_norm": 25.125, - "learning_rate": 7.940025679551766e-08, - "loss": 4.4142, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.295, "step": 8645 }, { "epoch": 0.96, - "grad_norm": 22.375, - "learning_rate": 7.697166517386168e-08, - "loss": 4.471, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.1429, "step": 8650 }, { "epoch": 0.96, - "grad_norm": 22.0, - "learning_rate": 7.458065050351404e-08, - "loss": 4.421, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.1635, "step": 8655 }, { "epoch": 0.97, - "grad_norm": 24.125, - "learning_rate": 7.222722183887088e-08, - "loss": 4.4875, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.3268, "step": 8660 }, { "epoch": 0.97, - "grad_norm": 20.5, - "learning_rate": 6.99113880919977e-08, - "loss": 4.4697, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2371, "step": 8665 }, { "epoch": 0.97, - "grad_norm": 23.75, - "learning_rate": 6.763315803259285e-08, - "loss": 4.4681, + "grad_norm": 0.298828125, + "learning_rate": 0.001, + "loss": 2.2604, "step": 8670 }, { "epoch": 0.97, - "grad_norm": 21.75, - "learning_rate": 6.539254028795738e-08, - "loss": 4.4828, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.2545, "step": 8675 }, { "epoch": 0.97, - "grad_norm": 20.375, - "learning_rate": 6.318954334295745e-08, - "loss": 4.4194, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.1592, "step": 8680 }, { "epoch": 0.97, - "grad_norm": 19.25, - "learning_rate": 6.102417553999873e-08, - "loss": 4.4764, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.2776, "step": 8685 }, { "epoch": 0.97, - "grad_norm": 20.875, - "learning_rate": 5.889644507898751e-08, - "loss": 4.4739, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.213, "step": 8690 }, { "epoch": 0.97, - "grad_norm": 20.25, - "learning_rate": 5.680636001730633e-08, - "loss": 4.5241, + "grad_norm": 0.287109375, + "learning_rate": 0.001, + "loss": 2.1203, "step": 8695 }, { "epoch": 0.97, - "grad_norm": 24.375, - "learning_rate": 5.475392826977843e-08, - "loss": 4.4824, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.3063, "step": 8700 }, { "epoch": 0.97, - "grad_norm": 25.25, - "learning_rate": 5.2739157608642235e-08, - "loss": 4.4598, + "grad_norm": 0.296875, + "learning_rate": 0.001, + "loss": 2.2479, "step": 8705 }, { "epoch": 0.97, - "grad_norm": 22.5, - "learning_rate": 5.0762055663517994e-08, - "loss": 4.4981, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2708, "step": 8710 }, { "epoch": 0.97, - "grad_norm": 23.375, - "learning_rate": 4.8822629921378985e-08, - "loss": 4.5661, + "grad_norm": 0.326171875, + "learning_rate": 0.001, + "loss": 2.308, "step": 8715 }, { "epoch": 0.97, - "grad_norm": 20.625, - "learning_rate": 4.692088772652703e-08, - "loss": 4.4838, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2732, "step": 8720 }, { "epoch": 0.97, - "grad_norm": 22.5, - "learning_rate": 4.505683628056146e-08, - "loss": 4.5305, + "grad_norm": 0.296875, + "learning_rate": 0.001, + "loss": 2.2556, "step": 8725 }, { "epoch": 0.97, - "grad_norm": 27.875, - "learning_rate": 4.323048264235241e-08, - "loss": 4.4989, + "grad_norm": 0.3359375, + "learning_rate": 0.001, + "loss": 2.2082, "step": 8730 }, { "epoch": 0.97, - "grad_norm": 17.625, - "learning_rate": 4.144183372801536e-08, - "loss": 4.5348, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2652, "step": 8735 }, { "epoch": 0.97, - "grad_norm": 22.125, - "learning_rate": 3.969089631088108e-08, - "loss": 4.5329, + "grad_norm": 0.296875, + "learning_rate": 0.001, + "loss": 2.3049, "step": 8740 }, { "epoch": 0.98, - "grad_norm": 19.0, - "learning_rate": 3.797767702147792e-08, - "loss": 4.5282, + "grad_norm": 0.30078125, + "learning_rate": 0.001, + "loss": 2.2438, "step": 8745 }, { "epoch": 0.98, - "grad_norm": 19.25, - "learning_rate": 3.6302182347496274e-08, - "loss": 4.4855, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.1741, "step": 8750 }, { "epoch": 0.98, - "grad_norm": 23.5, - "learning_rate": 3.46644186337719e-08, - "loss": 4.4571, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2807, "step": 8755 }, { "epoch": 0.98, - "grad_norm": 25.0, - "learning_rate": 3.306439208225931e-08, - "loss": 4.489, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.2431, "step": 8760 }, { "epoch": 0.98, - "grad_norm": 20.875, - "learning_rate": 3.150210875200732e-08, - "loss": 4.5096, + "grad_norm": 0.291015625, + "learning_rate": 0.001, + "loss": 2.2556, "step": 8765 }, { "epoch": 0.98, - "grad_norm": 24.625, - "learning_rate": 2.9977574559139077e-08, - "loss": 4.4331, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.2498, "step": 8770 }, { "epoch": 0.98, - "grad_norm": 22.25, - "learning_rate": 2.8490795276825413e-08, - "loss": 4.5337, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.2788, "step": 8775 }, { "epoch": 0.98, - "grad_norm": 20.75, - "learning_rate": 2.7041776535265963e-08, - "loss": 4.3925, + "grad_norm": 0.29296875, + "learning_rate": 0.001, + "loss": 2.2636, "step": 8780 }, { "epoch": 0.98, - "grad_norm": 22.0, - "learning_rate": 2.5630523821669194e-08, - "loss": 4.4629, + "grad_norm": 0.330078125, + "learning_rate": 0.001, + "loss": 2.2968, "step": 8785 }, { "epoch": 0.98, - "grad_norm": 21.75, - "learning_rate": 2.4257042480224645e-08, - "loss": 4.5107, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2984, "step": 8790 }, { "epoch": 0.98, - "grad_norm": 25.25, - "learning_rate": 2.2921337712092928e-08, - "loss": 4.504, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.2495, "step": 8795 }, { "epoch": 0.98, - "grad_norm": 20.375, - "learning_rate": 2.1623414575377977e-08, - "loss": 4.5135, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.2131, "step": 8800 }, { "epoch": 0.98, - "grad_norm": 28.75, - "learning_rate": 2.0363277985111508e-08, - "loss": 4.4682, + "grad_norm": 0.33203125, + "learning_rate": 0.001, + "loss": 2.176, "step": 8805 }, { "epoch": 0.98, - "grad_norm": 19.0, - "learning_rate": 1.914093271323081e-08, - "loss": 4.4985, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.3043, "step": 8810 }, { "epoch": 0.98, - "grad_norm": 20.0, - "learning_rate": 1.7956383388568753e-08, - "loss": 4.4782, + "grad_norm": 0.30078125, + "learning_rate": 0.001, + "loss": 2.2409, "step": 8815 }, { "epoch": 0.98, - "grad_norm": 18.875, - "learning_rate": 1.6809634496824934e-08, - "loss": 4.4257, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.1861, "step": 8820 }, { "epoch": 0.98, - "grad_norm": 21.625, - "learning_rate": 1.57006903805601e-08, - "loss": 4.4241, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.2472, "step": 8825 }, { "epoch": 0.98, - "grad_norm": 21.125, - "learning_rate": 1.4629555239169535e-08, - "loss": 4.5151, + "grad_norm": 0.306640625, + "learning_rate": 0.001, + "loss": 2.2569, "step": 8830 }, { "epoch": 0.99, - "grad_norm": 20.875, - "learning_rate": 1.3596233128874148e-08, - "loss": 4.4844, + "grad_norm": 0.32421875, + "learning_rate": 0.001, + "loss": 2.2432, "step": 8835 }, { "epoch": 0.99, - "grad_norm": 19.125, - "learning_rate": 1.260072796270384e-08, - "loss": 4.4737, + "grad_norm": 0.298828125, + "learning_rate": 0.001, + "loss": 2.2889, "step": 8840 }, { "epoch": 0.99, - "grad_norm": 22.125, - "learning_rate": 1.1643043510479734e-08, - "loss": 4.4824, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.1974, "step": 8845 }, { "epoch": 0.99, - "grad_norm": 20.0, - "learning_rate": 1.0723183398803071e-08, - "loss": 4.5043, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.3096, "step": 8850 }, { "epoch": 0.99, - "grad_norm": 18.75, - "learning_rate": 9.84115111103967e-09, - "loss": 4.4874, + "grad_norm": 0.30078125, + "learning_rate": 0.001, + "loss": 2.1649, "step": 8855 }, { "epoch": 0.99, - "grad_norm": 22.5, - "learning_rate": 8.996949987306603e-09, - "loss": 4.5241, + "grad_norm": 0.298828125, + "learning_rate": 0.001, + "loss": 2.2163, "step": 8860 }, { "epoch": 0.99, - "grad_norm": 19.375, - "learning_rate": 8.19058322446109e-09, - "loss": 4.4381, + "grad_norm": 0.302734375, + "learning_rate": 0.001, + "loss": 2.2611, "step": 8865 }, { "epoch": 0.99, - "grad_norm": 25.75, - "learning_rate": 7.422053876088298e-09, - "loss": 4.5376, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.245, "step": 8870 }, { "epoch": 0.99, - "grad_norm": 20.125, - "learning_rate": 6.691364852489113e-09, - "loss": 4.5136, + "grad_norm": 0.30078125, + "learning_rate": 0.001, + "loss": 2.1904, "step": 8875 }, { "epoch": 0.99, - "grad_norm": 22.0, - "learning_rate": 5.998518920665719e-09, - "loss": 4.5452, + "grad_norm": 0.310546875, + "learning_rate": 0.001, + "loss": 2.2698, "step": 8880 }, { "epoch": 0.99, - "grad_norm": 18.875, - "learning_rate": 5.343518704318263e-09, - "loss": 4.4536, + "grad_norm": 0.3046875, + "learning_rate": 0.001, + "loss": 2.2447, "step": 8885 }, { "epoch": 0.99, - "grad_norm": 21.5, - "learning_rate": 4.726366683830419e-09, - "loss": 4.4167, + "grad_norm": 0.349609375, + "learning_rate": 0.001, + "loss": 2.2229, "step": 8890 }, { "epoch": 0.99, - "grad_norm": 20.125, - "learning_rate": 4.147065196260513e-09, - "loss": 4.5355, + "grad_norm": 0.294921875, + "learning_rate": 0.001, + "loss": 2.1953, "step": 8895 }, { "epoch": 0.99, - "grad_norm": 26.75, - "learning_rate": 3.6056164353293065e-09, - "loss": 4.5308, + "grad_norm": 0.291015625, + "learning_rate": 0.001, + "loss": 2.2675, "step": 8900 }, { "epoch": 0.99, - "grad_norm": 22.875, - "learning_rate": 3.1020224514222154e-09, - "loss": 4.4939, + "grad_norm": 0.322265625, + "learning_rate": 0.001, + "loss": 2.3104, "step": 8905 }, { "epoch": 0.99, - "grad_norm": 21.75, - "learning_rate": 2.6362851515682185e-09, - "loss": 4.385, + "grad_norm": 0.31640625, + "learning_rate": 0.001, + "loss": 2.3058, "step": 8910 }, { "epoch": 0.99, - "grad_norm": 23.375, - "learning_rate": 2.2084062994409685e-09, - "loss": 4.4598, + "grad_norm": 0.294921875, + "learning_rate": 0.001, + "loss": 2.1981, "step": 8915 }, { "epoch": 0.99, - "grad_norm": 21.875, - "learning_rate": 1.8183875153521268e-09, - "loss": 4.5582, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.1727, "step": 8920 }, { "epoch": 1.0, - "grad_norm": 21.625, - "learning_rate": 1.466230276241376e-09, - "loss": 4.4951, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.1917, "step": 8925 }, { "epoch": 1.0, - "grad_norm": 26.625, - "learning_rate": 1.151935915673086e-09, - "loss": 4.3931, + "grad_norm": 0.291015625, + "learning_rate": 0.001, + "loss": 2.1705, "step": 8930 }, { "epoch": 1.0, - "grad_norm": 22.0, - "learning_rate": 8.755056238296533e-10, - "loss": 4.5343, + "grad_norm": 0.3125, + "learning_rate": 0.001, + "loss": 2.2193, "step": 8935 }, { "epoch": 1.0, - "grad_norm": 23.25, - "learning_rate": 6.369404475092822e-10, - "loss": 4.4856, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.2683, "step": 8940 }, { "epoch": 1.0, - "grad_norm": 27.0, - "learning_rate": 4.3624129012154183e-10, - "loss": 4.4404, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.1673, "step": 8945 }, { "epoch": 1.0, - "grad_norm": 22.25, - "learning_rate": 2.734089116829264e-10, - "loss": 4.3703, + "grad_norm": 0.2890625, + "learning_rate": 0.001, + "loss": 2.2151, "step": 8950 }, { "epoch": 1.0, - "grad_norm": 26.75, - "learning_rate": 1.48443928812414e-10, - "loss": 4.4418, + "grad_norm": 0.341796875, + "learning_rate": 0.001, + "loss": 2.2106, "step": 8955 }, { "epoch": 1.0, - "grad_norm": 18.875, - "learning_rate": 6.134681473479732e-11, - "loss": 4.5045, + "grad_norm": 0.318359375, + "learning_rate": 0.001, + "loss": 2.2439, "step": 8960 }, { "epoch": 1.0, - "grad_norm": 22.125, - "learning_rate": 1.211789927291207e-11, - "loss": 4.5336, + "grad_norm": 0.30859375, + "learning_rate": 0.001, + "loss": 2.2279, "step": 8965 }, { "epoch": 1.0, - "eval_loss": 4.488282680511475, - "eval_runtime": 1698.1153, - "eval_samples_per_second": 9.353, - "eval_steps_per_second": 1.17, + "eval_loss": 2.2348995208740234, + "eval_runtime": 2639.8137, + "eval_samples_per_second": 6.016, + "eval_steps_per_second": 0.752, "step": 8969 }, { "epoch": 1.0, "step": 8969, - "total_flos": 1.8254804498880922e+18, - "train_loss": 4.746947802406368, - "train_runtime": 46954.2707, - "train_samples_per_second": 3.056, - "train_steps_per_second": 0.191 + "total_flos": 1.50079627163861e+17, + "train_loss": 2.666482654486044, + "train_runtime": 30048.5081, + "train_samples_per_second": 4.775, + "train_steps_per_second": 0.298 } ], "logging_steps": 5, "max_steps": 8969, "num_input_tokens_seen": 0, "num_train_epochs": 1, - "save_steps": 100, - "total_flos": 1.8254804498880922e+18, + "save_steps": 1000, + "total_flos": 1.50079627163861e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null