{ "best_metric": 0.6414651274681091, "best_model_checkpoint": "output/checkpoint-170000", "epoch": 15.0, "global_step": 171660, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "learning_rate": 4.98e-06, "loss": 2.0401, "step": 500 }, { "epoch": 0.09, "learning_rate": 9.980000000000001e-06, "loss": 1.397, "step": 1000 }, { "epoch": 0.13, "learning_rate": 1.4979999999999999e-05, "loss": 1.302, "step": 1500 }, { "epoch": 0.17, "learning_rate": 1.9980000000000002e-05, "loss": 1.242, "step": 2000 }, { "epoch": 0.22, "learning_rate": 2.4970000000000003e-05, "loss": 1.1985, "step": 2500 }, { "epoch": 0.22, "eval_loss": 1.0940483808517456, "eval_runtime": 3092.4183, "eval_samples_per_second": 24.931, "eval_steps_per_second": 1.558, "step": 2500 }, { "epoch": 0.26, "learning_rate": 2.9970000000000003e-05, "loss": 1.1712, "step": 3000 }, { "epoch": 0.31, "learning_rate": 3.4970000000000006e-05, "loss": 1.1419, "step": 3500 }, { "epoch": 0.35, "learning_rate": 3.9970000000000005e-05, "loss": 1.1259, "step": 4000 }, { "epoch": 0.39, "learning_rate": 4.497e-05, "loss": 1.1088, "step": 4500 }, { "epoch": 0.44, "learning_rate": 4.997e-05, "loss": 1.0937, "step": 5000 }, { "epoch": 0.44, "eval_loss": 1.0032908916473389, "eval_runtime": 3130.3678, "eval_samples_per_second": 24.628, "eval_steps_per_second": 1.539, "step": 5000 }, { "epoch": 0.48, "learning_rate": 5.497e-05, "loss": 1.0865, "step": 5500 }, { "epoch": 0.52, "learning_rate": 5.9970000000000004e-05, "loss": 1.0763, "step": 6000 }, { "epoch": 0.57, "learning_rate": 6.497000000000001e-05, "loss": 1.0666, "step": 6500 }, { "epoch": 0.61, "learning_rate": 6.997e-05, "loss": 1.0704, "step": 7000 }, { "epoch": 0.66, "learning_rate": 7.497000000000001e-05, "loss": 1.0675, "step": 7500 }, { "epoch": 0.66, "eval_loss": 0.9753177165985107, "eval_runtime": 3110.5948, "eval_samples_per_second": 24.785, "eval_steps_per_second": 1.549, "step": 7500 }, { "epoch": 0.7, "learning_rate": 7.997e-05, "loss": 1.0647, "step": 8000 }, { "epoch": 0.74, "learning_rate": 8.495e-05, "loss": 1.0493, "step": 8500 }, { "epoch": 0.79, "learning_rate": 8.995e-05, "loss": 1.0583, "step": 9000 }, { "epoch": 0.83, "learning_rate": 9.495e-05, "loss": 1.0582, "step": 9500 }, { "epoch": 0.87, "learning_rate": 9.995e-05, "loss": 1.0565, "step": 10000 }, { "epoch": 0.87, "eval_loss": 0.9801222085952759, "eval_runtime": 3136.3647, "eval_samples_per_second": 24.581, "eval_steps_per_second": 1.536, "step": 10000 }, { "epoch": 0.92, "learning_rate": 9.895171537484117e-05, "loss": 1.0507, "step": 10500 }, { "epoch": 0.96, "learning_rate": 9.789284201609487e-05, "loss": 1.0443, "step": 11000 }, { "epoch": 1.0, "learning_rate": 9.683396865734859e-05, "loss": 1.0318, "step": 11500 }, { "epoch": 1.05, "learning_rate": 9.577721304531978e-05, "loss": 1.0298, "step": 12000 }, { "epoch": 1.09, "learning_rate": 9.471833968657349e-05, "loss": 1.0244, "step": 12500 }, { "epoch": 1.09, "eval_loss": 0.9525517225265503, "eval_runtime": 3137.7232, "eval_samples_per_second": 24.571, "eval_steps_per_second": 1.536, "step": 12500 }, { "epoch": 1.14, "learning_rate": 9.365946632782719e-05, "loss": 1.024, "step": 13000 }, { "epoch": 1.18, "learning_rate": 9.26005929690809e-05, "loss": 1.0054, "step": 13500 }, { "epoch": 1.22, "learning_rate": 9.154383735705209e-05, "loss": 1.0105, "step": 14000 }, { "epoch": 1.27, "learning_rate": 9.04849639983058e-05, "loss": 1.0035, "step": 14500 }, { "epoch": 1.31, "learning_rate": 8.942609063955952e-05, "loss": 0.9943, "step": 15000 }, { "epoch": 1.31, "eval_loss": 0.9298429489135742, "eval_runtime": 3051.8254, "eval_samples_per_second": 25.262, "eval_steps_per_second": 1.579, "step": 15000 }, { "epoch": 1.35, "learning_rate": 8.836721728081322e-05, "loss": 0.9959, "step": 15500 }, { "epoch": 1.4, "learning_rate": 8.731046166878442e-05, "loss": 0.9874, "step": 16000 }, { "epoch": 1.44, "learning_rate": 8.625158831003812e-05, "loss": 0.9834, "step": 16500 }, { "epoch": 1.49, "learning_rate": 8.519483269800933e-05, "loss": 0.9764, "step": 17000 }, { "epoch": 1.53, "learning_rate": 8.413595933926302e-05, "loss": 0.9799, "step": 17500 }, { "epoch": 1.53, "eval_loss": 0.9034547805786133, "eval_runtime": 3114.4981, "eval_samples_per_second": 24.754, "eval_steps_per_second": 1.547, "step": 17500 }, { "epoch": 1.57, "learning_rate": 8.307708598051674e-05, "loss": 0.9642, "step": 18000 }, { "epoch": 1.62, "learning_rate": 8.201821262177044e-05, "loss": 0.9671, "step": 18500 }, { "epoch": 1.66, "learning_rate": 8.095933926302415e-05, "loss": 0.9637, "step": 19000 }, { "epoch": 1.7, "learning_rate": 7.990046590427785e-05, "loss": 0.9545, "step": 19500 }, { "epoch": 1.75, "learning_rate": 7.884159254553156e-05, "loss": 0.95, "step": 20000 }, { "epoch": 1.75, "eval_loss": 0.8834716081619263, "eval_runtime": 3125.2028, "eval_samples_per_second": 24.669, "eval_steps_per_second": 1.542, "step": 20000 }, { "epoch": 1.79, "learning_rate": 7.778483693350275e-05, "loss": 0.9556, "step": 20500 }, { "epoch": 1.84, "learning_rate": 7.672596357475647e-05, "loss": 0.9519, "step": 21000 }, { "epoch": 1.88, "learning_rate": 7.566709021601018e-05, "loss": 0.9448, "step": 21500 }, { "epoch": 1.92, "learning_rate": 7.460821685726388e-05, "loss": 0.9382, "step": 22000 }, { "epoch": 1.97, "learning_rate": 7.354934349851758e-05, "loss": 0.933, "step": 22500 }, { "epoch": 1.97, "eval_loss": 0.8636178970336914, "eval_runtime": 3087.2735, "eval_samples_per_second": 24.972, "eval_steps_per_second": 1.561, "step": 22500 }, { "epoch": 2.01, "learning_rate": 7.249258788648878e-05, "loss": 0.93, "step": 23000 }, { "epoch": 2.05, "learning_rate": 7.14337145277425e-05, "loss": 0.9193, "step": 23500 }, { "epoch": 2.1, "learning_rate": 7.037484116899618e-05, "loss": 0.9141, "step": 24000 }, { "epoch": 2.14, "learning_rate": 6.931596781024989e-05, "loss": 0.9092, "step": 24500 }, { "epoch": 2.18, "learning_rate": 6.82570944515036e-05, "loss": 0.9079, "step": 25000 }, { "epoch": 2.18, "eval_loss": 0.8507079482078552, "eval_runtime": 3106.3211, "eval_samples_per_second": 24.819, "eval_steps_per_second": 1.551, "step": 25000 }, { "epoch": 2.23, "learning_rate": 6.72003388394748e-05, "loss": 0.9041, "step": 25500 }, { "epoch": 2.27, "learning_rate": 6.614146548072851e-05, "loss": 0.9004, "step": 26000 }, { "epoch": 2.32, "learning_rate": 6.508259212198221e-05, "loss": 0.8995, "step": 26500 }, { "epoch": 2.36, "learning_rate": 6.402371876323592e-05, "loss": 0.8922, "step": 27000 }, { "epoch": 2.4, "learning_rate": 6.296484540448962e-05, "loss": 0.8938, "step": 27500 }, { "epoch": 2.4, "eval_loss": 0.8396568894386292, "eval_runtime": 3033.9082, "eval_samples_per_second": 25.411, "eval_steps_per_second": 1.588, "step": 27500 }, { "epoch": 2.45, "learning_rate": 6.190597204574333e-05, "loss": 0.8897, "step": 28000 }, { "epoch": 2.49, "learning_rate": 6.084709868699704e-05, "loss": 0.8855, "step": 28500 }, { "epoch": 2.53, "learning_rate": 5.979034307496824e-05, "loss": 0.8795, "step": 29000 }, { "epoch": 2.58, "learning_rate": 5.873146971622194e-05, "loss": 0.8841, "step": 29500 }, { "epoch": 2.62, "learning_rate": 5.767259635747565e-05, "loss": 0.8781, "step": 30000 }, { "epoch": 2.62, "eval_loss": 0.8194745182991028, "eval_runtime": 3075.2096, "eval_samples_per_second": 25.07, "eval_steps_per_second": 1.567, "step": 30000 }, { "epoch": 2.67, "learning_rate": 5.6613722998729355e-05, "loss": 0.8776, "step": 30500 }, { "epoch": 2.71, "learning_rate": 5.555484963998306e-05, "loss": 0.8777, "step": 31000 }, { "epoch": 2.75, "learning_rate": 5.4495976281236774e-05, "loss": 0.865, "step": 31500 }, { "epoch": 2.8, "learning_rate": 5.3437102922490467e-05, "loss": 0.8752, "step": 32000 }, { "epoch": 2.84, "learning_rate": 5.237822956374417e-05, "loss": 0.8647, "step": 32500 }, { "epoch": 2.84, "eval_loss": 0.8087666630744934, "eval_runtime": 3169.435, "eval_samples_per_second": 24.325, "eval_steps_per_second": 1.52, "step": 32500 }, { "epoch": 2.88, "learning_rate": 5.1321473951715384e-05, "loss": 0.8599, "step": 33000 }, { "epoch": 2.93, "learning_rate": 5.0262600592969076e-05, "loss": 0.8568, "step": 33500 }, { "epoch": 2.97, "learning_rate": 4.920372723422279e-05, "loss": 0.8528, "step": 34000 }, { "epoch": 3.01, "learning_rate": 4.8144853875476495e-05, "loss": 0.8463, "step": 34500 }, { "epoch": 3.06, "learning_rate": 4.70859805167302e-05, "loss": 0.8422, "step": 35000 }, { "epoch": 3.06, "eval_loss": 0.7954000234603882, "eval_runtime": 3127.0452, "eval_samples_per_second": 24.655, "eval_steps_per_second": 1.541, "step": 35000 }, { "epoch": 3.1, "learning_rate": 4.602710715798391e-05, "loss": 0.8373, "step": 35500 }, { "epoch": 3.15, "learning_rate": 4.4968233799237613e-05, "loss": 0.8268, "step": 36000 }, { "epoch": 3.19, "learning_rate": 4.390936044049132e-05, "loss": 0.8426, "step": 36500 }, { "epoch": 3.23, "learning_rate": 4.2850487081745026e-05, "loss": 0.837, "step": 37000 }, { "epoch": 3.28, "learning_rate": 4.179373146971622e-05, "loss": 0.831, "step": 37500 }, { "epoch": 3.28, "eval_loss": 0.7871229648590088, "eval_runtime": 3095.9065, "eval_samples_per_second": 24.903, "eval_steps_per_second": 1.557, "step": 37500 }, { "epoch": 3.32, "learning_rate": 4.073485811096993e-05, "loss": 0.8343, "step": 38000 }, { "epoch": 3.36, "learning_rate": 3.9675984752223635e-05, "loss": 0.825, "step": 38500 }, { "epoch": 3.41, "learning_rate": 3.861711139347734e-05, "loss": 0.8195, "step": 39000 }, { "epoch": 3.45, "learning_rate": 3.755823803473105e-05, "loss": 0.8219, "step": 39500 }, { "epoch": 3.5, "learning_rate": 3.650148242270225e-05, "loss": 0.8173, "step": 40000 }, { "epoch": 3.5, "eval_loss": 0.7720773220062256, "eval_runtime": 3173.0037, "eval_samples_per_second": 24.297, "eval_steps_per_second": 1.519, "step": 40000 }, { "epoch": 3.54, "learning_rate": 3.544260906395595e-05, "loss": 0.8148, "step": 40500 }, { "epoch": 3.58, "learning_rate": 3.4385853451927155e-05, "loss": 0.8102, "step": 41000 }, { "epoch": 3.63, "learning_rate": 3.3329097839898346e-05, "loss": 0.8103, "step": 41500 }, { "epoch": 3.67, "learning_rate": 3.227022448115206e-05, "loss": 0.8116, "step": 42000 }, { "epoch": 3.71, "learning_rate": 3.121346886912325e-05, "loss": 0.8072, "step": 42500 }, { "epoch": 3.71, "eval_loss": 0.7610893845558167, "eval_runtime": 3130.5399, "eval_samples_per_second": 24.627, "eval_steps_per_second": 1.539, "step": 42500 }, { "epoch": 3.76, "learning_rate": 3.015459551037696e-05, "loss": 0.802, "step": 43000 }, { "epoch": 3.8, "learning_rate": 2.9095722151630668e-05, "loss": 0.7994, "step": 43500 }, { "epoch": 3.84, "learning_rate": 2.8036848792884374e-05, "loss": 0.8029, "step": 44000 }, { "epoch": 3.89, "learning_rate": 2.697797543413808e-05, "loss": 0.7975, "step": 44500 }, { "epoch": 3.93, "learning_rate": 2.5919102075391783e-05, "loss": 0.8011, "step": 45000 }, { "epoch": 3.93, "eval_loss": 0.7531821131706238, "eval_runtime": 3062.0684, "eval_samples_per_second": 25.178, "eval_steps_per_second": 1.574, "step": 45000 }, { "epoch": 3.98, "learning_rate": 2.486022871664549e-05, "loss": 0.7935, "step": 45500 }, { "epoch": 4.02, "learning_rate": 2.3801355357899195e-05, "loss": 0.793, "step": 46000 }, { "epoch": 4.06, "learning_rate": 2.2742481999152905e-05, "loss": 0.788, "step": 46500 }, { "epoch": 4.11, "learning_rate": 2.1683608640406607e-05, "loss": 0.7853, "step": 47000 }, { "epoch": 4.15, "learning_rate": 2.0624735281660313e-05, "loss": 0.7828, "step": 47500 }, { "epoch": 4.15, "eval_loss": 0.7431035041809082, "eval_runtime": 3053.0232, "eval_samples_per_second": 25.252, "eval_steps_per_second": 1.578, "step": 47500 }, { "epoch": 4.19, "learning_rate": 1.956586192291402e-05, "loss": 0.7715, "step": 48000 }, { "epoch": 4.24, "learning_rate": 1.8506988564167726e-05, "loss": 0.7752, "step": 48500 }, { "epoch": 4.28, "learning_rate": 1.7450232952138926e-05, "loss": 0.7714, "step": 49000 }, { "epoch": 4.33, "learning_rate": 1.639135959339263e-05, "loss": 0.7736, "step": 49500 }, { "epoch": 4.37, "learning_rate": 1.533248623464634e-05, "loss": 0.7691, "step": 50000 }, { "epoch": 4.37, "eval_loss": 0.7367000579833984, "eval_runtime": 3035.0963, "eval_samples_per_second": 25.402, "eval_steps_per_second": 1.588, "step": 50000 }, { "epoch": 4.41, "learning_rate": 1.4273612875900045e-05, "loss": 0.7685, "step": 50500 }, { "epoch": 4.46, "learning_rate": 1.3214739517153749e-05, "loss": 0.7639, "step": 51000 }, { "epoch": 4.5, "learning_rate": 1.2157983905124948e-05, "loss": 0.7654, "step": 51500 }, { "epoch": 4.54, "learning_rate": 1.1099110546378654e-05, "loss": 0.7677, "step": 52000 }, { "epoch": 4.59, "learning_rate": 1.0040237187632359e-05, "loss": 0.7659, "step": 52500 }, { "epoch": 4.59, "eval_loss": 0.7291901111602783, "eval_runtime": 3110.3355, "eval_samples_per_second": 24.787, "eval_steps_per_second": 1.549, "step": 52500 }, { "epoch": 4.63, "learning_rate": 8.981363828886067e-06, "loss": 0.763, "step": 53000 }, { "epoch": 4.67, "learning_rate": 7.922490470139773e-06, "loss": 0.759, "step": 53500 }, { "epoch": 4.72, "learning_rate": 6.863617111393478e-06, "loss": 0.7559, "step": 54000 }, { "epoch": 4.76, "learning_rate": 5.804743752647183e-06, "loss": 0.7547, "step": 54500 }, { "epoch": 4.81, "learning_rate": 4.74587039390089e-06, "loss": 0.7606, "step": 55000 }, { "epoch": 4.81, "eval_loss": 0.7244983911514282, "eval_runtime": 3076.3307, "eval_samples_per_second": 25.061, "eval_steps_per_second": 1.566, "step": 55000 }, { "epoch": 4.85, "learning_rate": 5.6455381080045965e-05, "loss": 0.7842, "step": 55500 }, { "epoch": 4.89, "learning_rate": 5.597663730371505e-05, "loss": 0.7983, "step": 56000 }, { "epoch": 4.94, "learning_rate": 5.549789352738415e-05, "loss": 0.8032, "step": 56500 }, { "epoch": 4.98, "learning_rate": 5.5019149751053236e-05, "loss": 0.8037, "step": 57000 }, { "epoch": 5.02, "learning_rate": 5.454040597472233e-05, "loss": 0.8082, "step": 57500 }, { "epoch": 5.02, "eval_loss": 0.7695716023445129, "eval_runtime": 3095.7063, "eval_samples_per_second": 24.904, "eval_steps_per_second": 1.557, "step": 57500 }, { "epoch": 5.07, "learning_rate": 5.406166219839143e-05, "loss": 0.8102, "step": 58000 }, { "epoch": 5.11, "learning_rate": 5.358291842206051e-05, "loss": 0.8067, "step": 58500 }, { "epoch": 5.16, "learning_rate": 5.310417464572961e-05, "loss": 0.8103, "step": 59000 }, { "epoch": 5.2, "learning_rate": 5.2627345844504025e-05, "loss": 0.8088, "step": 59500 }, { "epoch": 5.24, "learning_rate": 5.214955955572578e-05, "loss": 0.8114, "step": 60000 }, { "epoch": 5.24, "eval_loss": 0.7694710493087769, "eval_runtime": 3084.9083, "eval_samples_per_second": 24.991, "eval_steps_per_second": 1.562, "step": 60000 }, { "epoch": 5.29, "learning_rate": 5.167081577939487e-05, "loss": 0.8073, "step": 60500 }, { "epoch": 5.33, "learning_rate": 5.119207200306396e-05, "loss": 0.8019, "step": 61000 }, { "epoch": 5.37, "learning_rate": 5.071332822673306e-05, "loss": 0.7971, "step": 61500 }, { "epoch": 5.42, "learning_rate": 5.0234584450402144e-05, "loss": 0.8024, "step": 62000 }, { "epoch": 5.46, "learning_rate": 4.97567981616239e-05, "loss": 0.8022, "step": 62500 }, { "epoch": 5.46, "eval_loss": 0.7612630128860474, "eval_runtime": 3135.2609, "eval_samples_per_second": 24.59, "eval_steps_per_second": 1.537, "step": 62500 }, { "epoch": 5.51, "learning_rate": 4.927805438529299e-05, "loss": 0.8071, "step": 63000 }, { "epoch": 5.55, "learning_rate": 4.8799310608962085e-05, "loss": 0.793, "step": 63500 }, { "epoch": 5.59, "learning_rate": 4.832152432018384e-05, "loss": 0.798, "step": 64000 }, { "epoch": 5.64, "learning_rate": 4.7842780543852934e-05, "loss": 0.7976, "step": 64500 }, { "epoch": 5.68, "learning_rate": 4.7364036767522026e-05, "loss": 0.7986, "step": 65000 }, { "epoch": 5.68, "eval_loss": 0.7558260560035706, "eval_runtime": 3154.5823, "eval_samples_per_second": 24.439, "eval_steps_per_second": 1.528, "step": 65000 }, { "epoch": 5.72, "learning_rate": 4.688529299119111e-05, "loss": 0.7918, "step": 65500 }, { "epoch": 5.77, "learning_rate": 4.6406549214860205e-05, "loss": 0.7941, "step": 66000 }, { "epoch": 5.81, "learning_rate": 4.5927805438529304e-05, "loss": 0.7976, "step": 66500 }, { "epoch": 5.85, "learning_rate": 4.5449061662198396e-05, "loss": 0.7953, "step": 67000 }, { "epoch": 5.9, "learning_rate": 4.497031788586749e-05, "loss": 0.8018, "step": 67500 }, { "epoch": 5.9, "eval_loss": 0.7478091716766357, "eval_runtime": 3109.9687, "eval_samples_per_second": 24.79, "eval_steps_per_second": 1.55, "step": 67500 }, { "epoch": 5.94, "learning_rate": 4.4491574109536574e-05, "loss": 0.7939, "step": 68000 }, { "epoch": 5.99, "learning_rate": 4.401283033320567e-05, "loss": 0.7844, "step": 68500 }, { "epoch": 6.03, "learning_rate": 4.3534086556874766e-05, "loss": 0.7853, "step": 69000 }, { "epoch": 6.07, "learning_rate": 4.305534278054386e-05, "loss": 0.7776, "step": 69500 }, { "epoch": 6.12, "learning_rate": 4.2576599004212944e-05, "loss": 0.782, "step": 70000 }, { "epoch": 6.12, "eval_loss": 0.7434529662132263, "eval_runtime": 3111.4708, "eval_samples_per_second": 24.778, "eval_steps_per_second": 1.549, "step": 70000 }, { "epoch": 6.16, "learning_rate": 4.209785522788204e-05, "loss": 0.7812, "step": 70500 }, { "epoch": 6.2, "learning_rate": 4.161911145155113e-05, "loss": 0.7816, "step": 71000 }, { "epoch": 6.25, "learning_rate": 4.1141325162772885e-05, "loss": 0.7745, "step": 71500 }, { "epoch": 6.29, "learning_rate": 4.066258138644198e-05, "loss": 0.7776, "step": 72000 }, { "epoch": 6.34, "learning_rate": 4.018383761011107e-05, "loss": 0.7743, "step": 72500 }, { "epoch": 6.34, "eval_loss": 0.7367435097694397, "eval_runtime": 3109.6155, "eval_samples_per_second": 24.793, "eval_steps_per_second": 1.55, "step": 72500 }, { "epoch": 6.38, "learning_rate": 3.970509383378016e-05, "loss": 0.7762, "step": 73000 }, { "epoch": 6.42, "learning_rate": 3.9226350057449255e-05, "loss": 0.7728, "step": 73500 }, { "epoch": 6.47, "learning_rate": 3.874760628111835e-05, "loss": 0.7743, "step": 74000 }, { "epoch": 6.51, "learning_rate": 3.826886250478744e-05, "loss": 0.7715, "step": 74500 }, { "epoch": 6.55, "learning_rate": 3.779011872845653e-05, "loss": 0.774, "step": 75000 }, { "epoch": 6.55, "eval_loss": 0.7312998175621033, "eval_runtime": 3143.7734, "eval_samples_per_second": 24.523, "eval_steps_per_second": 1.533, "step": 75000 }, { "epoch": 6.6, "learning_rate": 3.7311374952125625e-05, "loss": 0.7656, "step": 75500 }, { "epoch": 6.64, "learning_rate": 3.683263117579472e-05, "loss": 0.7653, "step": 76000 }, { "epoch": 6.68, "learning_rate": 3.635388739946381e-05, "loss": 0.7619, "step": 76500 }, { "epoch": 6.73, "learning_rate": 3.587610111068556e-05, "loss": 0.765, "step": 77000 }, { "epoch": 6.77, "learning_rate": 3.539735733435465e-05, "loss": 0.7692, "step": 77500 }, { "epoch": 6.77, "eval_loss": 0.7270153164863586, "eval_runtime": 3083.4104, "eval_samples_per_second": 25.003, "eval_steps_per_second": 1.563, "step": 77500 }, { "epoch": 6.82, "learning_rate": 3.491957104557641e-05, "loss": 0.7676, "step": 78000 }, { "epoch": 6.86, "learning_rate": 3.44408272692455e-05, "loss": 0.7617, "step": 78500 }, { "epoch": 6.9, "learning_rate": 3.396208349291459e-05, "loss": 0.765, "step": 79000 }, { "epoch": 6.95, "learning_rate": 3.3483339716583685e-05, "loss": 0.7609, "step": 79500 }, { "epoch": 6.99, "learning_rate": 3.300459594025278e-05, "loss": 0.7604, "step": 80000 }, { "epoch": 6.99, "eval_loss": 0.7200314998626709, "eval_runtime": 3077.861, "eval_samples_per_second": 25.049, "eval_steps_per_second": 1.566, "step": 80000 }, { "epoch": 7.03, "learning_rate": 3.252585216392187e-05, "loss": 0.7513, "step": 80500 }, { "epoch": 7.08, "learning_rate": 3.204710838759096e-05, "loss": 0.7549, "step": 81000 }, { "epoch": 7.12, "learning_rate": 3.156932209881271e-05, "loss": 0.7485, "step": 81500 }, { "epoch": 7.17, "learning_rate": 3.109057832248181e-05, "loss": 0.7486, "step": 82000 }, { "epoch": 7.21, "learning_rate": 3.0611834546150903e-05, "loss": 0.7468, "step": 82500 }, { "epoch": 7.21, "eval_loss": 0.7163689136505127, "eval_runtime": 3165.721, "eval_samples_per_second": 24.353, "eval_steps_per_second": 1.522, "step": 82500 }, { "epoch": 7.25, "learning_rate": 3.0133090769819993e-05, "loss": 0.7471, "step": 83000 }, { "epoch": 7.3, "learning_rate": 2.9654346993489085e-05, "loss": 0.7473, "step": 83500 }, { "epoch": 7.34, "learning_rate": 2.9175603217158177e-05, "loss": 0.747, "step": 84000 }, { "epoch": 7.38, "learning_rate": 2.8696859440827273e-05, "loss": 0.743, "step": 84500 }, { "epoch": 7.43, "learning_rate": 2.8219073152049026e-05, "loss": 0.7486, "step": 85000 }, { "epoch": 7.43, "eval_loss": 0.7117038369178772, "eval_runtime": 3102.6798, "eval_samples_per_second": 24.848, "eval_steps_per_second": 1.553, "step": 85000 }, { "epoch": 7.47, "learning_rate": 2.7740329375718115e-05, "loss": 0.7411, "step": 85500 }, { "epoch": 7.51, "learning_rate": 2.7261585599387208e-05, "loss": 0.7418, "step": 86000 }, { "epoch": 7.56, "learning_rate": 2.6782841823056303e-05, "loss": 0.7443, "step": 86500 }, { "epoch": 7.6, "learning_rate": 2.6305055534278056e-05, "loss": 0.7403, "step": 87000 }, { "epoch": 7.65, "learning_rate": 2.582631175794715e-05, "loss": 0.7399, "step": 87500 }, { "epoch": 7.65, "eval_loss": 0.7042549252510071, "eval_runtime": 3110.6014, "eval_samples_per_second": 24.785, "eval_steps_per_second": 1.549, "step": 87500 }, { "epoch": 7.69, "learning_rate": 2.5347567981616238e-05, "loss": 0.7393, "step": 88000 }, { "epoch": 7.73, "learning_rate": 2.4868824205285333e-05, "loss": 0.7404, "step": 88500 }, { "epoch": 7.78, "learning_rate": 2.4390080428954426e-05, "loss": 0.7356, "step": 89000 }, { "epoch": 7.82, "learning_rate": 2.391133665262352e-05, "loss": 0.7395, "step": 89500 }, { "epoch": 7.86, "learning_rate": 2.3432592876292607e-05, "loss": 0.7306, "step": 90000 }, { "epoch": 7.86, "eval_loss": 0.6955912709236145, "eval_runtime": 3139.2914, "eval_samples_per_second": 24.558, "eval_steps_per_second": 1.535, "step": 90000 }, { "epoch": 7.91, "learning_rate": 2.2953849099961703e-05, "loss": 0.735, "step": 90500 }, { "epoch": 7.95, "learning_rate": 2.2475105323630792e-05, "loss": 0.7281, "step": 91000 }, { "epoch": 8.0, "learning_rate": 2.1996361547299888e-05, "loss": 0.732, "step": 91500 }, { "epoch": 8.04, "learning_rate": 2.1517617770968977e-05, "loss": 0.7219, "step": 92000 }, { "epoch": 8.08, "learning_rate": 2.1039831482190733e-05, "loss": 0.7243, "step": 92500 }, { "epoch": 8.08, "eval_loss": 0.695923924446106, "eval_runtime": 3092.5373, "eval_samples_per_second": 24.93, "eval_steps_per_second": 1.558, "step": 92500 }, { "epoch": 8.13, "learning_rate": 2.0561087705859826e-05, "loss": 0.7243, "step": 93000 }, { "epoch": 8.17, "learning_rate": 2.0082343929528918e-05, "loss": 0.7258, "step": 93500 }, { "epoch": 8.21, "learning_rate": 1.960360015319801e-05, "loss": 0.7211, "step": 94000 }, { "epoch": 8.26, "learning_rate": 1.91248563768671e-05, "loss": 0.7214, "step": 94500 }, { "epoch": 8.3, "learning_rate": 1.8646112600536196e-05, "loss": 0.7132, "step": 95000 }, { "epoch": 8.3, "eval_loss": 0.6916212439537048, "eval_runtime": 3163.2312, "eval_samples_per_second": 24.373, "eval_steps_per_second": 1.523, "step": 95000 }, { "epoch": 8.34, "learning_rate": 1.8167368824205285e-05, "loss": 0.7188, "step": 95500 }, { "epoch": 8.39, "learning_rate": 1.768862504787438e-05, "loss": 0.717, "step": 96000 }, { "epoch": 8.43, "learning_rate": 1.7210838759096133e-05, "loss": 0.7189, "step": 96500 }, { "epoch": 8.48, "learning_rate": 1.6732094982765226e-05, "loss": 0.7206, "step": 97000 }, { "epoch": 8.52, "learning_rate": 1.625430869398698e-05, "loss": 0.71, "step": 97500 }, { "epoch": 8.52, "eval_loss": 0.6853311061859131, "eval_runtime": 3146.2403, "eval_samples_per_second": 24.504, "eval_steps_per_second": 1.532, "step": 97500 }, { "epoch": 8.56, "learning_rate": 1.577556491765607e-05, "loss": 0.7148, "step": 98000 }, { "epoch": 8.61, "learning_rate": 1.5296821141325163e-05, "loss": 0.7169, "step": 98500 }, { "epoch": 8.65, "learning_rate": 1.4818077364994256e-05, "loss": 0.7155, "step": 99000 }, { "epoch": 8.69, "learning_rate": 1.4339333588663348e-05, "loss": 0.713, "step": 99500 }, { "epoch": 8.74, "learning_rate": 1.3860589812332439e-05, "loss": 0.7128, "step": 100000 }, { "epoch": 8.74, "eval_loss": 0.6855071783065796, "eval_runtime": 3110.846, "eval_samples_per_second": 24.783, "eval_steps_per_second": 1.549, "step": 100000 }, { "epoch": 8.78, "learning_rate": 1.3381846036001533e-05, "loss": 0.7107, "step": 100500 }, { "epoch": 8.83, "learning_rate": 1.2903102259670624e-05, "loss": 0.7142, "step": 101000 }, { "epoch": 8.87, "learning_rate": 1.2424358483339716e-05, "loss": 0.7035, "step": 101500 }, { "epoch": 8.91, "learning_rate": 1.1945614707008809e-05, "loss": 0.7061, "step": 102000 }, { "epoch": 8.96, "learning_rate": 1.1467828418230563e-05, "loss": 0.7088, "step": 102500 }, { "epoch": 8.96, "eval_loss": 0.680884063243866, "eval_runtime": 3113.4696, "eval_samples_per_second": 24.762, "eval_steps_per_second": 1.548, "step": 102500 }, { "epoch": 9.0, "learning_rate": 1.0989084641899656e-05, "loss": 0.7039, "step": 103000 }, { "epoch": 9.04, "learning_rate": 1.0510340865568748e-05, "loss": 0.6983, "step": 103500 }, { "epoch": 9.09, "learning_rate": 1.0032554576790503e-05, "loss": 0.7028, "step": 104000 }, { "epoch": 9.13, "learning_rate": 9.553810800459595e-06, "loss": 0.7002, "step": 104500 }, { "epoch": 9.18, "learning_rate": 9.075067024128686e-06, "loss": 0.7002, "step": 105000 }, { "epoch": 9.18, "eval_loss": 0.6783619523048401, "eval_runtime": 3133.9738, "eval_samples_per_second": 24.6, "eval_steps_per_second": 1.538, "step": 105000 }, { "epoch": 9.22, "learning_rate": 8.596323247797778e-06, "loss": 0.699, "step": 105500 }, { "epoch": 9.26, "learning_rate": 8.11757947146687e-06, "loss": 0.6927, "step": 106000 }, { "epoch": 9.31, "learning_rate": 7.638835695135963e-06, "loss": 0.6976, "step": 106500 }, { "epoch": 9.35, "learning_rate": 7.1600919188050565e-06, "loss": 0.6978, "step": 107000 }, { "epoch": 9.39, "learning_rate": 6.681348142474147e-06, "loss": 0.6953, "step": 107500 }, { "epoch": 9.39, "eval_loss": 0.6736627221107483, "eval_runtime": 3133.4547, "eval_samples_per_second": 24.604, "eval_steps_per_second": 1.538, "step": 107500 }, { "epoch": 9.44, "learning_rate": 6.2026043661432406e-06, "loss": 0.6892, "step": 108000 }, { "epoch": 9.48, "learning_rate": 5.723860589812333e-06, "loss": 0.6989, "step": 108500 }, { "epoch": 9.52, "learning_rate": 5.2451168134814254e-06, "loss": 0.6975, "step": 109000 }, { "epoch": 9.57, "learning_rate": 4.767330524703179e-06, "loss": 0.6977, "step": 109500 }, { "epoch": 9.61, "learning_rate": 4.288586748372271e-06, "loss": 0.695, "step": 110000 }, { "epoch": 9.61, "eval_loss": 0.6714410185813904, "eval_runtime": 3139.9991, "eval_samples_per_second": 24.553, "eval_steps_per_second": 1.535, "step": 110000 }, { "epoch": 9.66, "learning_rate": 3.810800459594025e-06, "loss": 0.6947, "step": 110500 }, { "epoch": 9.7, "learning_rate": 3.3320566832631176e-06, "loss": 0.6967, "step": 111000 }, { "epoch": 9.74, "learning_rate": 2.85331290693221e-06, "loss": 0.6904, "step": 111500 }, { "epoch": 9.79, "learning_rate": 2.3745691306013025e-06, "loss": 0.6949, "step": 112000 }, { "epoch": 9.83, "learning_rate": 1.8958253542703947e-06, "loss": 0.6871, "step": 112500 }, { "epoch": 9.83, "eval_loss": 0.6687204837799072, "eval_runtime": 3111.3683, "eval_samples_per_second": 24.779, "eval_steps_per_second": 1.549, "step": 112500 }, { "epoch": 9.87, "learning_rate": 3.6311394284300384e-05, "loss": 0.7034, "step": 113000 }, { "epoch": 9.92, "learning_rate": 3.600210317951256e-05, "loss": 0.7149, "step": 113500 }, { "epoch": 9.96, "learning_rate": 3.5692812074724736e-05, "loss": 0.716, "step": 114000 }, { "epoch": 10.01, "learning_rate": 3.5383520969936905e-05, "loss": 0.7236, "step": 114500 }, { "epoch": 10.05, "learning_rate": 3.5074229865149074e-05, "loss": 0.7161, "step": 115000 }, { "epoch": 10.05, "eval_loss": 0.6960982084274292, "eval_runtime": 3127.0335, "eval_samples_per_second": 24.655, "eval_steps_per_second": 1.541, "step": 115000 }, { "epoch": 10.09, "learning_rate": 3.476493876036126e-05, "loss": 0.7149, "step": 115500 }, { "epoch": 10.14, "learning_rate": 3.4455647655573426e-05, "loss": 0.7203, "step": 116000 }, { "epoch": 10.18, "learning_rate": 3.41463565507856e-05, "loss": 0.7259, "step": 116500 }, { "epoch": 10.22, "learning_rate": 3.383706544599777e-05, "loss": 0.7213, "step": 117000 }, { "epoch": 10.27, "learning_rate": 3.35290115056291e-05, "loss": 0.7265, "step": 117500 }, { "epoch": 10.27, "eval_loss": 0.7006255984306335, "eval_runtime": 3139.1903, "eval_samples_per_second": 24.559, "eval_steps_per_second": 1.535, "step": 117500 }, { "epoch": 10.31, "learning_rate": 3.321972040084128e-05, "loss": 0.7228, "step": 118000 }, { "epoch": 10.35, "learning_rate": 3.2910429296053447e-05, "loss": 0.7214, "step": 118500 }, { "epoch": 10.4, "learning_rate": 3.260113819126562e-05, "loss": 0.7309, "step": 119000 }, { "epoch": 10.44, "learning_rate": 3.229184708647779e-05, "loss": 0.7208, "step": 119500 }, { "epoch": 10.49, "learning_rate": 3.198255598168997e-05, "loss": 0.7284, "step": 120000 }, { "epoch": 10.49, "eval_loss": 0.6941153407096863, "eval_runtime": 3132.0931, "eval_samples_per_second": 24.615, "eval_steps_per_second": 1.539, "step": 120000 }, { "epoch": 10.53, "learning_rate": 3.167326487690214e-05, "loss": 0.7233, "step": 120500 }, { "epoch": 10.57, "learning_rate": 3.136397377211431e-05, "loss": 0.7219, "step": 121000 }, { "epoch": 10.62, "learning_rate": 3.105468266732649e-05, "loss": 0.7163, "step": 121500 }, { "epoch": 10.66, "learning_rate": 3.0745391562538664e-05, "loss": 0.7223, "step": 122000 }, { "epoch": 10.7, "learning_rate": 3.0436100457750836e-05, "loss": 0.724, "step": 122500 }, { "epoch": 10.7, "eval_loss": 0.6886956095695496, "eval_runtime": 3117.6812, "eval_samples_per_second": 24.729, "eval_steps_per_second": 1.546, "step": 122500 }, { "epoch": 10.75, "learning_rate": 3.012680935296301e-05, "loss": 0.7181, "step": 123000 }, { "epoch": 10.79, "learning_rate": 2.9817518248175185e-05, "loss": 0.7215, "step": 123500 }, { "epoch": 10.84, "learning_rate": 2.9508845725596935e-05, "loss": 0.726, "step": 124000 }, { "epoch": 10.88, "learning_rate": 2.9199554620809104e-05, "loss": 0.7205, "step": 124500 }, { "epoch": 10.92, "learning_rate": 2.8890882098230854e-05, "loss": 0.7266, "step": 125000 }, { "epoch": 10.92, "eval_loss": 0.6931244134902954, "eval_runtime": 3180.6149, "eval_samples_per_second": 24.239, "eval_steps_per_second": 1.515, "step": 125000 }, { "epoch": 10.97, "learning_rate": 2.858159099344303e-05, "loss": 0.7214, "step": 125500 }, { "epoch": 11.01, "learning_rate": 2.8272299888655202e-05, "loss": 0.7165, "step": 126000 }, { "epoch": 11.05, "learning_rate": 2.7963627366076955e-05, "loss": 0.7044, "step": 126500 }, { "epoch": 11.1, "learning_rate": 2.7654954843498705e-05, "loss": 0.7071, "step": 127000 }, { "epoch": 11.14, "learning_rate": 2.7345663738710874e-05, "loss": 0.7051, "step": 127500 }, { "epoch": 11.14, "eval_loss": 0.6846074461936951, "eval_runtime": 3127.4042, "eval_samples_per_second": 24.652, "eval_steps_per_second": 1.541, "step": 127500 }, { "epoch": 11.18, "learning_rate": 2.703637263392305e-05, "loss": 0.7139, "step": 128000 }, { "epoch": 11.23, "learning_rate": 2.6727081529135223e-05, "loss": 0.7057, "step": 128500 }, { "epoch": 11.27, "learning_rate": 2.64177904243474e-05, "loss": 0.7104, "step": 129000 }, { "epoch": 11.32, "learning_rate": 2.610849931955957e-05, "loss": 0.7036, "step": 129500 }, { "epoch": 11.36, "learning_rate": 2.5799208214771743e-05, "loss": 0.7106, "step": 130000 }, { "epoch": 11.36, "eval_loss": 0.68162602186203, "eval_runtime": 3071.6142, "eval_samples_per_second": 25.1, "eval_steps_per_second": 1.569, "step": 130000 }, { "epoch": 11.4, "learning_rate": 2.548991710998392e-05, "loss": 0.7072, "step": 130500 }, { "epoch": 11.45, "learning_rate": 2.5180626005196088e-05, "loss": 0.7073, "step": 131000 }, { "epoch": 11.49, "learning_rate": 2.4871334900408268e-05, "loss": 0.7109, "step": 131500 }, { "epoch": 11.53, "learning_rate": 2.456204379562044e-05, "loss": 0.7041, "step": 132000 }, { "epoch": 11.58, "learning_rate": 2.4252752690832612e-05, "loss": 0.7011, "step": 132500 }, { "epoch": 11.58, "eval_loss": 0.6830089688301086, "eval_runtime": 3073.7046, "eval_samples_per_second": 25.082, "eval_steps_per_second": 1.568, "step": 132500 }, { "epoch": 11.62, "learning_rate": 2.3943461586044785e-05, "loss": 0.7041, "step": 133000 }, { "epoch": 11.67, "learning_rate": 2.3634789063466535e-05, "loss": 0.7057, "step": 133500 }, { "epoch": 11.71, "learning_rate": 2.332549795867871e-05, "loss": 0.7068, "step": 134000 }, { "epoch": 11.75, "learning_rate": 2.3016825436100457e-05, "loss": 0.7073, "step": 134500 }, { "epoch": 11.8, "learning_rate": 2.2707534331312633e-05, "loss": 0.6997, "step": 135000 }, { "epoch": 11.8, "eval_loss": 0.6784160137176514, "eval_runtime": 3076.1607, "eval_samples_per_second": 25.062, "eval_steps_per_second": 1.567, "step": 135000 }, { "epoch": 11.84, "learning_rate": 2.2398243226524806e-05, "loss": 0.7062, "step": 135500 }, { "epoch": 11.88, "learning_rate": 2.208895212173698e-05, "loss": 0.6982, "step": 136000 }, { "epoch": 11.93, "learning_rate": 2.1779661016949154e-05, "loss": 0.7, "step": 136500 }, { "epoch": 11.97, "learning_rate": 2.147036991216133e-05, "loss": 0.6981, "step": 137000 }, { "epoch": 12.02, "learning_rate": 2.1161697389583076e-05, "loss": 0.6969, "step": 137500 }, { "epoch": 12.02, "eval_loss": 0.673393189907074, "eval_runtime": 3053.9536, "eval_samples_per_second": 25.245, "eval_steps_per_second": 1.578, "step": 137500 }, { "epoch": 12.06, "learning_rate": 2.0852406284795252e-05, "loss": 0.6961, "step": 138000 }, { "epoch": 12.1, "learning_rate": 2.0543115180007425e-05, "loss": 0.6941, "step": 138500 }, { "epoch": 12.15, "learning_rate": 2.0233824075219597e-05, "loss": 0.6938, "step": 139000 }, { "epoch": 12.19, "learning_rate": 1.9924532970431773e-05, "loss": 0.6968, "step": 139500 }, { "epoch": 12.23, "learning_rate": 1.9615241865643945e-05, "loss": 0.6968, "step": 140000 }, { "epoch": 12.23, "eval_loss": 0.6708864569664001, "eval_runtime": 3020.4771, "eval_samples_per_second": 25.524, "eval_steps_per_second": 1.595, "step": 140000 }, { "epoch": 12.28, "learning_rate": 1.9305950760856118e-05, "loss": 0.6928, "step": 140500 }, { "epoch": 12.32, "learning_rate": 1.899665965606829e-05, "loss": 0.6975, "step": 141000 }, { "epoch": 12.36, "learning_rate": 1.8687368551280466e-05, "loss": 0.6912, "step": 141500 }, { "epoch": 12.41, "learning_rate": 1.837807744649264e-05, "loss": 0.6923, "step": 142000 }, { "epoch": 12.45, "learning_rate": 1.8068786341704814e-05, "loss": 0.6867, "step": 142500 }, { "epoch": 12.45, "eval_loss": 0.6655944585800171, "eval_runtime": 3033.3682, "eval_samples_per_second": 25.416, "eval_steps_per_second": 1.589, "step": 142500 }, { "epoch": 12.5, "learning_rate": 1.7759495236916987e-05, "loss": 0.6894, "step": 143000 }, { "epoch": 12.54, "learning_rate": 1.745020413212916e-05, "loss": 0.6861, "step": 143500 }, { "epoch": 12.58, "learning_rate": 1.714153160955091e-05, "loss": 0.6917, "step": 144000 }, { "epoch": 12.63, "learning_rate": 1.6832240504763082e-05, "loss": 0.688, "step": 144500 }, { "epoch": 12.67, "learning_rate": 1.6522949399975258e-05, "loss": 0.6925, "step": 145000 }, { "epoch": 12.67, "eval_loss": 0.6661481261253357, "eval_runtime": 3083.2268, "eval_samples_per_second": 25.005, "eval_steps_per_second": 1.563, "step": 145000 }, { "epoch": 12.71, "learning_rate": 1.621365829518743e-05, "loss": 0.6943, "step": 145500 }, { "epoch": 12.76, "learning_rate": 1.5904367190399606e-05, "loss": 0.6891, "step": 146000 }, { "epoch": 12.8, "learning_rate": 1.559507608561178e-05, "loss": 0.6848, "step": 146500 }, { "epoch": 12.85, "learning_rate": 1.5285784980823954e-05, "loss": 0.6816, "step": 147000 }, { "epoch": 12.89, "learning_rate": 1.49771124582457e-05, "loss": 0.6795, "step": 147500 }, { "epoch": 12.89, "eval_loss": 0.6605859398841858, "eval_runtime": 3089.4045, "eval_samples_per_second": 24.955, "eval_steps_per_second": 1.56, "step": 147500 }, { "epoch": 12.93, "learning_rate": 1.4667821353457875e-05, "loss": 0.6779, "step": 148000 }, { "epoch": 12.98, "learning_rate": 1.4358530248670049e-05, "loss": 0.6846, "step": 148500 }, { "epoch": 13.02, "learning_rate": 1.4049239143882223e-05, "loss": 0.6739, "step": 149000 }, { "epoch": 13.06, "learning_rate": 1.3740566621303971e-05, "loss": 0.6772, "step": 149500 }, { "epoch": 13.11, "learning_rate": 1.3431275516516146e-05, "loss": 0.6774, "step": 150000 }, { "epoch": 13.11, "eval_loss": 0.6616868376731873, "eval_runtime": 3137.3755, "eval_samples_per_second": 24.573, "eval_steps_per_second": 1.536, "step": 150000 }, { "epoch": 13.15, "learning_rate": 1.3122602993937894e-05, "loss": 0.6768, "step": 150500 }, { "epoch": 13.19, "learning_rate": 1.2813311889150068e-05, "loss": 0.6794, "step": 151000 }, { "epoch": 13.24, "learning_rate": 1.2504020784362242e-05, "loss": 0.6816, "step": 151500 }, { "epoch": 13.28, "learning_rate": 1.2194729679574415e-05, "loss": 0.6748, "step": 152000 }, { "epoch": 13.33, "learning_rate": 1.1886057156996165e-05, "loss": 0.6756, "step": 152500 }, { "epoch": 13.33, "eval_loss": 0.6562890410423279, "eval_runtime": 3110.9217, "eval_samples_per_second": 24.782, "eval_steps_per_second": 1.549, "step": 152500 }, { "epoch": 13.37, "learning_rate": 1.1576766052208339e-05, "loss": 0.672, "step": 153000 }, { "epoch": 13.41, "learning_rate": 1.1267474947420513e-05, "loss": 0.6711, "step": 153500 }, { "epoch": 13.46, "learning_rate": 1.0958183842632687e-05, "loss": 0.6731, "step": 154000 }, { "epoch": 13.5, "learning_rate": 1.064889273784486e-05, "loss": 0.6707, "step": 154500 }, { "epoch": 13.54, "learning_rate": 1.0339601633057034e-05, "loss": 0.6728, "step": 155000 }, { "epoch": 13.54, "eval_loss": 0.6547101140022278, "eval_runtime": 3126.1621, "eval_samples_per_second": 24.662, "eval_steps_per_second": 1.542, "step": 155000 }, { "epoch": 13.59, "learning_rate": 1.0030929110478784e-05, "loss": 0.6727, "step": 155500 }, { "epoch": 13.63, "learning_rate": 9.721638005690958e-06, "loss": 0.6679, "step": 156000 }, { "epoch": 13.68, "learning_rate": 9.41234690090313e-06, "loss": 0.6626, "step": 156500 }, { "epoch": 13.72, "learning_rate": 9.103055796115304e-06, "loss": 0.6703, "step": 157000 }, { "epoch": 13.76, "learning_rate": 8.793764691327477e-06, "loss": 0.6732, "step": 157500 }, { "epoch": 13.76, "eval_loss": 0.6520426273345947, "eval_runtime": 3118.7072, "eval_samples_per_second": 24.72, "eval_steps_per_second": 1.545, "step": 157500 }, { "epoch": 13.81, "learning_rate": 8.484473586539651e-06, "loss": 0.6734, "step": 158000 }, { "epoch": 13.85, "learning_rate": 8.175182481751825e-06, "loss": 0.6707, "step": 158500 }, { "epoch": 13.89, "learning_rate": 7.865891376964e-06, "loss": 0.667, "step": 159000 }, { "epoch": 13.94, "learning_rate": 7.557218854385749e-06, "loss": 0.6667, "step": 159500 }, { "epoch": 13.98, "learning_rate": 7.2485463318074974e-06, "loss": 0.6704, "step": 160000 }, { "epoch": 13.98, "eval_loss": 0.6492029428482056, "eval_runtime": 3114.0251, "eval_samples_per_second": 24.758, "eval_steps_per_second": 1.548, "step": 160000 }, { "epoch": 14.02, "learning_rate": 6.939255227019672e-06, "loss": 0.6631, "step": 160500 }, { "epoch": 14.07, "learning_rate": 6.629964122231845e-06, "loss": 0.663, "step": 161000 }, { "epoch": 14.11, "learning_rate": 6.320673017444019e-06, "loss": 0.6661, "step": 161500 }, { "epoch": 14.16, "learning_rate": 6.011381912656192e-06, "loss": 0.6641, "step": 162000 }, { "epoch": 14.2, "learning_rate": 5.7020908078683665e-06, "loss": 0.6666, "step": 162500 }, { "epoch": 14.2, "eval_loss": 0.6445870995521545, "eval_runtime": 3137.0802, "eval_samples_per_second": 24.576, "eval_steps_per_second": 1.536, "step": 162500 }, { "epoch": 14.24, "learning_rate": 5.392799703080539e-06, "loss": 0.6612, "step": 163000 }, { "epoch": 14.29, "learning_rate": 5.083508598292713e-06, "loss": 0.6645, "step": 163500 }, { "epoch": 14.33, "learning_rate": 4.774217493504887e-06, "loss": 0.6613, "step": 164000 }, { "epoch": 14.37, "learning_rate": 4.464926388717061e-06, "loss": 0.6608, "step": 164500 }, { "epoch": 14.42, "learning_rate": 4.155635283929235e-06, "loss": 0.6615, "step": 165000 }, { "epoch": 14.42, "eval_loss": 0.6487849950790405, "eval_runtime": 3095.1776, "eval_samples_per_second": 24.908, "eval_steps_per_second": 1.557, "step": 165000 }, { "epoch": 14.46, "learning_rate": 3.846344179141408e-06, "loss": 0.6598, "step": 165500 }, { "epoch": 14.51, "learning_rate": 3.537671656563157e-06, "loss": 0.6569, "step": 166000 }, { "epoch": 14.55, "learning_rate": 3.2289991339849062e-06, "loss": 0.6587, "step": 166500 }, { "epoch": 14.59, "learning_rate": 2.920326611406656e-06, "loss": 0.6575, "step": 167000 }, { "epoch": 14.64, "learning_rate": 2.61103550661883e-06, "loss": 0.6638, "step": 167500 }, { "epoch": 14.64, "eval_loss": 0.6522655487060547, "eval_runtime": 3117.0073, "eval_samples_per_second": 24.734, "eval_steps_per_second": 1.546, "step": 167500 }, { "epoch": 14.68, "learning_rate": 2.3017444018310032e-06, "loss": 0.6553, "step": 168000 }, { "epoch": 14.72, "learning_rate": 1.9930718792527527e-06, "loss": 0.6617, "step": 168500 }, { "epoch": 14.77, "learning_rate": 1.6837807744649262e-06, "loss": 0.6583, "step": 169000 }, { "epoch": 14.81, "learning_rate": 1.3744896696771002e-06, "loss": 0.6579, "step": 169500 }, { "epoch": 14.85, "learning_rate": 1.0651985648892737e-06, "loss": 0.6588, "step": 170000 }, { "epoch": 14.85, "eval_loss": 0.6414651274681091, "eval_runtime": 3135.6411, "eval_samples_per_second": 24.587, "eval_steps_per_second": 1.537, "step": 170000 }, { "epoch": 14.9, "learning_rate": 7.559074601014475e-07, "loss": 0.6589, "step": 170500 }, { "epoch": 14.94, "learning_rate": 4.4661635531362117e-07, "loss": 0.6582, "step": 171000 }, { "epoch": 14.99, "learning_rate": 1.3732525052579488e-07, "loss": 0.6569, "step": 171500 }, { "epoch": 15.0, "step": 171660, "total_flos": 2.5739338132512957e+18, "train_loss": 0.23807151880656507, "train_runtime": 725915.7505, "train_samples_per_second": 30.268, "train_steps_per_second": 0.236 } ], "max_steps": 171660, "num_train_epochs": 15, "total_flos": 2.5739338132512957e+18, "trial_name": null, "trial_params": null }