{ "best_metric": 0.8695652173913043, "best_model_checkpoint": "swiftformer-xs-ve-U13-b-80\\checkpoint-175", "epoch": 73.84615384615384, "eval_steps": 500, "global_step": 480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.92, "eval_accuracy": 0.10869565217391304, "eval_loss": 1.4460796117782593, "eval_runtime": 0.7125, "eval_samples_per_second": 64.563, "eval_steps_per_second": 2.807, "step": 6 }, { "epoch": 1.54, "learning_rate": 4.166666666666667e-05, "loss": 1.3993, "step": 10 }, { "epoch": 2.0, "eval_accuracy": 0.10869565217391304, "eval_loss": 1.4435073137283325, "eval_runtime": 0.6148, "eval_samples_per_second": 74.823, "eval_steps_per_second": 3.253, "step": 13 }, { "epoch": 2.92, "eval_accuracy": 0.10869565217391304, "eval_loss": 1.4388779401779175, "eval_runtime": 0.6122, "eval_samples_per_second": 75.137, "eval_steps_per_second": 3.267, "step": 19 }, { "epoch": 3.08, "learning_rate": 8.333333333333334e-05, "loss": 1.3849, "step": 20 }, { "epoch": 4.0, "eval_accuracy": 0.10869565217391304, "eval_loss": 1.4283998012542725, "eval_runtime": 0.6101, "eval_samples_per_second": 75.4, "eval_steps_per_second": 3.278, "step": 26 }, { "epoch": 4.62, "learning_rate": 0.000125, "loss": 1.3287, "step": 30 }, { "epoch": 4.92, "eval_accuracy": 0.13043478260869565, "eval_loss": 1.4222790002822876, "eval_runtime": 0.6169, "eval_samples_per_second": 74.563, "eval_steps_per_second": 3.242, "step": 32 }, { "epoch": 6.0, "eval_accuracy": 0.10869565217391304, "eval_loss": 1.4646577835083008, "eval_runtime": 0.6082, "eval_samples_per_second": 75.638, "eval_steps_per_second": 3.289, "step": 39 }, { "epoch": 6.15, "learning_rate": 0.0001666666666666667, "loss": 1.2128, "step": 40 }, { "epoch": 6.92, "eval_accuracy": 0.17391304347826086, "eval_loss": 1.4183566570281982, "eval_runtime": 0.64, "eval_samples_per_second": 71.878, "eval_steps_per_second": 3.125, "step": 45 }, { "epoch": 7.69, "learning_rate": 0.0001990740740740741, "loss": 1.122, "step": 50 }, { "epoch": 8.0, "eval_accuracy": 0.1956521739130435, "eval_loss": 1.3262405395507812, "eval_runtime": 0.6384, "eval_samples_per_second": 72.055, "eval_steps_per_second": 3.133, "step": 52 }, { "epoch": 8.92, "eval_accuracy": 0.1956521739130435, "eval_loss": 1.3297909498214722, "eval_runtime": 0.5935, "eval_samples_per_second": 77.507, "eval_steps_per_second": 3.37, "step": 58 }, { "epoch": 9.23, "learning_rate": 0.00019444444444444446, "loss": 1.0062, "step": 60 }, { "epoch": 10.0, "eval_accuracy": 0.3695652173913043, "eval_loss": 1.2035472393035889, "eval_runtime": 0.5924, "eval_samples_per_second": 77.654, "eval_steps_per_second": 3.376, "step": 65 }, { "epoch": 10.77, "learning_rate": 0.00018981481481481483, "loss": 0.872, "step": 70 }, { "epoch": 10.92, "eval_accuracy": 0.32608695652173914, "eval_loss": 1.3667021989822388, "eval_runtime": 0.6186, "eval_samples_per_second": 74.367, "eval_steps_per_second": 3.233, "step": 71 }, { "epoch": 12.0, "eval_accuracy": 0.43478260869565216, "eval_loss": 1.201258897781372, "eval_runtime": 0.6083, "eval_samples_per_second": 75.623, "eval_steps_per_second": 3.288, "step": 78 }, { "epoch": 12.31, "learning_rate": 0.0001851851851851852, "loss": 0.814, "step": 80 }, { "epoch": 12.92, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.9996185898780823, "eval_runtime": 0.6063, "eval_samples_per_second": 75.874, "eval_steps_per_second": 3.299, "step": 84 }, { "epoch": 13.85, "learning_rate": 0.00018055555555555557, "loss": 0.7228, "step": 90 }, { "epoch": 14.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.9706003069877625, "eval_runtime": 0.6069, "eval_samples_per_second": 75.799, "eval_steps_per_second": 3.296, "step": 91 }, { "epoch": 14.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.9295421242713928, "eval_runtime": 0.6331, "eval_samples_per_second": 72.657, "eval_steps_per_second": 3.159, "step": 97 }, { "epoch": 15.38, "learning_rate": 0.00017592592592592595, "loss": 0.6473, "step": 100 }, { "epoch": 16.0, "eval_accuracy": 0.717391304347826, "eval_loss": 0.8987626433372498, "eval_runtime": 0.607, "eval_samples_per_second": 75.787, "eval_steps_per_second": 3.295, "step": 104 }, { "epoch": 16.92, "learning_rate": 0.00017129629629629632, "loss": 0.5696, "step": 110 }, { "epoch": 16.92, "eval_accuracy": 0.6739130434782609, "eval_loss": 0.9612070322036743, "eval_runtime": 0.6266, "eval_samples_per_second": 73.408, "eval_steps_per_second": 3.192, "step": 110 }, { "epoch": 18.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.8712936043739319, "eval_runtime": 0.5923, "eval_samples_per_second": 77.662, "eval_steps_per_second": 3.377, "step": 117 }, { "epoch": 18.46, "learning_rate": 0.0001666666666666667, "loss": 0.5546, "step": 120 }, { "epoch": 18.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.8049858212471008, "eval_runtime": 0.6052, "eval_samples_per_second": 76.002, "eval_steps_per_second": 3.304, "step": 123 }, { "epoch": 20.0, "learning_rate": 0.00016203703703703706, "loss": 0.4747, "step": 130 }, { "epoch": 20.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7724586129188538, "eval_runtime": 0.5976, "eval_samples_per_second": 76.969, "eval_steps_per_second": 3.346, "step": 130 }, { "epoch": 20.92, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.7933102250099182, "eval_runtime": 0.5936, "eval_samples_per_second": 77.487, "eval_steps_per_second": 3.369, "step": 136 }, { "epoch": 21.54, "learning_rate": 0.00015740740740740743, "loss": 0.4393, "step": 140 }, { "epoch": 22.0, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.7665086984634399, "eval_runtime": 0.6062, "eval_samples_per_second": 75.883, "eval_steps_per_second": 3.299, "step": 143 }, { "epoch": 22.92, "eval_accuracy": 0.717391304347826, "eval_loss": 0.7885972857475281, "eval_runtime": 0.644, "eval_samples_per_second": 71.425, "eval_steps_per_second": 3.105, "step": 149 }, { "epoch": 23.08, "learning_rate": 0.00015277777777777777, "loss": 0.4077, "step": 150 }, { "epoch": 24.0, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.7824200391769409, "eval_runtime": 0.6187, "eval_samples_per_second": 74.344, "eval_steps_per_second": 3.232, "step": 156 }, { "epoch": 24.62, "learning_rate": 0.00014814814814814815, "loss": 0.3326, "step": 160 }, { "epoch": 24.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.7020803689956665, "eval_runtime": 0.6226, "eval_samples_per_second": 73.885, "eval_steps_per_second": 3.212, "step": 162 }, { "epoch": 26.0, "eval_accuracy": 0.8260869565217391, "eval_loss": 0.6346074342727661, "eval_runtime": 0.611, "eval_samples_per_second": 75.283, "eval_steps_per_second": 3.273, "step": 169 }, { "epoch": 26.15, "learning_rate": 0.00014351851851851852, "loss": 0.315, "step": 170 }, { "epoch": 26.92, "eval_accuracy": 0.8695652173913043, "eval_loss": 0.6163212060928345, "eval_runtime": 0.6094, "eval_samples_per_second": 75.49, "eval_steps_per_second": 3.282, "step": 175 }, { "epoch": 27.69, "learning_rate": 0.0001388888888888889, "loss": 0.2729, "step": 180 }, { "epoch": 28.0, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.6938338875770569, "eval_runtime": 0.6252, "eval_samples_per_second": 73.577, "eval_steps_per_second": 3.199, "step": 182 }, { "epoch": 28.92, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.7417004704475403, "eval_runtime": 0.6265, "eval_samples_per_second": 73.429, "eval_steps_per_second": 3.193, "step": 188 }, { "epoch": 29.23, "learning_rate": 0.00013425925925925926, "loss": 0.2218, "step": 190 }, { "epoch": 30.0, "eval_accuracy": 0.782608695652174, "eval_loss": 0.6668894290924072, "eval_runtime": 0.6201, "eval_samples_per_second": 74.182, "eval_steps_per_second": 3.225, "step": 195 }, { "epoch": 30.77, "learning_rate": 0.00012962962962962963, "loss": 0.2499, "step": 200 }, { "epoch": 30.92, "eval_accuracy": 0.717391304347826, "eval_loss": 0.7111229300498962, "eval_runtime": 0.6369, "eval_samples_per_second": 72.225, "eval_steps_per_second": 3.14, "step": 201 }, { "epoch": 32.0, "eval_accuracy": 0.782608695652174, "eval_loss": 0.6729680299758911, "eval_runtime": 0.6117, "eval_samples_per_second": 75.197, "eval_steps_per_second": 3.269, "step": 208 }, { "epoch": 32.31, "learning_rate": 0.000125, "loss": 0.2218, "step": 210 }, { "epoch": 32.92, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.6511849164962769, "eval_runtime": 0.6114, "eval_samples_per_second": 75.242, "eval_steps_per_second": 3.271, "step": 214 }, { "epoch": 33.85, "learning_rate": 0.00012037037037037037, "loss": 0.2037, "step": 220 }, { "epoch": 34.0, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.7164542078971863, "eval_runtime": 0.6112, "eval_samples_per_second": 75.258, "eval_steps_per_second": 3.272, "step": 221 }, { "epoch": 34.92, "eval_accuracy": 0.8260869565217391, "eval_loss": 0.6299737691879272, "eval_runtime": 0.6021, "eval_samples_per_second": 76.406, "eval_steps_per_second": 3.322, "step": 227 }, { "epoch": 35.38, "learning_rate": 0.00011574074074074075, "loss": 0.2367, "step": 230 }, { "epoch": 36.0, "eval_accuracy": 0.782608695652174, "eval_loss": 0.7421004176139832, "eval_runtime": 0.6016, "eval_samples_per_second": 76.464, "eval_steps_per_second": 3.325, "step": 234 }, { "epoch": 36.92, "learning_rate": 0.00011111111111111112, "loss": 0.1835, "step": 240 }, { "epoch": 36.92, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.6643686294555664, "eval_runtime": 0.6164, "eval_samples_per_second": 74.628, "eval_steps_per_second": 3.245, "step": 240 }, { "epoch": 38.0, "eval_accuracy": 0.8260869565217391, "eval_loss": 0.6250634789466858, "eval_runtime": 0.6064, "eval_samples_per_second": 75.856, "eval_steps_per_second": 3.298, "step": 247 }, { "epoch": 38.46, "learning_rate": 0.00010648148148148149, "loss": 0.2073, "step": 250 }, { "epoch": 38.92, "eval_accuracy": 0.782608695652174, "eval_loss": 0.6430649161338806, "eval_runtime": 0.6133, "eval_samples_per_second": 75.001, "eval_steps_per_second": 3.261, "step": 253 }, { "epoch": 40.0, "learning_rate": 0.00010185185185185186, "loss": 0.1643, "step": 260 }, { "epoch": 40.0, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.634781539440155, "eval_runtime": 0.6208, "eval_samples_per_second": 74.097, "eval_steps_per_second": 3.222, "step": 260 }, { "epoch": 40.92, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.6192363500595093, "eval_runtime": 0.6183, "eval_samples_per_second": 74.394, "eval_steps_per_second": 3.235, "step": 266 }, { "epoch": 41.54, "learning_rate": 9.722222222222223e-05, "loss": 0.1685, "step": 270 }, { "epoch": 42.0, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.6753482818603516, "eval_runtime": 0.5997, "eval_samples_per_second": 76.701, "eval_steps_per_second": 3.335, "step": 273 }, { "epoch": 42.92, "eval_accuracy": 0.782608695652174, "eval_loss": 0.7440155148506165, "eval_runtime": 0.6117, "eval_samples_per_second": 75.199, "eval_steps_per_second": 3.27, "step": 279 }, { "epoch": 43.08, "learning_rate": 9.25925925925926e-05, "loss": 0.1539, "step": 280 }, { "epoch": 44.0, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.7504969239234924, "eval_runtime": 0.6044, "eval_samples_per_second": 76.107, "eval_steps_per_second": 3.309, "step": 286 }, { "epoch": 44.62, "learning_rate": 8.796296296296297e-05, "loss": 0.1658, "step": 290 }, { "epoch": 44.92, "eval_accuracy": 0.8260869565217391, "eval_loss": 0.6331078410148621, "eval_runtime": 0.6101, "eval_samples_per_second": 75.401, "eval_steps_per_second": 3.278, "step": 292 }, { "epoch": 46.0, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.6549533009529114, "eval_runtime": 0.8345, "eval_samples_per_second": 55.12, "eval_steps_per_second": 2.397, "step": 299 }, { "epoch": 46.15, "learning_rate": 8.333333333333334e-05, "loss": 0.1596, "step": 300 }, { "epoch": 46.92, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.6824274063110352, "eval_runtime": 0.6289, "eval_samples_per_second": 73.143, "eval_steps_per_second": 3.18, "step": 305 }, { "epoch": 47.69, "learning_rate": 7.870370370370372e-05, "loss": 0.1534, "step": 310 }, { "epoch": 48.0, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.6970986723899841, "eval_runtime": 0.6878, "eval_samples_per_second": 66.875, "eval_steps_per_second": 2.908, "step": 312 }, { "epoch": 48.92, "eval_accuracy": 0.8260869565217391, "eval_loss": 0.6347023248672485, "eval_runtime": 0.6022, "eval_samples_per_second": 76.388, "eval_steps_per_second": 3.321, "step": 318 }, { "epoch": 49.23, "learning_rate": 7.407407407407407e-05, "loss": 0.1677, "step": 320 }, { "epoch": 50.0, "eval_accuracy": 0.8260869565217391, "eval_loss": 0.6392168998718262, "eval_runtime": 0.6333, "eval_samples_per_second": 72.632, "eval_steps_per_second": 3.158, "step": 325 }, { "epoch": 50.77, "learning_rate": 6.944444444444444e-05, "loss": 0.1453, "step": 330 }, { "epoch": 50.92, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.6368798017501831, "eval_runtime": 0.6455, "eval_samples_per_second": 71.262, "eval_steps_per_second": 3.098, "step": 331 }, { "epoch": 52.0, "eval_accuracy": 0.782608695652174, "eval_loss": 0.623034656047821, "eval_runtime": 0.7388, "eval_samples_per_second": 62.259, "eval_steps_per_second": 2.707, "step": 338 }, { "epoch": 52.31, "learning_rate": 6.481481481481482e-05, "loss": 0.1385, "step": 340 }, { "epoch": 52.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.6432219743728638, "eval_runtime": 0.6128, "eval_samples_per_second": 75.06, "eval_steps_per_second": 3.263, "step": 344 }, { "epoch": 53.85, "learning_rate": 6.018518518518519e-05, "loss": 0.1221, "step": 350 }, { "epoch": 54.0, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.6757408380508423, "eval_runtime": 0.5947, "eval_samples_per_second": 77.346, "eval_steps_per_second": 3.363, "step": 351 }, { "epoch": 54.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7383140921592712, "eval_runtime": 0.6233, "eval_samples_per_second": 73.805, "eval_steps_per_second": 3.209, "step": 357 }, { "epoch": 55.38, "learning_rate": 5.555555555555556e-05, "loss": 0.1433, "step": 360 }, { "epoch": 56.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7100470066070557, "eval_runtime": 0.6051, "eval_samples_per_second": 76.02, "eval_steps_per_second": 3.305, "step": 364 }, { "epoch": 56.92, "learning_rate": 5.092592592592593e-05, "loss": 0.1567, "step": 370 }, { "epoch": 56.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.6861900091171265, "eval_runtime": 0.6176, "eval_samples_per_second": 74.482, "eval_steps_per_second": 3.238, "step": 370 }, { "epoch": 58.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.6653789281845093, "eval_runtime": 0.6048, "eval_samples_per_second": 76.061, "eval_steps_per_second": 3.307, "step": 377 }, { "epoch": 58.46, "learning_rate": 4.62962962962963e-05, "loss": 0.1361, "step": 380 }, { "epoch": 58.92, "eval_accuracy": 0.782608695652174, "eval_loss": 0.66651451587677, "eval_runtime": 0.6057, "eval_samples_per_second": 75.942, "eval_steps_per_second": 3.302, "step": 383 }, { "epoch": 60.0, "learning_rate": 4.166666666666667e-05, "loss": 0.1157, "step": 390 }, { "epoch": 60.0, "eval_accuracy": 0.782608695652174, "eval_loss": 0.6439006924629211, "eval_runtime": 0.6871, "eval_samples_per_second": 66.95, "eval_steps_per_second": 2.911, "step": 390 }, { "epoch": 60.92, "eval_accuracy": 0.8260869565217391, "eval_loss": 0.6306149959564209, "eval_runtime": 0.6235, "eval_samples_per_second": 73.782, "eval_steps_per_second": 3.208, "step": 396 }, { "epoch": 61.54, "learning_rate": 3.7037037037037037e-05, "loss": 0.0934, "step": 400 }, { "epoch": 62.0, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.6546051502227783, "eval_runtime": 0.6648, "eval_samples_per_second": 69.19, "eval_steps_per_second": 3.008, "step": 403 }, { "epoch": 62.92, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.665103018283844, "eval_runtime": 0.6067, "eval_samples_per_second": 75.824, "eval_steps_per_second": 3.297, "step": 409 }, { "epoch": 63.08, "learning_rate": 3.240740740740741e-05, "loss": 0.1123, "step": 410 }, { "epoch": 64.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.6568355560302734, "eval_runtime": 0.6061, "eval_samples_per_second": 75.896, "eval_steps_per_second": 3.3, "step": 416 }, { "epoch": 64.62, "learning_rate": 2.777777777777778e-05, "loss": 0.0855, "step": 420 }, { "epoch": 64.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.6506606936454773, "eval_runtime": 0.6256, "eval_samples_per_second": 73.528, "eval_steps_per_second": 3.197, "step": 422 }, { "epoch": 66.0, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.6667113304138184, "eval_runtime": 0.6184, "eval_samples_per_second": 74.384, "eval_steps_per_second": 3.234, "step": 429 }, { "epoch": 66.15, "learning_rate": 2.314814814814815e-05, "loss": 0.1135, "step": 430 }, { "epoch": 66.92, "eval_accuracy": 0.782608695652174, "eval_loss": 0.6515849232673645, "eval_runtime": 0.6641, "eval_samples_per_second": 69.267, "eval_steps_per_second": 3.012, "step": 435 }, { "epoch": 67.69, "learning_rate": 1.8518518518518518e-05, "loss": 0.0932, "step": 440 }, { "epoch": 68.0, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.6596114039421082, "eval_runtime": 0.5982, "eval_samples_per_second": 76.895, "eval_steps_per_second": 3.343, "step": 442 }, { "epoch": 68.92, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.677249550819397, "eval_runtime": 0.6014, "eval_samples_per_second": 76.488, "eval_steps_per_second": 3.326, "step": 448 }, { "epoch": 69.23, "learning_rate": 1.388888888888889e-05, "loss": 0.1228, "step": 450 }, { "epoch": 70.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.6525955200195312, "eval_runtime": 0.6064, "eval_samples_per_second": 75.857, "eval_steps_per_second": 3.298, "step": 455 }, { "epoch": 70.77, "learning_rate": 9.259259259259259e-06, "loss": 0.0878, "step": 460 }, { "epoch": 70.92, "eval_accuracy": 0.8260869565217391, "eval_loss": 0.6731473207473755, "eval_runtime": 0.6319, "eval_samples_per_second": 72.794, "eval_steps_per_second": 3.165, "step": 461 }, { "epoch": 72.0, "eval_accuracy": 0.782608695652174, "eval_loss": 0.6350728869438171, "eval_runtime": 0.626, "eval_samples_per_second": 73.479, "eval_steps_per_second": 3.195, "step": 468 }, { "epoch": 72.31, "learning_rate": 4.6296296296296296e-06, "loss": 0.1073, "step": 470 }, { "epoch": 72.92, "eval_accuracy": 0.782608695652174, "eval_loss": 0.6268617510795593, "eval_runtime": 0.6109, "eval_samples_per_second": 75.304, "eval_steps_per_second": 3.274, "step": 474 }, { "epoch": 73.85, "learning_rate": 0.0, "loss": 0.1028, "step": 480 }, { "epoch": 73.85, "eval_accuracy": 0.782608695652174, "eval_loss": 0.6742560267448425, "eval_runtime": 0.6109, "eval_samples_per_second": 75.3, "eval_steps_per_second": 3.274, "step": 480 }, { "epoch": 73.85, "step": 480, "total_flos": 1.6581977329862246e+17, "train_loss": 0.38355133185784024, "train_runtime": 282.7663, "train_samples_per_second": 231.711, "train_steps_per_second": 1.698 } ], "logging_steps": 10, "max_steps": 480, "num_input_tokens_seen": 0, "num_train_epochs": 80, "save_steps": 500, "total_flos": 1.6581977329862246e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }