{ "best_metric": 0.8043478260869565, "best_model_checkpoint": "swiftformer-xs-ve-U13-b-80c\\checkpoint-175", "epoch": 73.84615384615384, "eval_steps": 500, "global_step": 480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.92, "eval_accuracy": 0.2391304347826087, "eval_loss": 1.3860037326812744, "eval_runtime": 0.6497, "eval_samples_per_second": 70.799, "eval_steps_per_second": 3.078, "step": 6 }, { "epoch": 1.54, "learning_rate": 2.777777777777778e-05, "loss": 1.3859, "step": 10 }, { "epoch": 2.0, "eval_accuracy": 0.30434782608695654, "eval_loss": 1.3843744993209839, "eval_runtime": 0.6508, "eval_samples_per_second": 70.684, "eval_steps_per_second": 3.073, "step": 13 }, { "epoch": 2.92, "eval_accuracy": 0.1956521739130435, "eval_loss": 1.3819622993469238, "eval_runtime": 0.6238, "eval_samples_per_second": 73.745, "eval_steps_per_second": 3.206, "step": 19 }, { "epoch": 3.08, "learning_rate": 5.555555555555556e-05, "loss": 1.381, "step": 20 }, { "epoch": 4.0, "eval_accuracy": 0.17391304347826086, "eval_loss": 1.3745858669281006, "eval_runtime": 0.6174, "eval_samples_per_second": 74.504, "eval_steps_per_second": 3.239, "step": 26 }, { "epoch": 4.62, "learning_rate": 8.333333333333334e-05, "loss": 1.3573, "step": 30 }, { "epoch": 4.92, "eval_accuracy": 0.1956521739130435, "eval_loss": 1.3642687797546387, "eval_runtime": 0.5993, "eval_samples_per_second": 76.759, "eval_steps_per_second": 3.337, "step": 32 }, { "epoch": 6.0, "eval_accuracy": 0.15217391304347827, "eval_loss": 1.3561071157455444, "eval_runtime": 0.6092, "eval_samples_per_second": 75.505, "eval_steps_per_second": 3.283, "step": 39 }, { "epoch": 6.15, "learning_rate": 0.00011111111111111112, "loss": 1.2692, "step": 40 }, { "epoch": 6.92, "eval_accuracy": 0.15217391304347827, "eval_loss": 1.3582805395126343, "eval_runtime": 0.6066, "eval_samples_per_second": 75.83, "eval_steps_per_second": 3.297, "step": 45 }, { "epoch": 7.69, "learning_rate": 0.0001388888888888889, "loss": 1.1682, "step": 50 }, { "epoch": 8.0, "eval_accuracy": 0.17391304347826086, "eval_loss": 1.3623026609420776, "eval_runtime": 0.6189, "eval_samples_per_second": 74.33, "eval_steps_per_second": 3.232, "step": 52 }, { "epoch": 8.92, "eval_accuracy": 0.2608695652173913, "eval_loss": 1.3296467065811157, "eval_runtime": 0.6038, "eval_samples_per_second": 76.178, "eval_steps_per_second": 3.312, "step": 58 }, { "epoch": 9.23, "learning_rate": 0.0001666666666666667, "loss": 1.1005, "step": 60 }, { "epoch": 10.0, "eval_accuracy": 0.391304347826087, "eval_loss": 1.266342282295227, "eval_runtime": 0.6296, "eval_samples_per_second": 73.06, "eval_steps_per_second": 3.177, "step": 65 }, { "epoch": 10.77, "learning_rate": 0.00019444444444444446, "loss": 0.9884, "step": 70 }, { "epoch": 10.92, "eval_accuracy": 0.3695652173913043, "eval_loss": 1.3159973621368408, "eval_runtime": 0.6014, "eval_samples_per_second": 76.491, "eval_steps_per_second": 3.326, "step": 71 }, { "epoch": 12.0, "eval_accuracy": 0.4782608695652174, "eval_loss": 1.1806195974349976, "eval_runtime": 0.6259, "eval_samples_per_second": 73.491, "eval_steps_per_second": 3.195, "step": 78 }, { "epoch": 12.31, "learning_rate": 0.000196078431372549, "loss": 0.9111, "step": 80 }, { "epoch": 12.92, "eval_accuracy": 0.6086956521739131, "eval_loss": 1.15597403049469, "eval_runtime": 0.6243, "eval_samples_per_second": 73.68, "eval_steps_per_second": 3.203, "step": 84 }, { "epoch": 13.85, "learning_rate": 0.0001911764705882353, "loss": 0.8464, "step": 90 }, { "epoch": 14.0, "eval_accuracy": 0.5869565217391305, "eval_loss": 1.1349503993988037, "eval_runtime": 0.6226, "eval_samples_per_second": 73.889, "eval_steps_per_second": 3.213, "step": 91 }, { "epoch": 14.92, "eval_accuracy": 0.6304347826086957, "eval_loss": 1.0768355131149292, "eval_runtime": 0.6092, "eval_samples_per_second": 75.507, "eval_steps_per_second": 3.283, "step": 97 }, { "epoch": 15.38, "learning_rate": 0.00018627450980392157, "loss": 0.7768, "step": 100 }, { "epoch": 16.0, "eval_accuracy": 0.6086956521739131, "eval_loss": 0.9706609845161438, "eval_runtime": 0.6399, "eval_samples_per_second": 71.887, "eval_steps_per_second": 3.126, "step": 104 }, { "epoch": 16.92, "learning_rate": 0.00018137254901960786, "loss": 0.6754, "step": 110 }, { "epoch": 16.92, "eval_accuracy": 0.6521739130434783, "eval_loss": 0.9544328451156616, "eval_runtime": 0.5992, "eval_samples_per_second": 76.775, "eval_steps_per_second": 3.338, "step": 110 }, { "epoch": 18.0, "eval_accuracy": 0.6739130434782609, "eval_loss": 0.9885360598564148, "eval_runtime": 0.6261, "eval_samples_per_second": 73.474, "eval_steps_per_second": 3.195, "step": 117 }, { "epoch": 18.46, "learning_rate": 0.00017647058823529413, "loss": 0.657, "step": 120 }, { "epoch": 18.92, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.85777348279953, "eval_runtime": 0.6125, "eval_samples_per_second": 75.103, "eval_steps_per_second": 3.265, "step": 123 }, { "epoch": 20.0, "learning_rate": 0.0001715686274509804, "loss": 0.5408, "step": 130 }, { "epoch": 20.0, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.7794063687324524, "eval_runtime": 0.5998, "eval_samples_per_second": 76.697, "eval_steps_per_second": 3.335, "step": 130 }, { "epoch": 20.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.8072043657302856, "eval_runtime": 0.6119, "eval_samples_per_second": 75.172, "eval_steps_per_second": 3.268, "step": 136 }, { "epoch": 21.54, "learning_rate": 0.0001666666666666667, "loss": 0.5094, "step": 140 }, { "epoch": 22.0, "eval_accuracy": 0.6739130434782609, "eval_loss": 0.7917114496231079, "eval_runtime": 0.6088, "eval_samples_per_second": 75.563, "eval_steps_per_second": 3.285, "step": 143 }, { "epoch": 22.92, "eval_accuracy": 0.6739130434782609, "eval_loss": 0.7974965572357178, "eval_runtime": 0.6475, "eval_samples_per_second": 71.039, "eval_steps_per_second": 3.089, "step": 149 }, { "epoch": 23.08, "learning_rate": 0.00016176470588235295, "loss": 0.4546, "step": 150 }, { "epoch": 24.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.758342981338501, "eval_runtime": 0.6038, "eval_samples_per_second": 76.184, "eval_steps_per_second": 3.312, "step": 156 }, { "epoch": 24.62, "learning_rate": 0.00015686274509803922, "loss": 0.3722, "step": 160 }, { "epoch": 24.92, "eval_accuracy": 0.782608695652174, "eval_loss": 0.7073833346366882, "eval_runtime": 0.6225, "eval_samples_per_second": 73.896, "eval_steps_per_second": 3.213, "step": 162 }, { "epoch": 26.0, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.6909225583076477, "eval_runtime": 0.6068, "eval_samples_per_second": 75.807, "eval_steps_per_second": 3.296, "step": 169 }, { "epoch": 26.15, "learning_rate": 0.00015196078431372549, "loss": 0.3494, "step": 170 }, { "epoch": 26.92, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.7032170295715332, "eval_runtime": 0.6101, "eval_samples_per_second": 75.403, "eval_steps_per_second": 3.278, "step": 175 }, { "epoch": 27.69, "learning_rate": 0.00014705882352941178, "loss": 0.3092, "step": 180 }, { "epoch": 28.0, "eval_accuracy": 0.782608695652174, "eval_loss": 0.8148682713508606, "eval_runtime": 0.6369, "eval_samples_per_second": 72.229, "eval_steps_per_second": 3.14, "step": 182 }, { "epoch": 28.92, "eval_accuracy": 0.782608695652174, "eval_loss": 0.7898127436637878, "eval_runtime": 0.6193, "eval_samples_per_second": 74.278, "eval_steps_per_second": 3.229, "step": 188 }, { "epoch": 29.23, "learning_rate": 0.00014215686274509804, "loss": 0.2643, "step": 190 }, { "epoch": 30.0, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.7311907410621643, "eval_runtime": 0.6043, "eval_samples_per_second": 76.119, "eval_steps_per_second": 3.31, "step": 195 }, { "epoch": 30.77, "learning_rate": 0.0001372549019607843, "loss": 0.2659, "step": 200 }, { "epoch": 30.92, "eval_accuracy": 0.717391304347826, "eval_loss": 0.7597643733024597, "eval_runtime": 0.6226, "eval_samples_per_second": 73.885, "eval_steps_per_second": 3.212, "step": 201 }, { "epoch": 32.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7530593276023865, "eval_runtime": 0.5971, "eval_samples_per_second": 77.044, "eval_steps_per_second": 3.35, "step": 208 }, { "epoch": 32.31, "learning_rate": 0.0001323529411764706, "loss": 0.2298, "step": 210 }, { "epoch": 32.92, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.6876691579818726, "eval_runtime": 0.6354, "eval_samples_per_second": 72.394, "eval_steps_per_second": 3.148, "step": 214 }, { "epoch": 33.85, "learning_rate": 0.00012745098039215687, "loss": 0.2147, "step": 220 }, { "epoch": 34.0, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.6864094734191895, "eval_runtime": 0.6204, "eval_samples_per_second": 74.15, "eval_steps_per_second": 3.224, "step": 221 }, { "epoch": 34.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.7655704617500305, "eval_runtime": 0.623, "eval_samples_per_second": 73.835, "eval_steps_per_second": 3.21, "step": 227 }, { "epoch": 35.38, "learning_rate": 0.00012254901960784316, "loss": 0.2457, "step": 230 }, { "epoch": 36.0, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.8493983149528503, "eval_runtime": 0.6074, "eval_samples_per_second": 75.734, "eval_steps_per_second": 3.293, "step": 234 }, { "epoch": 36.92, "learning_rate": 0.00011764705882352942, "loss": 0.1905, "step": 240 }, { "epoch": 36.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7319269776344299, "eval_runtime": 0.6155, "eval_samples_per_second": 74.733, "eval_steps_per_second": 3.249, "step": 240 }, { "epoch": 38.0, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.8289902806282043, "eval_runtime": 0.6141, "eval_samples_per_second": 74.91, "eval_steps_per_second": 3.257, "step": 247 }, { "epoch": 38.46, "learning_rate": 0.0001127450980392157, "loss": 0.2073, "step": 250 }, { "epoch": 38.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7963315844535828, "eval_runtime": 0.6558, "eval_samples_per_second": 70.149, "eval_steps_per_second": 3.05, "step": 253 }, { "epoch": 40.0, "learning_rate": 0.00010784313725490196, "loss": 0.1603, "step": 260 }, { "epoch": 40.0, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.8692622780799866, "eval_runtime": 0.5964, "eval_samples_per_second": 77.127, "eval_steps_per_second": 3.353, "step": 260 }, { "epoch": 40.92, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.7137989401817322, "eval_runtime": 0.6168, "eval_samples_per_second": 74.577, "eval_steps_per_second": 3.242, "step": 266 }, { "epoch": 41.54, "learning_rate": 0.00010294117647058823, "loss": 0.1852, "step": 270 }, { "epoch": 42.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7274174690246582, "eval_runtime": 0.6771, "eval_samples_per_second": 67.936, "eval_steps_per_second": 2.954, "step": 273 }, { "epoch": 42.92, "eval_accuracy": 0.6739130434782609, "eval_loss": 0.8352705240249634, "eval_runtime": 0.6424, "eval_samples_per_second": 71.602, "eval_steps_per_second": 3.113, "step": 279 }, { "epoch": 43.08, "learning_rate": 9.80392156862745e-05, "loss": 0.1641, "step": 280 }, { "epoch": 44.0, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.9382135272026062, "eval_runtime": 0.6048, "eval_samples_per_second": 76.054, "eval_steps_per_second": 3.307, "step": 286 }, { "epoch": 44.62, "learning_rate": 9.313725490196079e-05, "loss": 0.1568, "step": 290 }, { "epoch": 44.92, "eval_accuracy": 0.717391304347826, "eval_loss": 0.8655094504356384, "eval_runtime": 0.6326, "eval_samples_per_second": 72.716, "eval_steps_per_second": 3.162, "step": 292 }, { "epoch": 46.0, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.7620847821235657, "eval_runtime": 0.6048, "eval_samples_per_second": 76.056, "eval_steps_per_second": 3.307, "step": 299 }, { "epoch": 46.15, "learning_rate": 8.823529411764706e-05, "loss": 0.1498, "step": 300 }, { "epoch": 46.92, "eval_accuracy": 0.717391304347826, "eval_loss": 0.794407069683075, "eval_runtime": 0.6439, "eval_samples_per_second": 71.44, "eval_steps_per_second": 3.106, "step": 305 }, { "epoch": 47.69, "learning_rate": 8.333333333333334e-05, "loss": 0.1563, "step": 310 }, { "epoch": 48.0, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.8432682752609253, "eval_runtime": 0.6319, "eval_samples_per_second": 72.801, "eval_steps_per_second": 3.165, "step": 312 }, { "epoch": 48.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.8633100986480713, "eval_runtime": 0.6452, "eval_samples_per_second": 71.293, "eval_steps_per_second": 3.1, "step": 318 }, { "epoch": 49.23, "learning_rate": 7.843137254901961e-05, "loss": 0.1554, "step": 320 }, { "epoch": 50.0, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.8542913198471069, "eval_runtime": 0.6393, "eval_samples_per_second": 71.95, "eval_steps_per_second": 3.128, "step": 325 }, { "epoch": 50.77, "learning_rate": 7.352941176470589e-05, "loss": 0.1316, "step": 330 }, { "epoch": 50.92, "eval_accuracy": 0.717391304347826, "eval_loss": 0.9127072095870972, "eval_runtime": 0.6018, "eval_samples_per_second": 76.431, "eval_steps_per_second": 3.323, "step": 331 }, { "epoch": 52.0, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.9248411059379578, "eval_runtime": 0.6005, "eval_samples_per_second": 76.606, "eval_steps_per_second": 3.331, "step": 338 }, { "epoch": 52.31, "learning_rate": 6.862745098039216e-05, "loss": 0.1264, "step": 340 }, { "epoch": 52.92, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.9349088072776794, "eval_runtime": 0.5946, "eval_samples_per_second": 77.368, "eval_steps_per_second": 3.364, "step": 344 }, { "epoch": 53.85, "learning_rate": 6.372549019607843e-05, "loss": 0.1082, "step": 350 }, { "epoch": 54.0, "eval_accuracy": 0.6739130434782609, "eval_loss": 0.978458046913147, "eval_runtime": 0.6614, "eval_samples_per_second": 69.548, "eval_steps_per_second": 3.024, "step": 351 }, { "epoch": 54.92, "eval_accuracy": 0.6739130434782609, "eval_loss": 1.0165393352508545, "eval_runtime": 0.6054, "eval_samples_per_second": 75.984, "eval_steps_per_second": 3.304, "step": 357 }, { "epoch": 55.38, "learning_rate": 5.882352941176471e-05, "loss": 0.1366, "step": 360 }, { "epoch": 56.0, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.8369129300117493, "eval_runtime": 0.6396, "eval_samples_per_second": 71.916, "eval_steps_per_second": 3.127, "step": 364 }, { "epoch": 56.92, "learning_rate": 5.392156862745098e-05, "loss": 0.1546, "step": 370 }, { "epoch": 56.92, "eval_accuracy": 0.717391304347826, "eval_loss": 0.8372054696083069, "eval_runtime": 0.6015, "eval_samples_per_second": 76.478, "eval_steps_per_second": 3.325, "step": 370 }, { "epoch": 58.0, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.8595665097236633, "eval_runtime": 0.6434, "eval_samples_per_second": 71.491, "eval_steps_per_second": 3.108, "step": 377 }, { "epoch": 58.46, "learning_rate": 4.901960784313725e-05, "loss": 0.1218, "step": 380 }, { "epoch": 58.92, "eval_accuracy": 0.717391304347826, "eval_loss": 0.8054145574569702, "eval_runtime": 0.6049, "eval_samples_per_second": 76.042, "eval_steps_per_second": 3.306, "step": 383 }, { "epoch": 60.0, "learning_rate": 4.411764705882353e-05, "loss": 0.1162, "step": 390 }, { "epoch": 60.0, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.7963301539421082, "eval_runtime": 0.6236, "eval_samples_per_second": 73.763, "eval_steps_per_second": 3.207, "step": 390 }, { "epoch": 60.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.795264720916748, "eval_runtime": 0.6138, "eval_samples_per_second": 74.945, "eval_steps_per_second": 3.258, "step": 396 }, { "epoch": 61.54, "learning_rate": 3.9215686274509805e-05, "loss": 0.0876, "step": 400 }, { "epoch": 62.0, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.8229031562805176, "eval_runtime": 0.6005, "eval_samples_per_second": 76.603, "eval_steps_per_second": 3.331, "step": 403 }, { "epoch": 62.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.8364757895469666, "eval_runtime": 0.626, "eval_samples_per_second": 73.487, "eval_steps_per_second": 3.195, "step": 409 }, { "epoch": 63.08, "learning_rate": 3.431372549019608e-05, "loss": 0.1032, "step": 410 }, { "epoch": 64.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.8161815404891968, "eval_runtime": 0.6165, "eval_samples_per_second": 74.611, "eval_steps_per_second": 3.244, "step": 416 }, { "epoch": 64.62, "learning_rate": 2.9411764705882354e-05, "loss": 0.0825, "step": 420 }, { "epoch": 64.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.8645689487457275, "eval_runtime": 0.608, "eval_samples_per_second": 75.66, "eval_steps_per_second": 3.29, "step": 422 }, { "epoch": 66.0, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.9134983420372009, "eval_runtime": 0.7022, "eval_samples_per_second": 65.511, "eval_steps_per_second": 2.848, "step": 429 }, { "epoch": 66.15, "learning_rate": 2.4509803921568626e-05, "loss": 0.1119, "step": 430 }, { "epoch": 66.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.9164416193962097, "eval_runtime": 0.6197, "eval_samples_per_second": 74.228, "eval_steps_per_second": 3.227, "step": 435 }, { "epoch": 67.69, "learning_rate": 1.9607843137254903e-05, "loss": 0.0949, "step": 440 }, { "epoch": 68.0, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.9232246279716492, "eval_runtime": 0.6325, "eval_samples_per_second": 72.724, "eval_steps_per_second": 3.162, "step": 442 }, { "epoch": 68.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.9380779266357422, "eval_runtime": 0.5791, "eval_samples_per_second": 79.439, "eval_steps_per_second": 3.454, "step": 448 }, { "epoch": 69.23, "learning_rate": 1.4705882352941177e-05, "loss": 0.1227, "step": 450 }, { "epoch": 70.0, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.8997882604598999, "eval_runtime": 0.606, "eval_samples_per_second": 75.907, "eval_steps_per_second": 3.3, "step": 455 }, { "epoch": 70.77, "learning_rate": 9.803921568627451e-06, "loss": 0.0872, "step": 460 }, { "epoch": 70.92, "eval_accuracy": 0.717391304347826, "eval_loss": 0.9632061123847961, "eval_runtime": 0.5942, "eval_samples_per_second": 77.419, "eval_steps_per_second": 3.366, "step": 461 }, { "epoch": 72.0, "eval_accuracy": 0.717391304347826, "eval_loss": 0.8566347360610962, "eval_runtime": 0.6595, "eval_samples_per_second": 69.753, "eval_steps_per_second": 3.033, "step": 468 }, { "epoch": 72.31, "learning_rate": 4.901960784313726e-06, "loss": 0.1033, "step": 470 }, { "epoch": 72.92, "eval_accuracy": 0.717391304347826, "eval_loss": 0.8909047842025757, "eval_runtime": 0.6783, "eval_samples_per_second": 67.817, "eval_steps_per_second": 2.949, "step": 474 }, { "epoch": 73.85, "learning_rate": 0.0, "loss": 0.0876, "step": 480 }, { "epoch": 73.85, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.8869202733039856, "eval_runtime": 0.6277, "eval_samples_per_second": 73.284, "eval_steps_per_second": 3.186, "step": 480 }, { "epoch": 73.85, "step": 480, "total_flos": 1.6581977329862246e+17, "train_loss": 0.40781121912101903, "train_runtime": 277.6094, "train_samples_per_second": 236.015, "train_steps_per_second": 1.729 } ], "logging_steps": 10, "max_steps": 480, "num_input_tokens_seen": 0, "num_train_epochs": 80, "save_steps": 500, "total_flos": 1.6581977329862246e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }