{ "best_metric": 0.8166666666666667, "best_model_checkpoint": "swiftformer-xs-dmae-va-U5-42C\\checkpoint-418", "epoch": 72.25806451612904, "eval_steps": 500, "global_step": 560, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.9, "eval_accuracy": 0.4666666666666667, "eval_loss": 1.3856309652328491, "eval_runtime": 1.1043, "eval_samples_per_second": 54.333, "eval_steps_per_second": 1.811, "step": 7 }, { "epoch": 1.55, "learning_rate": 4.2857142857142856e-05, "loss": 1.3855, "step": 12 }, { "epoch": 1.94, "eval_accuracy": 0.48333333333333334, "eval_loss": 1.3819035291671753, "eval_runtime": 0.7186, "eval_samples_per_second": 83.495, "eval_steps_per_second": 2.783, "step": 15 }, { "epoch": 2.97, "eval_accuracy": 0.43333333333333335, "eval_loss": 1.3687236309051514, "eval_runtime": 0.7101, "eval_samples_per_second": 84.495, "eval_steps_per_second": 2.817, "step": 23 }, { "epoch": 3.1, "learning_rate": 8.571428571428571e-05, "loss": 1.3742, "step": 24 }, { "epoch": 4.0, "eval_accuracy": 0.31666666666666665, "eval_loss": 1.3188554048538208, "eval_runtime": 0.726, "eval_samples_per_second": 82.647, "eval_steps_per_second": 2.755, "step": 31 }, { "epoch": 4.65, "learning_rate": 0.00012857142857142858, "loss": 1.3004, "step": 36 }, { "epoch": 4.9, "eval_accuracy": 0.48333333333333334, "eval_loss": 1.2501040697097778, "eval_runtime": 0.725, "eval_samples_per_second": 82.755, "eval_steps_per_second": 2.759, "step": 38 }, { "epoch": 5.94, "eval_accuracy": 0.48333333333333334, "eval_loss": 1.2268178462982178, "eval_runtime": 0.7099, "eval_samples_per_second": 84.517, "eval_steps_per_second": 2.817, "step": 46 }, { "epoch": 6.19, "learning_rate": 0.00017142857142857143, "loss": 1.1716, "step": 48 }, { "epoch": 6.97, "eval_accuracy": 0.5, "eval_loss": 1.211478352546692, "eval_runtime": 0.7071, "eval_samples_per_second": 84.855, "eval_steps_per_second": 2.828, "step": 54 }, { "epoch": 7.74, "learning_rate": 0.00019841269841269844, "loss": 1.0686, "step": 60 }, { "epoch": 8.0, "eval_accuracy": 0.5333333333333333, "eval_loss": 1.2243305444717407, "eval_runtime": 0.7126, "eval_samples_per_second": 84.197, "eval_steps_per_second": 2.807, "step": 62 }, { "epoch": 8.9, "eval_accuracy": 0.55, "eval_loss": 1.1432182788848877, "eval_runtime": 0.7953, "eval_samples_per_second": 75.447, "eval_steps_per_second": 2.515, "step": 69 }, { "epoch": 9.29, "learning_rate": 0.00019365079365079365, "loss": 0.9764, "step": 72 }, { "epoch": 9.94, "eval_accuracy": 0.55, "eval_loss": 1.020477294921875, "eval_runtime": 0.7686, "eval_samples_per_second": 78.061, "eval_steps_per_second": 2.602, "step": 77 }, { "epoch": 10.84, "learning_rate": 0.00018888888888888888, "loss": 0.873, "step": 84 }, { "epoch": 10.97, "eval_accuracy": 0.6, "eval_loss": 0.9721332788467407, "eval_runtime": 0.8664, "eval_samples_per_second": 69.254, "eval_steps_per_second": 2.308, "step": 85 }, { "epoch": 12.0, "eval_accuracy": 0.5666666666666667, "eval_loss": 0.9220641851425171, "eval_runtime": 0.7382, "eval_samples_per_second": 81.278, "eval_steps_per_second": 2.709, "step": 93 }, { "epoch": 12.39, "learning_rate": 0.00018412698412698412, "loss": 0.7822, "step": 96 }, { "epoch": 12.9, "eval_accuracy": 0.6166666666666667, "eval_loss": 0.8593236207962036, "eval_runtime": 1.1748, "eval_samples_per_second": 51.071, "eval_steps_per_second": 1.702, "step": 100 }, { "epoch": 13.94, "learning_rate": 0.00017936507936507938, "loss": 0.664, "step": 108 }, { "epoch": 13.94, "eval_accuracy": 0.7, "eval_loss": 0.7774909734725952, "eval_runtime": 0.8265, "eval_samples_per_second": 72.595, "eval_steps_per_second": 2.42, "step": 108 }, { "epoch": 14.97, "eval_accuracy": 0.6166666666666667, "eval_loss": 0.8117440342903137, "eval_runtime": 0.873, "eval_samples_per_second": 68.731, "eval_steps_per_second": 2.291, "step": 116 }, { "epoch": 15.48, "learning_rate": 0.00017460317460317462, "loss": 0.5439, "step": 120 }, { "epoch": 16.0, "eval_accuracy": 0.6833333333333333, "eval_loss": 0.7552784085273743, "eval_runtime": 0.8067, "eval_samples_per_second": 74.373, "eval_steps_per_second": 2.479, "step": 124 }, { "epoch": 16.9, "eval_accuracy": 0.7166666666666667, "eval_loss": 0.669671356678009, "eval_runtime": 0.7809, "eval_samples_per_second": 76.836, "eval_steps_per_second": 2.561, "step": 131 }, { "epoch": 17.03, "learning_rate": 0.00016984126984126986, "loss": 0.496, "step": 132 }, { "epoch": 17.94, "eval_accuracy": 0.7333333333333333, "eval_loss": 0.6479821801185608, "eval_runtime": 0.8358, "eval_samples_per_second": 71.789, "eval_steps_per_second": 2.393, "step": 139 }, { "epoch": 18.58, "learning_rate": 0.0001650793650793651, "loss": 0.4563, "step": 144 }, { "epoch": 18.97, "eval_accuracy": 0.7333333333333333, "eval_loss": 0.7115061283111572, "eval_runtime": 0.7213, "eval_samples_per_second": 83.184, "eval_steps_per_second": 2.773, "step": 147 }, { "epoch": 20.0, "eval_accuracy": 0.7166666666666667, "eval_loss": 0.6776978373527527, "eval_runtime": 1.1968, "eval_samples_per_second": 50.132, "eval_steps_per_second": 1.671, "step": 155 }, { "epoch": 20.13, "learning_rate": 0.00016031746031746033, "loss": 0.3831, "step": 156 }, { "epoch": 20.9, "eval_accuracy": 0.7666666666666667, "eval_loss": 0.641558825969696, "eval_runtime": 0.7539, "eval_samples_per_second": 79.588, "eval_steps_per_second": 2.653, "step": 162 }, { "epoch": 21.68, "learning_rate": 0.00015555555555555556, "loss": 0.339, "step": 168 }, { "epoch": 21.94, "eval_accuracy": 0.7, "eval_loss": 0.7040281891822815, "eval_runtime": 0.7707, "eval_samples_per_second": 77.852, "eval_steps_per_second": 2.595, "step": 170 }, { "epoch": 22.97, "eval_accuracy": 0.7166666666666667, "eval_loss": 0.6858527064323425, "eval_runtime": 0.7222, "eval_samples_per_second": 83.082, "eval_steps_per_second": 2.769, "step": 178 }, { "epoch": 23.23, "learning_rate": 0.0001507936507936508, "loss": 0.3033, "step": 180 }, { "epoch": 24.0, "eval_accuracy": 0.7, "eval_loss": 0.6012035608291626, "eval_runtime": 0.8804, "eval_samples_per_second": 68.154, "eval_steps_per_second": 2.272, "step": 186 }, { "epoch": 24.77, "learning_rate": 0.00014603174603174603, "loss": 0.2655, "step": 192 }, { "epoch": 24.9, "eval_accuracy": 0.7833333333333333, "eval_loss": 0.5439515709877014, "eval_runtime": 0.8773, "eval_samples_per_second": 68.39, "eval_steps_per_second": 2.28, "step": 193 }, { "epoch": 25.94, "eval_accuracy": 0.75, "eval_loss": 0.6173577904701233, "eval_runtime": 0.755, "eval_samples_per_second": 79.471, "eval_steps_per_second": 2.649, "step": 201 }, { "epoch": 26.32, "learning_rate": 0.0001412698412698413, "loss": 0.2269, "step": 204 }, { "epoch": 26.97, "eval_accuracy": 0.7333333333333333, "eval_loss": 0.5745389461517334, "eval_runtime": 0.7541, "eval_samples_per_second": 79.562, "eval_steps_per_second": 2.652, "step": 209 }, { "epoch": 27.87, "learning_rate": 0.0001365079365079365, "loss": 0.2472, "step": 216 }, { "epoch": 28.0, "eval_accuracy": 0.8, "eval_loss": 0.5688391327857971, "eval_runtime": 0.7386, "eval_samples_per_second": 81.235, "eval_steps_per_second": 2.708, "step": 217 }, { "epoch": 28.9, "eval_accuracy": 0.75, "eval_loss": 0.6578179597854614, "eval_runtime": 0.7981, "eval_samples_per_second": 75.174, "eval_steps_per_second": 2.506, "step": 224 }, { "epoch": 29.42, "learning_rate": 0.00013174603174603174, "loss": 0.2004, "step": 228 }, { "epoch": 29.94, "eval_accuracy": 0.7833333333333333, "eval_loss": 0.5811216831207275, "eval_runtime": 0.7017, "eval_samples_per_second": 85.512, "eval_steps_per_second": 2.85, "step": 232 }, { "epoch": 30.97, "learning_rate": 0.00012698412698412698, "loss": 0.2099, "step": 240 }, { "epoch": 30.97, "eval_accuracy": 0.75, "eval_loss": 0.6672316193580627, "eval_runtime": 0.9579, "eval_samples_per_second": 62.635, "eval_steps_per_second": 2.088, "step": 240 }, { "epoch": 32.0, "eval_accuracy": 0.75, "eval_loss": 0.5926868319511414, "eval_runtime": 0.7151, "eval_samples_per_second": 83.907, "eval_steps_per_second": 2.797, "step": 248 }, { "epoch": 32.52, "learning_rate": 0.00012222222222222224, "loss": 0.1834, "step": 252 }, { "epoch": 32.9, "eval_accuracy": 0.7666666666666667, "eval_loss": 0.6193079352378845, "eval_runtime": 0.7137, "eval_samples_per_second": 84.073, "eval_steps_per_second": 2.802, "step": 255 }, { "epoch": 33.94, "eval_accuracy": 0.7166666666666667, "eval_loss": 0.7505124807357788, "eval_runtime": 0.8955, "eval_samples_per_second": 67.004, "eval_steps_per_second": 2.233, "step": 263 }, { "epoch": 34.06, "learning_rate": 0.00011746031746031746, "loss": 0.2248, "step": 264 }, { "epoch": 34.97, "eval_accuracy": 0.7333333333333333, "eval_loss": 0.7730365991592407, "eval_runtime": 0.7499, "eval_samples_per_second": 80.008, "eval_steps_per_second": 2.667, "step": 271 }, { "epoch": 35.61, "learning_rate": 0.0001126984126984127, "loss": 0.1571, "step": 276 }, { "epoch": 36.0, "eval_accuracy": 0.7333333333333333, "eval_loss": 0.6211021542549133, "eval_runtime": 0.9402, "eval_samples_per_second": 63.814, "eval_steps_per_second": 2.127, "step": 279 }, { "epoch": 36.9, "eval_accuracy": 0.7166666666666667, "eval_loss": 0.6227646470069885, "eval_runtime": 0.747, "eval_samples_per_second": 80.323, "eval_steps_per_second": 2.677, "step": 286 }, { "epoch": 37.16, "learning_rate": 0.00010793650793650794, "loss": 0.1983, "step": 288 }, { "epoch": 37.94, "eval_accuracy": 0.7166666666666667, "eval_loss": 0.6088116765022278, "eval_runtime": 0.6721, "eval_samples_per_second": 89.274, "eval_steps_per_second": 2.976, "step": 294 }, { "epoch": 38.71, "learning_rate": 0.00010317460317460319, "loss": 0.1629, "step": 300 }, { "epoch": 38.97, "eval_accuracy": 0.75, "eval_loss": 0.7009025812149048, "eval_runtime": 0.6916, "eval_samples_per_second": 86.756, "eval_steps_per_second": 2.892, "step": 302 }, { "epoch": 40.0, "eval_accuracy": 0.75, "eval_loss": 0.7284848690032959, "eval_runtime": 0.6756, "eval_samples_per_second": 88.806, "eval_steps_per_second": 2.96, "step": 310 }, { "epoch": 40.26, "learning_rate": 9.841269841269841e-05, "loss": 0.1547, "step": 312 }, { "epoch": 40.9, "eval_accuracy": 0.7666666666666667, "eval_loss": 0.6401079297065735, "eval_runtime": 1.0498, "eval_samples_per_second": 57.152, "eval_steps_per_second": 1.905, "step": 317 }, { "epoch": 41.81, "learning_rate": 9.365079365079366e-05, "loss": 0.1548, "step": 324 }, { "epoch": 41.94, "eval_accuracy": 0.7833333333333333, "eval_loss": 0.6122706532478333, "eval_runtime": 0.7047, "eval_samples_per_second": 85.142, "eval_steps_per_second": 2.838, "step": 325 }, { "epoch": 42.97, "eval_accuracy": 0.8, "eval_loss": 0.6316841244697571, "eval_runtime": 0.7237, "eval_samples_per_second": 82.911, "eval_steps_per_second": 2.764, "step": 333 }, { "epoch": 43.35, "learning_rate": 8.888888888888889e-05, "loss": 0.1566, "step": 336 }, { "epoch": 44.0, "eval_accuracy": 0.7166666666666667, "eval_loss": 0.7579439282417297, "eval_runtime": 0.6901, "eval_samples_per_second": 86.944, "eval_steps_per_second": 2.898, "step": 341 }, { "epoch": 44.9, "learning_rate": 8.412698412698413e-05, "loss": 0.1361, "step": 348 }, { "epoch": 44.9, "eval_accuracy": 0.7166666666666667, "eval_loss": 0.6652740836143494, "eval_runtime": 0.7005, "eval_samples_per_second": 85.657, "eval_steps_per_second": 2.855, "step": 348 }, { "epoch": 45.94, "eval_accuracy": 0.7333333333333333, "eval_loss": 0.7401434779167175, "eval_runtime": 0.6837, "eval_samples_per_second": 87.759, "eval_steps_per_second": 2.925, "step": 356 }, { "epoch": 46.45, "learning_rate": 7.936507936507937e-05, "loss": 0.1273, "step": 360 }, { "epoch": 46.97, "eval_accuracy": 0.7333333333333333, "eval_loss": 0.8404071927070618, "eval_runtime": 0.852, "eval_samples_per_second": 70.419, "eval_steps_per_second": 2.347, "step": 364 }, { "epoch": 48.0, "learning_rate": 7.460317460317461e-05, "loss": 0.1312, "step": 372 }, { "epoch": 48.0, "eval_accuracy": 0.75, "eval_loss": 0.8388133645057678, "eval_runtime": 0.7625, "eval_samples_per_second": 78.691, "eval_steps_per_second": 2.623, "step": 372 }, { "epoch": 48.9, "eval_accuracy": 0.7666666666666667, "eval_loss": 0.7823358774185181, "eval_runtime": 0.7984, "eval_samples_per_second": 75.152, "eval_steps_per_second": 2.505, "step": 379 }, { "epoch": 49.55, "learning_rate": 6.984126984126984e-05, "loss": 0.1307, "step": 384 }, { "epoch": 49.94, "eval_accuracy": 0.7166666666666667, "eval_loss": 0.6979826092720032, "eval_runtime": 0.9296, "eval_samples_per_second": 64.545, "eval_steps_per_second": 2.151, "step": 387 }, { "epoch": 50.97, "eval_accuracy": 0.75, "eval_loss": 0.7589060664176941, "eval_runtime": 0.9421, "eval_samples_per_second": 63.685, "eval_steps_per_second": 2.123, "step": 395 }, { "epoch": 51.1, "learning_rate": 6.507936507936509e-05, "loss": 0.1061, "step": 396 }, { "epoch": 52.0, "eval_accuracy": 0.7333333333333333, "eval_loss": 0.664362907409668, "eval_runtime": 0.9581, "eval_samples_per_second": 62.625, "eval_steps_per_second": 2.088, "step": 403 }, { "epoch": 52.65, "learning_rate": 6.0317460317460316e-05, "loss": 0.1186, "step": 408 }, { "epoch": 52.9, "eval_accuracy": 0.7333333333333333, "eval_loss": 0.7056966423988342, "eval_runtime": 0.7627, "eval_samples_per_second": 78.67, "eval_steps_per_second": 2.622, "step": 410 }, { "epoch": 53.94, "eval_accuracy": 0.8166666666666667, "eval_loss": 0.6744123697280884, "eval_runtime": 0.7522, "eval_samples_per_second": 79.767, "eval_steps_per_second": 2.659, "step": 418 }, { "epoch": 54.19, "learning_rate": 5.555555555555556e-05, "loss": 0.1108, "step": 420 }, { "epoch": 54.97, "eval_accuracy": 0.7666666666666667, "eval_loss": 0.6327721476554871, "eval_runtime": 0.7919, "eval_samples_per_second": 75.763, "eval_steps_per_second": 2.525, "step": 426 }, { "epoch": 55.74, "learning_rate": 5.0793650793650794e-05, "loss": 0.1014, "step": 432 }, { "epoch": 56.0, "eval_accuracy": 0.7833333333333333, "eval_loss": 0.6401543021202087, "eval_runtime": 0.7047, "eval_samples_per_second": 85.145, "eval_steps_per_second": 2.838, "step": 434 }, { "epoch": 56.9, "eval_accuracy": 0.75, "eval_loss": 0.6631258726119995, "eval_runtime": 0.7567, "eval_samples_per_second": 79.292, "eval_steps_per_second": 2.643, "step": 441 }, { "epoch": 57.29, "learning_rate": 4.603174603174603e-05, "loss": 0.1082, "step": 444 }, { "epoch": 57.94, "eval_accuracy": 0.7333333333333333, "eval_loss": 0.7001372575759888, "eval_runtime": 0.7863, "eval_samples_per_second": 76.308, "eval_steps_per_second": 2.544, "step": 449 }, { "epoch": 58.84, "learning_rate": 4.126984126984127e-05, "loss": 0.1118, "step": 456 }, { "epoch": 58.97, "eval_accuracy": 0.7333333333333333, "eval_loss": 0.7898235321044922, "eval_runtime": 0.8339, "eval_samples_per_second": 71.951, "eval_steps_per_second": 2.398, "step": 457 }, { "epoch": 60.0, "eval_accuracy": 0.7333333333333333, "eval_loss": 0.7644184231758118, "eval_runtime": 0.916, "eval_samples_per_second": 65.499, "eval_steps_per_second": 2.183, "step": 465 }, { "epoch": 60.39, "learning_rate": 3.650793650793651e-05, "loss": 0.1051, "step": 468 }, { "epoch": 60.9, "eval_accuracy": 0.7333333333333333, "eval_loss": 0.7767077088356018, "eval_runtime": 0.7368, "eval_samples_per_second": 81.432, "eval_steps_per_second": 2.714, "step": 472 }, { "epoch": 61.94, "learning_rate": 3.1746031746031745e-05, "loss": 0.0979, "step": 480 }, { "epoch": 61.94, "eval_accuracy": 0.7333333333333333, "eval_loss": 0.7440354824066162, "eval_runtime": 0.8613, "eval_samples_per_second": 69.665, "eval_steps_per_second": 2.322, "step": 480 }, { "epoch": 62.97, "eval_accuracy": 0.7333333333333333, "eval_loss": 0.6826977133750916, "eval_runtime": 0.8727, "eval_samples_per_second": 68.75, "eval_steps_per_second": 2.292, "step": 488 }, { "epoch": 63.48, "learning_rate": 2.6984126984126984e-05, "loss": 0.0834, "step": 492 }, { "epoch": 64.0, "eval_accuracy": 0.7333333333333333, "eval_loss": 0.7008056640625, "eval_runtime": 1.0337, "eval_samples_per_second": 58.046, "eval_steps_per_second": 1.935, "step": 496 }, { "epoch": 64.9, "eval_accuracy": 0.7166666666666667, "eval_loss": 0.7242893576622009, "eval_runtime": 0.9008, "eval_samples_per_second": 66.608, "eval_steps_per_second": 2.22, "step": 503 }, { "epoch": 65.03, "learning_rate": 2.2222222222222223e-05, "loss": 0.0963, "step": 504 }, { "epoch": 65.94, "eval_accuracy": 0.7333333333333333, "eval_loss": 0.7655950784683228, "eval_runtime": 0.7846, "eval_samples_per_second": 76.471, "eval_steps_per_second": 2.549, "step": 511 }, { "epoch": 66.58, "learning_rate": 1.746031746031746e-05, "loss": 0.0989, "step": 516 }, { "epoch": 66.97, "eval_accuracy": 0.7333333333333333, "eval_loss": 0.733224093914032, "eval_runtime": 0.6942, "eval_samples_per_second": 86.424, "eval_steps_per_second": 2.881, "step": 519 }, { "epoch": 68.0, "eval_accuracy": 0.7333333333333333, "eval_loss": 0.7623672485351562, "eval_runtime": 0.6962, "eval_samples_per_second": 86.187, "eval_steps_per_second": 2.873, "step": 527 }, { "epoch": 68.13, "learning_rate": 1.2698412698412699e-05, "loss": 0.107, "step": 528 }, { "epoch": 68.9, "eval_accuracy": 0.75, "eval_loss": 0.7291642427444458, "eval_runtime": 0.8099, "eval_samples_per_second": 74.08, "eval_steps_per_second": 2.469, "step": 534 }, { "epoch": 69.68, "learning_rate": 7.936507936507936e-06, "loss": 0.0987, "step": 540 }, { "epoch": 69.94, "eval_accuracy": 0.7333333333333333, "eval_loss": 0.7168775796890259, "eval_runtime": 0.9171, "eval_samples_per_second": 65.421, "eval_steps_per_second": 2.181, "step": 542 }, { "epoch": 70.97, "eval_accuracy": 0.7333333333333333, "eval_loss": 0.746231734752655, "eval_runtime": 0.7381, "eval_samples_per_second": 81.293, "eval_steps_per_second": 2.71, "step": 550 }, { "epoch": 71.23, "learning_rate": 3.1746031746031746e-06, "loss": 0.0956, "step": 552 }, { "epoch": 72.0, "eval_accuracy": 0.75, "eval_loss": 0.6656435132026672, "eval_runtime": 0.7048, "eval_samples_per_second": 85.132, "eval_steps_per_second": 2.838, "step": 558 }, { "epoch": 72.26, "eval_accuracy": 0.7333333333333333, "eval_loss": 0.6873000860214233, "eval_runtime": 0.744, "eval_samples_per_second": 80.641, "eval_steps_per_second": 2.688, "step": 560 }, { "epoch": 72.26, "step": 560, "total_flos": 1.9293876649171354e+17, "train_loss": 0.3574366893087115, "train_runtime": 357.0027, "train_samples_per_second": 218.262, "train_steps_per_second": 1.569 } ], "logging_steps": 12, "max_steps": 560, "num_input_tokens_seen": 0, "num_train_epochs": 80, "save_steps": 500, "total_flos": 1.9293876649171354e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }