{ "best_metric": 0.8478260869565217, "best_model_checkpoint": "swiftformer-xs-ve-U13-b-80e\\checkpoint-169", "epoch": 73.84615384615384, "eval_steps": 500, "global_step": 480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.92, "eval_accuracy": 0.2391304347826087, "eval_loss": 1.3858561515808105, "eval_runtime": 0.6465, "eval_samples_per_second": 71.154, "eval_steps_per_second": 3.094, "step": 6 }, { "epoch": 1.54, "learning_rate": 4.1666666666666665e-05, "loss": 1.3857, "step": 10 }, { "epoch": 2.0, "eval_accuracy": 0.32608695652173914, "eval_loss": 1.383376121520996, "eval_runtime": 0.6231, "eval_samples_per_second": 73.828, "eval_steps_per_second": 3.21, "step": 13 }, { "epoch": 2.92, "eval_accuracy": 0.1956521739130435, "eval_loss": 1.378893256187439, "eval_runtime": 0.5859, "eval_samples_per_second": 78.518, "eval_steps_per_second": 3.414, "step": 19 }, { "epoch": 3.08, "learning_rate": 8.333333333333333e-05, "loss": 1.3767, "step": 20 }, { "epoch": 4.0, "eval_accuracy": 0.17391304347826086, "eval_loss": 1.366553544998169, "eval_runtime": 0.5917, "eval_samples_per_second": 77.74, "eval_steps_per_second": 3.38, "step": 26 }, { "epoch": 4.62, "learning_rate": 0.000125, "loss": 1.3227, "step": 30 }, { "epoch": 4.92, "eval_accuracy": 0.15217391304347827, "eval_loss": 1.356503963470459, "eval_runtime": 0.6244, "eval_samples_per_second": 73.672, "eval_steps_per_second": 3.203, "step": 32 }, { "epoch": 6.0, "eval_accuracy": 0.10869565217391304, "eval_loss": 1.3886877298355103, "eval_runtime": 0.5987, "eval_samples_per_second": 76.83, "eval_steps_per_second": 3.34, "step": 39 }, { "epoch": 6.15, "learning_rate": 0.00016666666666666666, "loss": 1.1987, "step": 40 }, { "epoch": 6.92, "eval_accuracy": 0.21739130434782608, "eval_loss": 1.3718664646148682, "eval_runtime": 0.6257, "eval_samples_per_second": 73.514, "eval_steps_per_second": 3.196, "step": 45 }, { "epoch": 7.69, "learning_rate": 0.00020833333333333332, "loss": 1.1071, "step": 50 }, { "epoch": 8.0, "eval_accuracy": 0.30434782608695654, "eval_loss": 1.3271123170852661, "eval_runtime": 0.6134, "eval_samples_per_second": 74.99, "eval_steps_per_second": 3.26, "step": 52 }, { "epoch": 8.92, "eval_accuracy": 0.2608695652173913, "eval_loss": 1.356224536895752, "eval_runtime": 0.6427, "eval_samples_per_second": 71.579, "eval_steps_per_second": 3.112, "step": 58 }, { "epoch": 9.23, "learning_rate": 0.00025, "loss": 0.9926, "step": 60 }, { "epoch": 10.0, "eval_accuracy": 0.41304347826086957, "eval_loss": 1.2305893898010254, "eval_runtime": 0.6037, "eval_samples_per_second": 76.203, "eval_steps_per_second": 3.313, "step": 65 }, { "epoch": 10.77, "learning_rate": 0.00029166666666666664, "loss": 0.8721, "step": 70 }, { "epoch": 10.92, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.1952728033065796, "eval_runtime": 0.6138, "eval_samples_per_second": 74.943, "eval_steps_per_second": 3.258, "step": 71 }, { "epoch": 12.0, "eval_accuracy": 0.5652173913043478, "eval_loss": 1.0754497051239014, "eval_runtime": 0.6016, "eval_samples_per_second": 76.465, "eval_steps_per_second": 3.325, "step": 78 }, { "epoch": 12.31, "learning_rate": 0.0002941176470588235, "loss": 0.7746, "step": 80 }, { "epoch": 12.92, "eval_accuracy": 0.6739130434782609, "eval_loss": 0.9931479096412659, "eval_runtime": 0.596, "eval_samples_per_second": 77.175, "eval_steps_per_second": 3.355, "step": 84 }, { "epoch": 13.85, "learning_rate": 0.00028676470588235296, "loss": 0.6859, "step": 90 }, { "epoch": 14.0, "eval_accuracy": 0.6739130434782609, "eval_loss": 0.9978837370872498, "eval_runtime": 0.6322, "eval_samples_per_second": 72.759, "eval_steps_per_second": 3.163, "step": 91 }, { "epoch": 14.92, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.8964032530784607, "eval_runtime": 0.6041, "eval_samples_per_second": 76.144, "eval_steps_per_second": 3.311, "step": 97 }, { "epoch": 15.38, "learning_rate": 0.00027941176470588236, "loss": 0.5777, "step": 100 }, { "epoch": 16.0, "eval_accuracy": 0.6521739130434783, "eval_loss": 0.918592631816864, "eval_runtime": 0.6311, "eval_samples_per_second": 72.89, "eval_steps_per_second": 3.169, "step": 104 }, { "epoch": 16.92, "learning_rate": 0.00027205882352941175, "loss": 0.5136, "step": 110 }, { "epoch": 16.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7949799299240112, "eval_runtime": 0.6157, "eval_samples_per_second": 74.714, "eval_steps_per_second": 3.248, "step": 110 }, { "epoch": 18.0, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.7793933153152466, "eval_runtime": 0.6552, "eval_samples_per_second": 70.208, "eval_steps_per_second": 3.053, "step": 117 }, { "epoch": 18.46, "learning_rate": 0.00026470588235294115, "loss": 0.5019, "step": 120 }, { "epoch": 18.92, "eval_accuracy": 0.717391304347826, "eval_loss": 0.8644534945487976, "eval_runtime": 0.6093, "eval_samples_per_second": 75.502, "eval_steps_per_second": 3.283, "step": 123 }, { "epoch": 20.0, "learning_rate": 0.00025735294117647055, "loss": 0.3879, "step": 130 }, { "epoch": 20.0, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.8773152828216553, "eval_runtime": 0.6121, "eval_samples_per_second": 75.146, "eval_steps_per_second": 3.267, "step": 130 }, { "epoch": 20.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7303956747055054, "eval_runtime": 0.6736, "eval_samples_per_second": 68.293, "eval_steps_per_second": 2.969, "step": 136 }, { "epoch": 21.54, "learning_rate": 0.00025, "loss": 0.3532, "step": 140 }, { "epoch": 22.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.6918480396270752, "eval_runtime": 0.6237, "eval_samples_per_second": 73.758, "eval_steps_per_second": 3.207, "step": 143 }, { "epoch": 22.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7881707549095154, "eval_runtime": 0.63, "eval_samples_per_second": 73.014, "eval_steps_per_second": 3.175, "step": 149 }, { "epoch": 23.08, "learning_rate": 0.0002426470588235294, "loss": 0.3288, "step": 150 }, { "epoch": 24.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7132053375244141, "eval_runtime": 0.607, "eval_samples_per_second": 75.781, "eval_steps_per_second": 3.295, "step": 156 }, { "epoch": 24.62, "learning_rate": 0.0002352941176470588, "loss": 0.2573, "step": 160 }, { "epoch": 24.92, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.6645399332046509, "eval_runtime": 0.5984, "eval_samples_per_second": 76.877, "eval_steps_per_second": 3.342, "step": 162 }, { "epoch": 26.0, "eval_accuracy": 0.8478260869565217, "eval_loss": 0.6617594361305237, "eval_runtime": 0.6083, "eval_samples_per_second": 75.626, "eval_steps_per_second": 3.288, "step": 169 }, { "epoch": 26.15, "learning_rate": 0.0002279411764705882, "loss": 0.239, "step": 170 }, { "epoch": 26.92, "eval_accuracy": 0.8043478260869565, "eval_loss": 0.6779818534851074, "eval_runtime": 0.6084, "eval_samples_per_second": 75.608, "eval_steps_per_second": 3.287, "step": 175 }, { "epoch": 27.69, "learning_rate": 0.00022058823529411765, "loss": 0.2018, "step": 180 }, { "epoch": 28.0, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.8137790560722351, "eval_runtime": 0.6252, "eval_samples_per_second": 73.571, "eval_steps_per_second": 3.199, "step": 182 }, { "epoch": 28.92, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.8797154426574707, "eval_runtime": 0.6147, "eval_samples_per_second": 74.836, "eval_steps_per_second": 3.254, "step": 188 }, { "epoch": 29.23, "learning_rate": 0.00021323529411764705, "loss": 0.1961, "step": 190 }, { "epoch": 30.0, "eval_accuracy": 0.717391304347826, "eval_loss": 0.8602275848388672, "eval_runtime": 0.606, "eval_samples_per_second": 75.907, "eval_steps_per_second": 3.3, "step": 195 }, { "epoch": 30.77, "learning_rate": 0.00020588235294117645, "loss": 0.214, "step": 200 }, { "epoch": 30.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.8187522888183594, "eval_runtime": 0.6148, "eval_samples_per_second": 74.818, "eval_steps_per_second": 3.253, "step": 201 }, { "epoch": 32.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.6955972909927368, "eval_runtime": 0.6018, "eval_samples_per_second": 76.431, "eval_steps_per_second": 3.323, "step": 208 }, { "epoch": 32.31, "learning_rate": 0.00019852941176470585, "loss": 0.1596, "step": 210 }, { "epoch": 32.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.7980555891990662, "eval_runtime": 0.5918, "eval_samples_per_second": 77.731, "eval_steps_per_second": 3.38, "step": 214 }, { "epoch": 33.85, "learning_rate": 0.00019117647058823528, "loss": 0.172, "step": 220 }, { "epoch": 34.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.6845263838768005, "eval_runtime": 0.6133, "eval_samples_per_second": 75.008, "eval_steps_per_second": 3.261, "step": 221 }, { "epoch": 34.92, "eval_accuracy": 0.717391304347826, "eval_loss": 0.9340136051177979, "eval_runtime": 0.5942, "eval_samples_per_second": 77.416, "eval_steps_per_second": 3.366, "step": 227 }, { "epoch": 35.38, "learning_rate": 0.0001838235294117647, "loss": 0.1852, "step": 230 }, { "epoch": 36.0, "eval_accuracy": 0.6521739130434783, "eval_loss": 0.9547532200813293, "eval_runtime": 0.7213, "eval_samples_per_second": 63.775, "eval_steps_per_second": 2.773, "step": 234 }, { "epoch": 36.92, "learning_rate": 0.0001764705882352941, "loss": 0.1492, "step": 240 }, { "epoch": 36.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.774700403213501, "eval_runtime": 0.6442, "eval_samples_per_second": 71.401, "eval_steps_per_second": 3.104, "step": 240 }, { "epoch": 38.0, "eval_accuracy": 0.6304347826086957, "eval_loss": 0.9906997680664062, "eval_runtime": 0.5977, "eval_samples_per_second": 76.967, "eval_steps_per_second": 3.346, "step": 247 }, { "epoch": 38.46, "learning_rate": 0.0001691176470588235, "loss": 0.1735, "step": 250 }, { "epoch": 38.92, "eval_accuracy": 0.717391304347826, "eval_loss": 0.8040443062782288, "eval_runtime": 0.6235, "eval_samples_per_second": 73.773, "eval_steps_per_second": 3.208, "step": 253 }, { "epoch": 40.0, "learning_rate": 0.00016176470588235293, "loss": 0.1405, "step": 260 }, { "epoch": 40.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.6945919990539551, "eval_runtime": 0.6077, "eval_samples_per_second": 75.694, "eval_steps_per_second": 3.291, "step": 260 }, { "epoch": 40.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7019456624984741, "eval_runtime": 0.6269, "eval_samples_per_second": 73.381, "eval_steps_per_second": 3.19, "step": 266 }, { "epoch": 41.54, "learning_rate": 0.00015441176470588233, "loss": 0.1269, "step": 270 }, { "epoch": 42.0, "eval_accuracy": 0.717391304347826, "eval_loss": 0.824578583240509, "eval_runtime": 0.6131, "eval_samples_per_second": 75.023, "eval_steps_per_second": 3.262, "step": 273 }, { "epoch": 42.92, "eval_accuracy": 0.6739130434782609, "eval_loss": 0.9238297939300537, "eval_runtime": 0.6079, "eval_samples_per_second": 75.675, "eval_steps_per_second": 3.29, "step": 279 }, { "epoch": 43.08, "learning_rate": 0.00014705882352941175, "loss": 0.1237, "step": 280 }, { "epoch": 44.0, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.9354200959205627, "eval_runtime": 0.6235, "eval_samples_per_second": 73.772, "eval_steps_per_second": 3.207, "step": 286 }, { "epoch": 44.62, "learning_rate": 0.00013970588235294118, "loss": 0.1201, "step": 290 }, { "epoch": 44.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.7543055415153503, "eval_runtime": 0.6059, "eval_samples_per_second": 75.924, "eval_steps_per_second": 3.301, "step": 292 }, { "epoch": 46.0, "eval_accuracy": 0.717391304347826, "eval_loss": 0.7151233553886414, "eval_runtime": 0.6316, "eval_samples_per_second": 72.834, "eval_steps_per_second": 3.167, "step": 299 }, { "epoch": 46.15, "learning_rate": 0.00013235294117647058, "loss": 0.1134, "step": 300 }, { "epoch": 46.92, "eval_accuracy": 0.717391304347826, "eval_loss": 0.7283941507339478, "eval_runtime": 0.6109, "eval_samples_per_second": 75.305, "eval_steps_per_second": 3.274, "step": 305 }, { "epoch": 47.69, "learning_rate": 0.000125, "loss": 0.1141, "step": 310 }, { "epoch": 48.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7790563702583313, "eval_runtime": 0.6549, "eval_samples_per_second": 70.245, "eval_steps_per_second": 3.054, "step": 312 }, { "epoch": 48.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.7823971509933472, "eval_runtime": 0.5941, "eval_samples_per_second": 77.431, "eval_steps_per_second": 3.367, "step": 318 }, { "epoch": 49.23, "learning_rate": 0.0001176470588235294, "loss": 0.1253, "step": 320 }, { "epoch": 50.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7318633794784546, "eval_runtime": 0.6139, "eval_samples_per_second": 74.933, "eval_steps_per_second": 3.258, "step": 325 }, { "epoch": 50.77, "learning_rate": 0.00011029411764705883, "loss": 0.0968, "step": 330 }, { "epoch": 50.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.715053379535675, "eval_runtime": 0.6106, "eval_samples_per_second": 75.337, "eval_steps_per_second": 3.276, "step": 331 }, { "epoch": 52.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7661868333816528, "eval_runtime": 0.672, "eval_samples_per_second": 68.453, "eval_steps_per_second": 2.976, "step": 338 }, { "epoch": 52.31, "learning_rate": 0.00010294117647058823, "loss": 0.0996, "step": 340 }, { "epoch": 52.92, "eval_accuracy": 0.782608695652174, "eval_loss": 0.8085704445838928, "eval_runtime": 0.6284, "eval_samples_per_second": 73.205, "eval_steps_per_second": 3.183, "step": 344 }, { "epoch": 53.85, "learning_rate": 9.558823529411764e-05, "loss": 0.0844, "step": 350 }, { "epoch": 54.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.892122209072113, "eval_runtime": 0.6018, "eval_samples_per_second": 76.432, "eval_steps_per_second": 3.323, "step": 351 }, { "epoch": 54.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.8782345056533813, "eval_runtime": 0.5936, "eval_samples_per_second": 77.489, "eval_steps_per_second": 3.369, "step": 357 }, { "epoch": 55.38, "learning_rate": 8.823529411764705e-05, "loss": 0.1141, "step": 360 }, { "epoch": 56.0, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.7863830327987671, "eval_runtime": 0.6193, "eval_samples_per_second": 74.279, "eval_steps_per_second": 3.23, "step": 364 }, { "epoch": 56.92, "learning_rate": 8.088235294117646e-05, "loss": 0.1263, "step": 370 }, { "epoch": 56.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7125386595726013, "eval_runtime": 0.6664, "eval_samples_per_second": 69.025, "eval_steps_per_second": 3.001, "step": 370 }, { "epoch": 58.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.6758188605308533, "eval_runtime": 0.6415, "eval_samples_per_second": 71.704, "eval_steps_per_second": 3.118, "step": 377 }, { "epoch": 58.46, "learning_rate": 7.352941176470588e-05, "loss": 0.0966, "step": 380 }, { "epoch": 58.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7242967486381531, "eval_runtime": 0.6256, "eval_samples_per_second": 73.526, "eval_steps_per_second": 3.197, "step": 383 }, { "epoch": 60.0, "learning_rate": 6.617647058823529e-05, "loss": 0.0771, "step": 390 }, { "epoch": 60.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7089547514915466, "eval_runtime": 0.6119, "eval_samples_per_second": 75.172, "eval_steps_per_second": 3.268, "step": 390 }, { "epoch": 60.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7157294154167175, "eval_runtime": 0.6426, "eval_samples_per_second": 71.584, "eval_steps_per_second": 3.112, "step": 396 }, { "epoch": 61.54, "learning_rate": 5.88235294117647e-05, "loss": 0.0497, "step": 400 }, { "epoch": 62.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7549222707748413, "eval_runtime": 0.6404, "eval_samples_per_second": 71.829, "eval_steps_per_second": 3.123, "step": 403 }, { "epoch": 62.92, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7805795073509216, "eval_runtime": 0.5971, "eval_samples_per_second": 77.034, "eval_steps_per_second": 3.349, "step": 409 }, { "epoch": 63.08, "learning_rate": 5.147058823529411e-05, "loss": 0.0848, "step": 410 }, { "epoch": 64.0, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.7902293801307678, "eval_runtime": 0.6284, "eval_samples_per_second": 73.207, "eval_steps_per_second": 3.183, "step": 416 }, { "epoch": 64.62, "learning_rate": 4.4117647058823526e-05, "loss": 0.0477, "step": 420 }, { "epoch": 64.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.7683706879615784, "eval_runtime": 0.6332, "eval_samples_per_second": 72.646, "eval_steps_per_second": 3.159, "step": 422 }, { "epoch": 66.0, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.8037605881690979, "eval_runtime": 0.6154, "eval_samples_per_second": 74.747, "eval_steps_per_second": 3.25, "step": 429 }, { "epoch": 66.15, "learning_rate": 3.676470588235294e-05, "loss": 0.0823, "step": 430 }, { "epoch": 66.92, "eval_accuracy": 0.6956521739130435, "eval_loss": 0.7502751350402832, "eval_runtime": 0.619, "eval_samples_per_second": 74.309, "eval_steps_per_second": 3.231, "step": 435 }, { "epoch": 67.69, "learning_rate": 2.941176470588235e-05, "loss": 0.0726, "step": 440 }, { "epoch": 68.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7634245753288269, "eval_runtime": 0.5953, "eval_samples_per_second": 77.278, "eval_steps_per_second": 3.36, "step": 442 }, { "epoch": 68.92, "eval_accuracy": 0.782608695652174, "eval_loss": 0.7860420346260071, "eval_runtime": 0.6305, "eval_samples_per_second": 72.962, "eval_steps_per_second": 3.172, "step": 448 }, { "epoch": 69.23, "learning_rate": 2.2058823529411763e-05, "loss": 0.0799, "step": 450 }, { "epoch": 70.0, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7629542350769043, "eval_runtime": 0.6052, "eval_samples_per_second": 76.009, "eval_steps_per_second": 3.305, "step": 455 }, { "epoch": 70.77, "learning_rate": 1.4705882352941175e-05, "loss": 0.067, "step": 460 }, { "epoch": 70.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.809410035610199, "eval_runtime": 0.6232, "eval_samples_per_second": 73.811, "eval_steps_per_second": 3.209, "step": 461 }, { "epoch": 72.0, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.751130998134613, "eval_runtime": 0.6317, "eval_samples_per_second": 72.822, "eval_steps_per_second": 3.166, "step": 468 }, { "epoch": 72.31, "learning_rate": 7.352941176470588e-06, "loss": 0.0893, "step": 470 }, { "epoch": 72.92, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.7738297581672668, "eval_runtime": 0.6364, "eval_samples_per_second": 72.286, "eval_steps_per_second": 3.143, "step": 474 }, { "epoch": 73.85, "learning_rate": 0.0, "loss": 0.0738, "step": 480 }, { "epoch": 73.85, "eval_accuracy": 0.7391304347826086, "eval_loss": 0.797092080116272, "eval_runtime": 0.6138, "eval_samples_per_second": 74.941, "eval_steps_per_second": 3.258, "step": 480 }, { "epoch": 73.85, "step": 480, "total_flos": 1.6581977329862246e+17, "train_loss": 0.3444344773267706, "train_runtime": 276.4072, "train_samples_per_second": 237.042, "train_steps_per_second": 1.737 } ], "logging_steps": 10, "max_steps": 480, "num_input_tokens_seen": 0, "num_train_epochs": 80, "save_steps": 500, "total_flos": 1.6581977329862246e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }