{ "best_metric": 0.18984103202819824, "best_model_checkpoint": "/kaggle/working/model/checkpoint-1900", "epoch": 4.0, "eval_steps": 100, "global_step": 3852, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 0.00019948078920041537, "loss": 0.6119, "step": 10 }, { "epoch": 0.02, "learning_rate": 0.00019896157840083075, "loss": 0.556, "step": 20 }, { "epoch": 0.03, "learning_rate": 0.00019844236760124614, "loss": 0.4675, "step": 30 }, { "epoch": 0.04, "learning_rate": 0.0001979231568016615, "loss": 0.4472, "step": 40 }, { "epoch": 0.05, "learning_rate": 0.00019740394600207685, "loss": 0.3713, "step": 50 }, { "epoch": 0.06, "learning_rate": 0.0001968847352024922, "loss": 0.339, "step": 60 }, { "epoch": 0.07, "learning_rate": 0.0001963655244029076, "loss": 0.4431, "step": 70 }, { "epoch": 0.08, "learning_rate": 0.00019584631360332295, "loss": 0.4784, "step": 80 }, { "epoch": 0.09, "learning_rate": 0.00019532710280373834, "loss": 0.4167, "step": 90 }, { "epoch": 0.1, "learning_rate": 0.0001948078920041537, "loss": 0.2732, "step": 100 }, { "epoch": 0.1, "eval_accuracy": 0.8460839266777761, "eval_loss": 0.39686205983161926, "eval_runtime": 106.8457, "eval_samples_per_second": 61.781, "eval_steps_per_second": 7.731, "step": 100 }, { "epoch": 0.11, "learning_rate": 0.00019428868120456908, "loss": 0.5051, "step": 110 }, { "epoch": 0.12, "learning_rate": 0.00019376947040498444, "loss": 0.306, "step": 120 }, { "epoch": 0.13, "learning_rate": 0.0001932502596053998, "loss": 0.3317, "step": 130 }, { "epoch": 0.15, "learning_rate": 0.00019273104880581515, "loss": 0.4264, "step": 140 }, { "epoch": 0.16, "learning_rate": 0.00019221183800623054, "loss": 0.4038, "step": 150 }, { "epoch": 0.17, "learning_rate": 0.00019169262720664592, "loss": 0.4715, "step": 160 }, { "epoch": 0.18, "learning_rate": 0.00019117341640706128, "loss": 0.3561, "step": 170 }, { "epoch": 0.19, "learning_rate": 0.00019065420560747664, "loss": 0.3471, "step": 180 }, { "epoch": 0.2, "learning_rate": 0.00019013499480789202, "loss": 0.3163, "step": 190 }, { "epoch": 0.21, "learning_rate": 0.00018961578400830738, "loss": 0.2784, "step": 200 }, { "epoch": 0.21, "eval_accuracy": 0.8579003181336161, "eval_loss": 0.37141430377960205, "eval_runtime": 106.5876, "eval_samples_per_second": 61.93, "eval_steps_per_second": 7.749, "step": 200 }, { "epoch": 0.22, "learning_rate": 0.00018909657320872276, "loss": 0.4061, "step": 210 }, { "epoch": 0.23, "learning_rate": 0.00018857736240913812, "loss": 0.2339, "step": 220 }, { "epoch": 0.24, "learning_rate": 0.00018805815160955348, "loss": 0.3515, "step": 230 }, { "epoch": 0.25, "learning_rate": 0.00018753894080996886, "loss": 0.2553, "step": 240 }, { "epoch": 0.26, "learning_rate": 0.00018701973001038422, "loss": 0.4057, "step": 250 }, { "epoch": 0.27, "learning_rate": 0.00018650051921079958, "loss": 0.2917, "step": 260 }, { "epoch": 0.28, "learning_rate": 0.00018598130841121496, "loss": 0.384, "step": 270 }, { "epoch": 0.29, "learning_rate": 0.00018546209761163035, "loss": 0.3622, "step": 280 }, { "epoch": 0.3, "learning_rate": 0.0001849428868120457, "loss": 0.3697, "step": 290 }, { "epoch": 0.31, "learning_rate": 0.00018442367601246106, "loss": 0.301, "step": 300 }, { "epoch": 0.31, "eval_accuracy": 0.8376003635812755, "eval_loss": 0.35040077567100525, "eval_runtime": 106.1282, "eval_samples_per_second": 62.198, "eval_steps_per_second": 7.783, "step": 300 }, { "epoch": 0.32, "learning_rate": 0.00018390446521287642, "loss": 0.3128, "step": 310 }, { "epoch": 0.33, "learning_rate": 0.0001833852544132918, "loss": 0.2876, "step": 320 }, { "epoch": 0.34, "learning_rate": 0.0001828660436137072, "loss": 0.2879, "step": 330 }, { "epoch": 0.35, "learning_rate": 0.00018234683281412255, "loss": 0.2336, "step": 340 }, { "epoch": 0.36, "learning_rate": 0.0001818276220145379, "loss": 0.3879, "step": 350 }, { "epoch": 0.37, "learning_rate": 0.0001813084112149533, "loss": 0.333, "step": 360 }, { "epoch": 0.38, "learning_rate": 0.00018078920041536865, "loss": 0.3831, "step": 370 }, { "epoch": 0.39, "learning_rate": 0.000180269989615784, "loss": 0.2895, "step": 380 }, { "epoch": 0.4, "learning_rate": 0.0001797507788161994, "loss": 0.3258, "step": 390 }, { "epoch": 0.42, "learning_rate": 0.00017923156801661475, "loss": 0.2372, "step": 400 }, { "epoch": 0.42, "eval_accuracy": 0.8812301166489925, "eval_loss": 0.3391087055206299, "eval_runtime": 106.4818, "eval_samples_per_second": 61.992, "eval_steps_per_second": 7.757, "step": 400 }, { "epoch": 0.43, "learning_rate": 0.00017871235721703013, "loss": 0.4028, "step": 410 }, { "epoch": 0.44, "learning_rate": 0.0001781931464174455, "loss": 0.244, "step": 420 }, { "epoch": 0.45, "learning_rate": 0.00017767393561786085, "loss": 0.2848, "step": 430 }, { "epoch": 0.46, "learning_rate": 0.00017715472481827623, "loss": 0.3357, "step": 440 }, { "epoch": 0.47, "learning_rate": 0.00017663551401869162, "loss": 0.3361, "step": 450 }, { "epoch": 0.48, "learning_rate": 0.00017611630321910697, "loss": 0.3148, "step": 460 }, { "epoch": 0.49, "learning_rate": 0.00017559709241952233, "loss": 0.3378, "step": 470 }, { "epoch": 0.5, "learning_rate": 0.0001750778816199377, "loss": 0.3758, "step": 480 }, { "epoch": 0.51, "learning_rate": 0.00017455867082035307, "loss": 0.2882, "step": 490 }, { "epoch": 0.52, "learning_rate": 0.00017403946002076843, "loss": 0.3136, "step": 500 }, { "epoch": 0.52, "eval_accuracy": 0.8966823208604757, "eval_loss": 0.2559354901313782, "eval_runtime": 106.8533, "eval_samples_per_second": 61.776, "eval_steps_per_second": 7.73, "step": 500 }, { "epoch": 0.53, "learning_rate": 0.00017352024922118382, "loss": 0.2467, "step": 510 }, { "epoch": 0.54, "learning_rate": 0.00017300103842159917, "loss": 0.2569, "step": 520 }, { "epoch": 0.55, "learning_rate": 0.00017248182762201456, "loss": 0.2846, "step": 530 }, { "epoch": 0.56, "learning_rate": 0.00017196261682242992, "loss": 0.3017, "step": 540 }, { "epoch": 0.57, "learning_rate": 0.00017144340602284527, "loss": 0.2795, "step": 550 }, { "epoch": 0.58, "learning_rate": 0.00017092419522326063, "loss": 0.3047, "step": 560 }, { "epoch": 0.59, "learning_rate": 0.00017040498442367602, "loss": 0.3351, "step": 570 }, { "epoch": 0.6, "learning_rate": 0.0001698857736240914, "loss": 0.2825, "step": 580 }, { "epoch": 0.61, "learning_rate": 0.00016936656282450676, "loss": 0.2019, "step": 590 }, { "epoch": 0.62, "learning_rate": 0.00016884735202492212, "loss": 0.3517, "step": 600 }, { "epoch": 0.62, "eval_accuracy": 0.8397212543554007, "eval_loss": 0.41408056020736694, "eval_runtime": 104.998, "eval_samples_per_second": 62.868, "eval_steps_per_second": 7.867, "step": 600 }, { "epoch": 0.63, "learning_rate": 0.0001683281412253375, "loss": 0.3487, "step": 610 }, { "epoch": 0.64, "learning_rate": 0.00016780893042575286, "loss": 0.2473, "step": 620 }, { "epoch": 0.65, "learning_rate": 0.00016728971962616824, "loss": 0.2432, "step": 630 }, { "epoch": 0.66, "learning_rate": 0.0001667705088265836, "loss": 0.3265, "step": 640 }, { "epoch": 0.67, "learning_rate": 0.00016625129802699896, "loss": 0.2954, "step": 650 }, { "epoch": 0.69, "learning_rate": 0.00016573208722741434, "loss": 0.2389, "step": 660 }, { "epoch": 0.7, "learning_rate": 0.0001652128764278297, "loss": 0.2444, "step": 670 }, { "epoch": 0.71, "learning_rate": 0.00016469366562824509, "loss": 0.2776, "step": 680 }, { "epoch": 0.72, "learning_rate": 0.00016417445482866044, "loss": 0.3284, "step": 690 }, { "epoch": 0.73, "learning_rate": 0.00016365524402907583, "loss": 0.3312, "step": 700 }, { "epoch": 0.73, "eval_accuracy": 0.8841084684138767, "eval_loss": 0.3043135702610016, "eval_runtime": 108.7426, "eval_samples_per_second": 60.703, "eval_steps_per_second": 7.596, "step": 700 }, { "epoch": 0.74, "learning_rate": 0.00016313603322949118, "loss": 0.1868, "step": 710 }, { "epoch": 0.75, "learning_rate": 0.00016261682242990654, "loss": 0.3254, "step": 720 }, { "epoch": 0.76, "learning_rate": 0.0001620976116303219, "loss": 0.3054, "step": 730 }, { "epoch": 0.77, "learning_rate": 0.00016157840083073728, "loss": 0.1782, "step": 740 }, { "epoch": 0.78, "learning_rate": 0.00016105919003115267, "loss": 0.2538, "step": 750 }, { "epoch": 0.79, "learning_rate": 0.00016053997923156803, "loss": 0.2088, "step": 760 }, { "epoch": 0.8, "learning_rate": 0.00016002076843198338, "loss": 0.3186, "step": 770 }, { "epoch": 0.81, "learning_rate": 0.00015950155763239877, "loss": 0.3071, "step": 780 }, { "epoch": 0.82, "learning_rate": 0.00015898234683281413, "loss": 0.2838, "step": 790 }, { "epoch": 0.83, "learning_rate": 0.0001584631360332295, "loss": 0.2515, "step": 800 }, { "epoch": 0.83, "eval_accuracy": 0.9062263293440388, "eval_loss": 0.2541460394859314, "eval_runtime": 106.4027, "eval_samples_per_second": 62.038, "eval_steps_per_second": 7.763, "step": 800 }, { "epoch": 0.84, "learning_rate": 0.00015794392523364487, "loss": 0.3887, "step": 810 }, { "epoch": 0.85, "learning_rate": 0.00015742471443406023, "loss": 0.2235, "step": 820 }, { "epoch": 0.86, "learning_rate": 0.0001569055036344756, "loss": 0.2313, "step": 830 }, { "epoch": 0.87, "learning_rate": 0.00015638629283489097, "loss": 0.274, "step": 840 }, { "epoch": 0.88, "learning_rate": 0.00015586708203530633, "loss": 0.2223, "step": 850 }, { "epoch": 0.89, "learning_rate": 0.0001553478712357217, "loss": 0.3155, "step": 860 }, { "epoch": 0.9, "learning_rate": 0.0001548286604361371, "loss": 0.2737, "step": 870 }, { "epoch": 0.91, "learning_rate": 0.00015430944963655245, "loss": 0.2564, "step": 880 }, { "epoch": 0.92, "learning_rate": 0.0001537902388369678, "loss": 0.2587, "step": 890 }, { "epoch": 0.93, "learning_rate": 0.00015327102803738317, "loss": 0.2854, "step": 900 }, { "epoch": 0.93, "eval_accuracy": 0.9006211180124224, "eval_loss": 0.2561000883579254, "eval_runtime": 107.0006, "eval_samples_per_second": 61.691, "eval_steps_per_second": 7.72, "step": 900 }, { "epoch": 0.94, "learning_rate": 0.00015275181723779855, "loss": 0.2348, "step": 910 }, { "epoch": 0.96, "learning_rate": 0.00015223260643821394, "loss": 0.2356, "step": 920 }, { "epoch": 0.97, "learning_rate": 0.0001517133956386293, "loss": 0.2324, "step": 930 }, { "epoch": 0.98, "learning_rate": 0.00015119418483904465, "loss": 0.265, "step": 940 }, { "epoch": 0.99, "learning_rate": 0.00015067497403946004, "loss": 0.2554, "step": 950 }, { "epoch": 1.0, "learning_rate": 0.0001501557632398754, "loss": 0.2845, "step": 960 }, { "epoch": 1.01, "learning_rate": 0.00014963655244029075, "loss": 0.3135, "step": 970 }, { "epoch": 1.02, "learning_rate": 0.00014911734164070614, "loss": 0.1335, "step": 980 }, { "epoch": 1.03, "learning_rate": 0.0001485981308411215, "loss": 0.3072, "step": 990 }, { "epoch": 1.04, "learning_rate": 0.00014807892004153688, "loss": 0.2594, "step": 1000 }, { "epoch": 1.04, "eval_accuracy": 0.9019845477957885, "eval_loss": 0.26812365651130676, "eval_runtime": 106.2371, "eval_samples_per_second": 62.135, "eval_steps_per_second": 7.775, "step": 1000 }, { "epoch": 1.05, "learning_rate": 0.00014755970924195224, "loss": 0.1768, "step": 1010 }, { "epoch": 1.06, "learning_rate": 0.0001470404984423676, "loss": 0.187, "step": 1020 }, { "epoch": 1.07, "learning_rate": 0.00014652128764278298, "loss": 0.1335, "step": 1030 }, { "epoch": 1.08, "learning_rate": 0.00014600207684319837, "loss": 0.1969, "step": 1040 }, { "epoch": 1.09, "learning_rate": 0.00014548286604361372, "loss": 0.228, "step": 1050 }, { "epoch": 1.1, "learning_rate": 0.00014496365524402908, "loss": 0.1665, "step": 1060 }, { "epoch": 1.11, "learning_rate": 0.00014444444444444444, "loss": 0.1743, "step": 1070 }, { "epoch": 1.12, "learning_rate": 0.00014392523364485982, "loss": 0.228, "step": 1080 }, { "epoch": 1.13, "learning_rate": 0.00014340602284527518, "loss": 0.1356, "step": 1090 }, { "epoch": 1.14, "learning_rate": 0.00014288681204569056, "loss": 0.177, "step": 1100 }, { "epoch": 1.14, "eval_accuracy": 0.8772913194970459, "eval_loss": 0.3406391143798828, "eval_runtime": 105.7842, "eval_samples_per_second": 62.401, "eval_steps_per_second": 7.808, "step": 1100 }, { "epoch": 1.15, "learning_rate": 0.00014236760124610592, "loss": 0.1793, "step": 1110 }, { "epoch": 1.16, "learning_rate": 0.0001418483904465213, "loss": 0.2743, "step": 1120 }, { "epoch": 1.17, "learning_rate": 0.00014132917964693666, "loss": 0.3143, "step": 1130 }, { "epoch": 1.18, "learning_rate": 0.00014080996884735202, "loss": 0.1482, "step": 1140 }, { "epoch": 1.19, "learning_rate": 0.00014029075804776738, "loss": 0.1937, "step": 1150 }, { "epoch": 1.2, "learning_rate": 0.00013977154724818276, "loss": 0.1386, "step": 1160 }, { "epoch": 1.21, "learning_rate": 0.00013925233644859815, "loss": 0.1427, "step": 1170 }, { "epoch": 1.23, "learning_rate": 0.0001387331256490135, "loss": 0.1565, "step": 1180 }, { "epoch": 1.24, "learning_rate": 0.00013821391484942886, "loss": 0.2196, "step": 1190 }, { "epoch": 1.25, "learning_rate": 0.00013769470404984425, "loss": 0.2717, "step": 1200 }, { "epoch": 1.25, "eval_accuracy": 0.917133767610968, "eval_loss": 0.22659926116466522, "eval_runtime": 105.225, "eval_samples_per_second": 62.732, "eval_steps_per_second": 7.85, "step": 1200 }, { "epoch": 1.26, "learning_rate": 0.0001371754932502596, "loss": 0.2137, "step": 1210 }, { "epoch": 1.27, "learning_rate": 0.000136656282450675, "loss": 0.1641, "step": 1220 }, { "epoch": 1.28, "learning_rate": 0.00013613707165109035, "loss": 0.1665, "step": 1230 }, { "epoch": 1.29, "learning_rate": 0.0001356178608515057, "loss": 0.1631, "step": 1240 }, { "epoch": 1.3, "learning_rate": 0.0001350986500519211, "loss": 0.2048, "step": 1250 }, { "epoch": 1.31, "learning_rate": 0.00013457943925233645, "loss": 0.1899, "step": 1260 }, { "epoch": 1.32, "learning_rate": 0.0001340602284527518, "loss": 0.1521, "step": 1270 }, { "epoch": 1.33, "learning_rate": 0.0001335410176531672, "loss": 0.1784, "step": 1280 }, { "epoch": 1.34, "learning_rate": 0.00013302180685358258, "loss": 0.122, "step": 1290 }, { "epoch": 1.35, "learning_rate": 0.00013250259605399793, "loss": 0.2197, "step": 1300 }, { "epoch": 1.35, "eval_accuracy": 0.9236479321314952, "eval_loss": 0.20801472663879395, "eval_runtime": 105.552, "eval_samples_per_second": 62.538, "eval_steps_per_second": 7.826, "step": 1300 }, { "epoch": 1.36, "learning_rate": 0.0001319833852544133, "loss": 0.0729, "step": 1310 }, { "epoch": 1.37, "learning_rate": 0.00013146417445482865, "loss": 0.2226, "step": 1320 }, { "epoch": 1.38, "learning_rate": 0.00013094496365524403, "loss": 0.1275, "step": 1330 }, { "epoch": 1.39, "learning_rate": 0.00013042575285565942, "loss": 0.2397, "step": 1340 }, { "epoch": 1.4, "learning_rate": 0.00012990654205607478, "loss": 0.153, "step": 1350 }, { "epoch": 1.41, "learning_rate": 0.00012938733125649013, "loss": 0.178, "step": 1360 }, { "epoch": 1.42, "learning_rate": 0.00012886812045690552, "loss": 0.2325, "step": 1370 }, { "epoch": 1.43, "learning_rate": 0.00012834890965732088, "loss": 0.2731, "step": 1380 }, { "epoch": 1.44, "learning_rate": 0.00012782969885773626, "loss": 0.1804, "step": 1390 }, { "epoch": 1.45, "learning_rate": 0.00012731048805815162, "loss": 0.155, "step": 1400 }, { "epoch": 1.45, "eval_accuracy": 0.9236479321314952, "eval_loss": 0.20478595793247223, "eval_runtime": 104.1144, "eval_samples_per_second": 63.401, "eval_steps_per_second": 7.934, "step": 1400 }, { "epoch": 1.46, "learning_rate": 0.00012679127725856698, "loss": 0.1645, "step": 1410 }, { "epoch": 1.47, "learning_rate": 0.00012627206645898236, "loss": 0.2205, "step": 1420 }, { "epoch": 1.48, "learning_rate": 0.00012575285565939772, "loss": 0.1507, "step": 1430 }, { "epoch": 1.5, "learning_rate": 0.00012523364485981308, "loss": 0.4007, "step": 1440 }, { "epoch": 1.51, "learning_rate": 0.00012471443406022846, "loss": 0.1829, "step": 1450 }, { "epoch": 1.52, "learning_rate": 0.00012419522326064384, "loss": 0.1755, "step": 1460 }, { "epoch": 1.53, "learning_rate": 0.0001236760124610592, "loss": 0.0943, "step": 1470 }, { "epoch": 1.54, "learning_rate": 0.00012315680166147456, "loss": 0.1359, "step": 1480 }, { "epoch": 1.55, "learning_rate": 0.00012263759086188992, "loss": 0.2615, "step": 1490 }, { "epoch": 1.56, "learning_rate": 0.0001221183800623053, "loss": 0.2657, "step": 1500 }, { "epoch": 1.56, "eval_accuracy": 0.9256173307074685, "eval_loss": 0.2036730796098709, "eval_runtime": 104.5196, "eval_samples_per_second": 63.156, "eval_steps_per_second": 7.903, "step": 1500 }, { "epoch": 1.57, "learning_rate": 0.00012159916926272067, "loss": 0.1653, "step": 1510 }, { "epoch": 1.58, "learning_rate": 0.00012107995846313604, "loss": 0.2066, "step": 1520 }, { "epoch": 1.59, "learning_rate": 0.00012056074766355142, "loss": 0.1706, "step": 1530 }, { "epoch": 1.6, "learning_rate": 0.00012004153686396677, "loss": 0.2103, "step": 1540 }, { "epoch": 1.61, "learning_rate": 0.00011952232606438214, "loss": 0.2336, "step": 1550 }, { "epoch": 1.62, "learning_rate": 0.0001190031152647975, "loss": 0.1792, "step": 1560 }, { "epoch": 1.63, "learning_rate": 0.00011848390446521289, "loss": 0.0966, "step": 1570 }, { "epoch": 1.64, "learning_rate": 0.00011796469366562826, "loss": 0.1713, "step": 1580 }, { "epoch": 1.65, "learning_rate": 0.00011744548286604362, "loss": 0.1289, "step": 1590 }, { "epoch": 1.66, "learning_rate": 0.00011692627206645899, "loss": 0.118, "step": 1600 }, { "epoch": 1.66, "eval_accuracy": 0.9095591577033783, "eval_loss": 0.26160144805908203, "eval_runtime": 104.9104, "eval_samples_per_second": 62.92, "eval_steps_per_second": 7.873, "step": 1600 }, { "epoch": 1.67, "learning_rate": 0.00011640706126687434, "loss": 0.1872, "step": 1610 }, { "epoch": 1.68, "learning_rate": 0.00011588785046728972, "loss": 0.152, "step": 1620 }, { "epoch": 1.69, "learning_rate": 0.0001153686396677051, "loss": 0.1337, "step": 1630 }, { "epoch": 1.7, "learning_rate": 0.00011484942886812047, "loss": 0.1553, "step": 1640 }, { "epoch": 1.71, "learning_rate": 0.00011433021806853583, "loss": 0.1428, "step": 1650 }, { "epoch": 1.72, "learning_rate": 0.0001138110072689512, "loss": 0.1911, "step": 1660 }, { "epoch": 1.73, "learning_rate": 0.00011329179646936656, "loss": 0.1263, "step": 1670 }, { "epoch": 1.74, "learning_rate": 0.00011277258566978193, "loss": 0.1414, "step": 1680 }, { "epoch": 1.75, "learning_rate": 0.00011225337487019731, "loss": 0.2111, "step": 1690 }, { "epoch": 1.77, "learning_rate": 0.00011173416407061268, "loss": 0.1823, "step": 1700 }, { "epoch": 1.77, "eval_accuracy": 0.9241024087259506, "eval_loss": 0.21581591665744781, "eval_runtime": 104.5278, "eval_samples_per_second": 63.151, "eval_steps_per_second": 7.902, "step": 1700 }, { "epoch": 1.78, "learning_rate": 0.00011121495327102804, "loss": 0.1266, "step": 1710 }, { "epoch": 1.79, "learning_rate": 0.00011069574247144341, "loss": 0.1765, "step": 1720 }, { "epoch": 1.8, "learning_rate": 0.00011017653167185877, "loss": 0.1983, "step": 1730 }, { "epoch": 1.81, "learning_rate": 0.00010965732087227414, "loss": 0.253, "step": 1740 }, { "epoch": 1.82, "learning_rate": 0.00010913811007268953, "loss": 0.1542, "step": 1750 }, { "epoch": 1.83, "learning_rate": 0.00010861889927310488, "loss": 0.1171, "step": 1760 }, { "epoch": 1.84, "learning_rate": 0.00010809968847352026, "loss": 0.1701, "step": 1770 }, { "epoch": 1.85, "learning_rate": 0.00010758047767393563, "loss": 0.1689, "step": 1780 }, { "epoch": 1.86, "learning_rate": 0.00010706126687435098, "loss": 0.1405, "step": 1790 }, { "epoch": 1.87, "learning_rate": 0.00010654205607476636, "loss": 0.2175, "step": 1800 }, { "epoch": 1.87, "eval_accuracy": 0.9181942129980306, "eval_loss": 0.21594561636447906, "eval_runtime": 103.9435, "eval_samples_per_second": 63.506, "eval_steps_per_second": 7.947, "step": 1800 }, { "epoch": 1.88, "learning_rate": 0.00010602284527518174, "loss": 0.1923, "step": 1810 }, { "epoch": 1.89, "learning_rate": 0.0001055036344755971, "loss": 0.2476, "step": 1820 }, { "epoch": 1.9, "learning_rate": 0.00010498442367601247, "loss": 0.2243, "step": 1830 }, { "epoch": 1.91, "learning_rate": 0.00010446521287642783, "loss": 0.2611, "step": 1840 }, { "epoch": 1.92, "learning_rate": 0.0001039460020768432, "loss": 0.2186, "step": 1850 }, { "epoch": 1.93, "learning_rate": 0.00010342679127725856, "loss": 0.141, "step": 1860 }, { "epoch": 1.94, "learning_rate": 0.00010290758047767395, "loss": 0.1086, "step": 1870 }, { "epoch": 1.95, "learning_rate": 0.00010238836967808931, "loss": 0.1948, "step": 1880 }, { "epoch": 1.96, "learning_rate": 0.00010186915887850468, "loss": 0.166, "step": 1890 }, { "epoch": 1.97, "learning_rate": 0.00010134994807892004, "loss": 0.143, "step": 1900 }, { "epoch": 1.97, "eval_accuracy": 0.9242539009241024, "eval_loss": 0.18984103202819824, "eval_runtime": 107.0187, "eval_samples_per_second": 61.681, "eval_steps_per_second": 7.718, "step": 1900 }, { "epoch": 1.98, "learning_rate": 0.00010083073727933541, "loss": 0.1538, "step": 1910 }, { "epoch": 1.99, "learning_rate": 0.00010031152647975077, "loss": 0.1682, "step": 1920 }, { "epoch": 2.0, "learning_rate": 9.979231568016615e-05, "loss": 0.09, "step": 1930 }, { "epoch": 2.01, "learning_rate": 9.927310488058152e-05, "loss": 0.0338, "step": 1940 }, { "epoch": 2.02, "learning_rate": 9.87538940809969e-05, "loss": 0.1026, "step": 1950 }, { "epoch": 2.04, "learning_rate": 9.823468328141225e-05, "loss": 0.1165, "step": 1960 }, { "epoch": 2.05, "learning_rate": 9.771547248182762e-05, "loss": 0.0652, "step": 1970 }, { "epoch": 2.06, "learning_rate": 9.7196261682243e-05, "loss": 0.1643, "step": 1980 }, { "epoch": 2.07, "learning_rate": 9.667705088265837e-05, "loss": 0.0903, "step": 1990 }, { "epoch": 2.08, "learning_rate": 9.615784008307374e-05, "loss": 0.1051, "step": 2000 }, { "epoch": 2.08, "eval_accuracy": 0.9225874867444327, "eval_loss": 0.23080797493457794, "eval_runtime": 105.3241, "eval_samples_per_second": 62.673, "eval_steps_per_second": 7.842, "step": 2000 }, { "epoch": 2.09, "learning_rate": 9.56386292834891e-05, "loss": 0.1135, "step": 2010 }, { "epoch": 2.1, "learning_rate": 9.511941848390447e-05, "loss": 0.0586, "step": 2020 }, { "epoch": 2.11, "learning_rate": 9.460020768431984e-05, "loss": 0.1016, "step": 2030 }, { "epoch": 2.12, "learning_rate": 9.408099688473521e-05, "loss": 0.0808, "step": 2040 }, { "epoch": 2.13, "learning_rate": 9.356178608515057e-05, "loss": 0.0502, "step": 2050 }, { "epoch": 2.14, "learning_rate": 9.304257528556595e-05, "loss": 0.0712, "step": 2060 }, { "epoch": 2.15, "learning_rate": 9.252336448598131e-05, "loss": 0.0938, "step": 2070 }, { "epoch": 2.16, "learning_rate": 9.200415368639668e-05, "loss": 0.0539, "step": 2080 }, { "epoch": 2.17, "learning_rate": 9.148494288681205e-05, "loss": 0.1137, "step": 2090 }, { "epoch": 2.18, "learning_rate": 9.096573208722742e-05, "loss": 0.1963, "step": 2100 }, { "epoch": 2.18, "eval_accuracy": 0.9204665959703076, "eval_loss": 0.23536920547485352, "eval_runtime": 106.2404, "eval_samples_per_second": 62.133, "eval_steps_per_second": 7.775, "step": 2100 }, { "epoch": 2.19, "learning_rate": 9.044652128764278e-05, "loss": 0.1132, "step": 2110 }, { "epoch": 2.2, "learning_rate": 8.992731048805816e-05, "loss": 0.0318, "step": 2120 }, { "epoch": 2.21, "learning_rate": 8.940809968847352e-05, "loss": 0.0562, "step": 2130 }, { "epoch": 2.22, "learning_rate": 8.888888888888889e-05, "loss": 0.0991, "step": 2140 }, { "epoch": 2.23, "learning_rate": 8.836967808930426e-05, "loss": 0.1263, "step": 2150 }, { "epoch": 2.24, "learning_rate": 8.785046728971964e-05, "loss": 0.093, "step": 2160 }, { "epoch": 2.25, "learning_rate": 8.733125649013499e-05, "loss": 0.0992, "step": 2170 }, { "epoch": 2.26, "learning_rate": 8.681204569055036e-05, "loss": 0.0787, "step": 2180 }, { "epoch": 2.27, "learning_rate": 8.629283489096574e-05, "loss": 0.1306, "step": 2190 }, { "epoch": 2.28, "learning_rate": 8.577362409138109e-05, "loss": 0.0524, "step": 2200 }, { "epoch": 2.28, "eval_accuracy": 0.9281926980760491, "eval_loss": 0.22982539236545563, "eval_runtime": 105.4663, "eval_samples_per_second": 62.589, "eval_steps_per_second": 7.832, "step": 2200 }, { "epoch": 2.29, "learning_rate": 8.525441329179648e-05, "loss": 0.1902, "step": 2210 }, { "epoch": 2.31, "learning_rate": 8.473520249221184e-05, "loss": 0.0755, "step": 2220 }, { "epoch": 2.32, "learning_rate": 8.42159916926272e-05, "loss": 0.0826, "step": 2230 }, { "epoch": 2.33, "learning_rate": 8.369678089304258e-05, "loss": 0.1252, "step": 2240 }, { "epoch": 2.34, "learning_rate": 8.317757009345795e-05, "loss": 0.0848, "step": 2250 }, { "epoch": 2.35, "learning_rate": 8.265835929387332e-05, "loss": 0.0134, "step": 2260 }, { "epoch": 2.36, "learning_rate": 8.213914849428869e-05, "loss": 0.1297, "step": 2270 }, { "epoch": 2.37, "learning_rate": 8.161993769470405e-05, "loss": 0.0633, "step": 2280 }, { "epoch": 2.38, "learning_rate": 8.110072689511943e-05, "loss": 0.0625, "step": 2290 }, { "epoch": 2.39, "learning_rate": 8.058151609553479e-05, "loss": 0.097, "step": 2300 }, { "epoch": 2.39, "eval_accuracy": 0.9241024087259506, "eval_loss": 0.24952168762683868, "eval_runtime": 105.3776, "eval_samples_per_second": 62.641, "eval_steps_per_second": 7.838, "step": 2300 }, { "epoch": 2.4, "learning_rate": 8.006230529595016e-05, "loss": 0.0439, "step": 2310 }, { "epoch": 2.41, "learning_rate": 7.954309449636553e-05, "loss": 0.0743, "step": 2320 }, { "epoch": 2.42, "learning_rate": 7.90238836967809e-05, "loss": 0.1643, "step": 2330 }, { "epoch": 2.43, "learning_rate": 7.850467289719626e-05, "loss": 0.1348, "step": 2340 }, { "epoch": 2.44, "learning_rate": 7.798546209761163e-05, "loss": 0.1049, "step": 2350 }, { "epoch": 2.45, "learning_rate": 7.7466251298027e-05, "loss": 0.1646, "step": 2360 }, { "epoch": 2.46, "learning_rate": 7.694704049844238e-05, "loss": 0.123, "step": 2370 }, { "epoch": 2.47, "learning_rate": 7.642782969885775e-05, "loss": 0.0567, "step": 2380 }, { "epoch": 2.48, "learning_rate": 7.59086188992731e-05, "loss": 0.1242, "step": 2390 }, { "epoch": 2.49, "learning_rate": 7.538940809968848e-05, "loss": 0.0744, "step": 2400 }, { "epoch": 2.49, "eval_accuracy": 0.9194061505832449, "eval_loss": 0.24930374324321747, "eval_runtime": 104.785, "eval_samples_per_second": 62.996, "eval_steps_per_second": 7.883, "step": 2400 }, { "epoch": 2.5, "learning_rate": 7.487019730010385e-05, "loss": 0.0935, "step": 2410 }, { "epoch": 2.51, "learning_rate": 7.435098650051922e-05, "loss": 0.0632, "step": 2420 }, { "epoch": 2.52, "learning_rate": 7.383177570093458e-05, "loss": 0.0975, "step": 2430 }, { "epoch": 2.53, "learning_rate": 7.331256490134996e-05, "loss": 0.0355, "step": 2440 }, { "epoch": 2.54, "learning_rate": 7.279335410176532e-05, "loss": 0.0485, "step": 2450 }, { "epoch": 2.55, "learning_rate": 7.227414330218069e-05, "loss": 0.0503, "step": 2460 }, { "epoch": 2.56, "learning_rate": 7.175493250259606e-05, "loss": 0.0745, "step": 2470 }, { "epoch": 2.58, "learning_rate": 7.123572170301143e-05, "loss": 0.1093, "step": 2480 }, { "epoch": 2.59, "learning_rate": 7.071651090342679e-05, "loss": 0.0554, "step": 2490 }, { "epoch": 2.6, "learning_rate": 7.019730010384217e-05, "loss": 0.0744, "step": 2500 }, { "epoch": 2.6, "eval_accuracy": 0.9322829874261476, "eval_loss": 0.2428741455078125, "eval_runtime": 104.9683, "eval_samples_per_second": 62.886, "eval_steps_per_second": 7.869, "step": 2500 }, { "epoch": 2.61, "learning_rate": 6.967808930425753e-05, "loss": 0.0998, "step": 2510 }, { "epoch": 2.62, "learning_rate": 6.91588785046729e-05, "loss": 0.0561, "step": 2520 }, { "epoch": 2.63, "learning_rate": 6.863966770508827e-05, "loss": 0.0452, "step": 2530 }, { "epoch": 2.64, "learning_rate": 6.812045690550364e-05, "loss": 0.1821, "step": 2540 }, { "epoch": 2.65, "learning_rate": 6.7601246105919e-05, "loss": 0.0598, "step": 2550 }, { "epoch": 2.66, "learning_rate": 6.708203530633437e-05, "loss": 0.0722, "step": 2560 }, { "epoch": 2.67, "learning_rate": 6.656282450674974e-05, "loss": 0.0532, "step": 2570 }, { "epoch": 2.68, "learning_rate": 6.60436137071651e-05, "loss": 0.0644, "step": 2580 }, { "epoch": 2.69, "learning_rate": 6.552440290758049e-05, "loss": 0.1235, "step": 2590 }, { "epoch": 2.7, "learning_rate": 6.500519210799584e-05, "loss": 0.0345, "step": 2600 }, { "epoch": 2.7, "eval_accuracy": 0.9251628541130131, "eval_loss": 0.2586543560028076, "eval_runtime": 104.6212, "eval_samples_per_second": 63.094, "eval_steps_per_second": 7.895, "step": 2600 }, { "epoch": 2.71, "learning_rate": 6.448598130841122e-05, "loss": 0.1686, "step": 2610 }, { "epoch": 2.72, "learning_rate": 6.396677050882659e-05, "loss": 0.0775, "step": 2620 }, { "epoch": 2.73, "learning_rate": 6.344755970924196e-05, "loss": 0.0956, "step": 2630 }, { "epoch": 2.74, "learning_rate": 6.292834890965732e-05, "loss": 0.135, "step": 2640 }, { "epoch": 2.75, "learning_rate": 6.24091381100727e-05, "loss": 0.1582, "step": 2650 }, { "epoch": 2.76, "learning_rate": 6.188992731048806e-05, "loss": 0.0668, "step": 2660 }, { "epoch": 2.77, "learning_rate": 6.137071651090343e-05, "loss": 0.1317, "step": 2670 }, { "epoch": 2.78, "learning_rate": 6.08515057113188e-05, "loss": 0.0756, "step": 2680 }, { "epoch": 2.79, "learning_rate": 6.0332294911734164e-05, "loss": 0.0565, "step": 2690 }, { "epoch": 2.8, "learning_rate": 5.981308411214953e-05, "loss": 0.0097, "step": 2700 }, { "epoch": 2.8, "eval_accuracy": 0.9265262838963794, "eval_loss": 0.22841490805149078, "eval_runtime": 104.4263, "eval_samples_per_second": 63.212, "eval_steps_per_second": 7.91, "step": 2700 }, { "epoch": 2.81, "learning_rate": 5.9293873312564906e-05, "loss": 0.0621, "step": 2710 }, { "epoch": 2.82, "learning_rate": 5.877466251298027e-05, "loss": 0.0684, "step": 2720 }, { "epoch": 2.83, "learning_rate": 5.8255451713395635e-05, "loss": 0.0408, "step": 2730 }, { "epoch": 2.85, "learning_rate": 5.773624091381101e-05, "loss": 0.1156, "step": 2740 }, { "epoch": 2.86, "learning_rate": 5.721703011422638e-05, "loss": 0.0228, "step": 2750 }, { "epoch": 2.87, "learning_rate": 5.669781931464174e-05, "loss": 0.1417, "step": 2760 }, { "epoch": 2.88, "learning_rate": 5.617860851505712e-05, "loss": 0.0443, "step": 2770 }, { "epoch": 2.89, "learning_rate": 5.5659397715472484e-05, "loss": 0.0903, "step": 2780 }, { "epoch": 2.9, "learning_rate": 5.514018691588785e-05, "loss": 0.0395, "step": 2790 }, { "epoch": 2.91, "learning_rate": 5.4620976116303226e-05, "loss": 0.0775, "step": 2800 }, { "epoch": 2.91, "eval_accuracy": 0.9321314952279958, "eval_loss": 0.22424499690532684, "eval_runtime": 104.7535, "eval_samples_per_second": 63.015, "eval_steps_per_second": 7.885, "step": 2800 }, { "epoch": 2.92, "learning_rate": 5.410176531671859e-05, "loss": 0.0281, "step": 2810 }, { "epoch": 2.93, "learning_rate": 5.3582554517133955e-05, "loss": 0.0756, "step": 2820 }, { "epoch": 2.94, "learning_rate": 5.306334371754933e-05, "loss": 0.083, "step": 2830 }, { "epoch": 2.95, "learning_rate": 5.25441329179647e-05, "loss": 0.0804, "step": 2840 }, { "epoch": 2.96, "learning_rate": 5.202492211838006e-05, "loss": 0.068, "step": 2850 }, { "epoch": 2.97, "learning_rate": 5.150571131879543e-05, "loss": 0.0549, "step": 2860 }, { "epoch": 2.98, "learning_rate": 5.09865005192108e-05, "loss": 0.1113, "step": 2870 }, { "epoch": 2.99, "learning_rate": 5.046728971962617e-05, "loss": 0.1297, "step": 2880 }, { "epoch": 3.0, "learning_rate": 4.994807892004154e-05, "loss": 0.0397, "step": 2890 }, { "epoch": 3.01, "learning_rate": 4.9428868120456904e-05, "loss": 0.0634, "step": 2900 }, { "epoch": 3.01, "eval_accuracy": 0.9286471746705045, "eval_loss": 0.23135913908481598, "eval_runtime": 104.8164, "eval_samples_per_second": 62.977, "eval_steps_per_second": 7.88, "step": 2900 }, { "epoch": 3.02, "learning_rate": 4.8909657320872275e-05, "loss": 0.02, "step": 2910 }, { "epoch": 3.03, "learning_rate": 4.8390446521287646e-05, "loss": 0.0324, "step": 2920 }, { "epoch": 3.04, "learning_rate": 4.787123572170301e-05, "loss": 0.0064, "step": 2930 }, { "epoch": 3.05, "learning_rate": 4.735202492211838e-05, "loss": 0.0063, "step": 2940 }, { "epoch": 3.06, "learning_rate": 4.683281412253375e-05, "loss": 0.0372, "step": 2950 }, { "epoch": 3.07, "learning_rate": 4.631360332294912e-05, "loss": 0.0679, "step": 2960 }, { "epoch": 3.08, "learning_rate": 4.579439252336449e-05, "loss": 0.0517, "step": 2970 }, { "epoch": 3.09, "learning_rate": 4.527518172377986e-05, "loss": 0.097, "step": 2980 }, { "epoch": 3.1, "learning_rate": 4.4755970924195224e-05, "loss": 0.1, "step": 2990 }, { "epoch": 3.12, "learning_rate": 4.4236760124610595e-05, "loss": 0.0109, "step": 3000 }, { "epoch": 3.12, "eval_accuracy": 0.9337979094076655, "eval_loss": 0.22030013799667358, "eval_runtime": 105.6713, "eval_samples_per_second": 62.467, "eval_steps_per_second": 7.817, "step": 3000 }, { "epoch": 3.13, "learning_rate": 4.3717549325025966e-05, "loss": 0.0143, "step": 3010 }, { "epoch": 3.14, "learning_rate": 4.319833852544133e-05, "loss": 0.0638, "step": 3020 }, { "epoch": 3.15, "learning_rate": 4.26791277258567e-05, "loss": 0.0213, "step": 3030 }, { "epoch": 3.16, "learning_rate": 4.2159916926272066e-05, "loss": 0.0153, "step": 3040 }, { "epoch": 3.17, "learning_rate": 4.164070612668744e-05, "loss": 0.0052, "step": 3050 }, { "epoch": 3.18, "learning_rate": 4.11214953271028e-05, "loss": 0.0406, "step": 3060 }, { "epoch": 3.19, "learning_rate": 4.060228452751817e-05, "loss": 0.0109, "step": 3070 }, { "epoch": 3.2, "learning_rate": 4.0083073727933544e-05, "loss": 0.0058, "step": 3080 }, { "epoch": 3.21, "learning_rate": 3.956386292834891e-05, "loss": 0.005, "step": 3090 }, { "epoch": 3.22, "learning_rate": 3.904465212876428e-05, "loss": 0.0039, "step": 3100 }, { "epoch": 3.22, "eval_accuracy": 0.9357673079836388, "eval_loss": 0.2575285732746124, "eval_runtime": 105.4895, "eval_samples_per_second": 62.575, "eval_steps_per_second": 7.83, "step": 3100 }, { "epoch": 3.23, "learning_rate": 3.852544132917965e-05, "loss": 0.0284, "step": 3110 }, { "epoch": 3.24, "learning_rate": 3.8006230529595015e-05, "loss": 0.0042, "step": 3120 }, { "epoch": 3.25, "learning_rate": 3.7487019730010386e-05, "loss": 0.0407, "step": 3130 }, { "epoch": 3.26, "learning_rate": 3.696780893042576e-05, "loss": 0.009, "step": 3140 }, { "epoch": 3.27, "learning_rate": 3.644859813084112e-05, "loss": 0.0342, "step": 3150 }, { "epoch": 3.28, "learning_rate": 3.592938733125649e-05, "loss": 0.056, "step": 3160 }, { "epoch": 3.29, "learning_rate": 3.5410176531671864e-05, "loss": 0.0384, "step": 3170 }, { "epoch": 3.3, "learning_rate": 3.489096573208723e-05, "loss": 0.0319, "step": 3180 }, { "epoch": 3.31, "learning_rate": 3.43717549325026e-05, "loss": 0.0619, "step": 3190 }, { "epoch": 3.32, "learning_rate": 3.385254413291797e-05, "loss": 0.0139, "step": 3200 }, { "epoch": 3.32, "eval_accuracy": 0.935615815785487, "eval_loss": 0.25701212882995605, "eval_runtime": 105.4027, "eval_samples_per_second": 62.626, "eval_steps_per_second": 7.837, "step": 3200 }, { "epoch": 3.33, "learning_rate": 3.3333333333333335e-05, "loss": 0.0035, "step": 3210 }, { "epoch": 3.34, "learning_rate": 3.2814122533748706e-05, "loss": 0.0232, "step": 3220 }, { "epoch": 3.35, "learning_rate": 3.229491173416408e-05, "loss": 0.0078, "step": 3230 }, { "epoch": 3.36, "learning_rate": 3.177570093457944e-05, "loss": 0.041, "step": 3240 }, { "epoch": 3.37, "learning_rate": 3.1256490134994806e-05, "loss": 0.0408, "step": 3250 }, { "epoch": 3.39, "learning_rate": 3.073727933541018e-05, "loss": 0.0372, "step": 3260 }, { "epoch": 3.4, "learning_rate": 3.0218068535825545e-05, "loss": 0.0639, "step": 3270 }, { "epoch": 3.41, "learning_rate": 2.9698857736240916e-05, "loss": 0.0205, "step": 3280 }, { "epoch": 3.42, "learning_rate": 2.9179646936656284e-05, "loss": 0.0579, "step": 3290 }, { "epoch": 3.43, "learning_rate": 2.866043613707165e-05, "loss": 0.0358, "step": 3300 }, { "epoch": 3.43, "eval_accuracy": 0.9334949250113619, "eval_loss": 0.2629750072956085, "eval_runtime": 105.1034, "eval_samples_per_second": 62.805, "eval_steps_per_second": 7.859, "step": 3300 }, { "epoch": 3.44, "learning_rate": 2.814122533748702e-05, "loss": 0.0598, "step": 3310 }, { "epoch": 3.45, "learning_rate": 2.762201453790239e-05, "loss": 0.0089, "step": 3320 }, { "epoch": 3.46, "learning_rate": 2.7102803738317755e-05, "loss": 0.0032, "step": 3330 }, { "epoch": 3.47, "learning_rate": 2.6583592938733126e-05, "loss": 0.0039, "step": 3340 }, { "epoch": 3.48, "learning_rate": 2.6064382139148497e-05, "loss": 0.0053, "step": 3350 }, { "epoch": 3.49, "learning_rate": 2.554517133956386e-05, "loss": 0.0093, "step": 3360 }, { "epoch": 3.5, "learning_rate": 2.5025960539979233e-05, "loss": 0.0657, "step": 3370 }, { "epoch": 3.51, "learning_rate": 2.4506749740394604e-05, "loss": 0.0027, "step": 3380 }, { "epoch": 3.52, "learning_rate": 2.398753894080997e-05, "loss": 0.0775, "step": 3390 }, { "epoch": 3.53, "learning_rate": 2.3468328141225336e-05, "loss": 0.0347, "step": 3400 }, { "epoch": 3.53, "eval_accuracy": 0.9357673079836388, "eval_loss": 0.2633425295352936, "eval_runtime": 105.2554, "eval_samples_per_second": 62.714, "eval_steps_per_second": 7.848, "step": 3400 }, { "epoch": 3.54, "learning_rate": 2.2949117341640707e-05, "loss": 0.0288, "step": 3410 }, { "epoch": 3.55, "learning_rate": 2.2429906542056075e-05, "loss": 0.039, "step": 3420 }, { "epoch": 3.56, "learning_rate": 2.1910695742471443e-05, "loss": 0.0565, "step": 3430 }, { "epoch": 3.57, "learning_rate": 2.1391484942886814e-05, "loss": 0.0412, "step": 3440 }, { "epoch": 3.58, "learning_rate": 2.087227414330218e-05, "loss": 0.0391, "step": 3450 }, { "epoch": 3.59, "learning_rate": 2.0353063343717553e-05, "loss": 0.0036, "step": 3460 }, { "epoch": 3.6, "learning_rate": 1.983385254413292e-05, "loss": 0.0045, "step": 3470 }, { "epoch": 3.61, "learning_rate": 1.9314641744548288e-05, "loss": 0.0679, "step": 3480 }, { "epoch": 3.62, "learning_rate": 1.8795430944963656e-05, "loss": 0.0169, "step": 3490 }, { "epoch": 3.63, "learning_rate": 1.8276220145379024e-05, "loss": 0.0408, "step": 3500 }, { "epoch": 3.63, "eval_accuracy": 0.9334949250113619, "eval_loss": 0.2590745985507965, "eval_runtime": 105.6016, "eval_samples_per_second": 62.509, "eval_steps_per_second": 7.822, "step": 3500 }, { "epoch": 3.64, "learning_rate": 1.775700934579439e-05, "loss": 0.0217, "step": 3510 }, { "epoch": 3.66, "learning_rate": 1.7237798546209763e-05, "loss": 0.0186, "step": 3520 }, { "epoch": 3.67, "learning_rate": 1.671858774662513e-05, "loss": 0.006, "step": 3530 }, { "epoch": 3.68, "learning_rate": 1.6199376947040498e-05, "loss": 0.0216, "step": 3540 }, { "epoch": 3.69, "learning_rate": 1.568016614745587e-05, "loss": 0.0108, "step": 3550 }, { "epoch": 3.7, "learning_rate": 1.5160955347871237e-05, "loss": 0.0033, "step": 3560 }, { "epoch": 3.71, "learning_rate": 1.4641744548286603e-05, "loss": 0.0038, "step": 3570 }, { "epoch": 3.72, "learning_rate": 1.4122533748701974e-05, "loss": 0.0039, "step": 3580 }, { "epoch": 3.73, "learning_rate": 1.3603322949117342e-05, "loss": 0.0311, "step": 3590 }, { "epoch": 3.74, "learning_rate": 1.308411214953271e-05, "loss": 0.041, "step": 3600 }, { "epoch": 3.74, "eval_accuracy": 0.9366762611725497, "eval_loss": 0.2612759470939636, "eval_runtime": 105.1174, "eval_samples_per_second": 62.796, "eval_steps_per_second": 7.858, "step": 3600 }, { "epoch": 3.75, "learning_rate": 1.256490134994808e-05, "loss": 0.0035, "step": 3610 }, { "epoch": 3.76, "learning_rate": 1.2045690550363447e-05, "loss": 0.0501, "step": 3620 }, { "epoch": 3.77, "learning_rate": 1.1526479750778816e-05, "loss": 0.0398, "step": 3630 }, { "epoch": 3.78, "learning_rate": 1.1007268951194186e-05, "loss": 0.0042, "step": 3640 }, { "epoch": 3.79, "learning_rate": 1.0488058151609554e-05, "loss": 0.0044, "step": 3650 }, { "epoch": 3.8, "learning_rate": 9.968847352024923e-06, "loss": 0.087, "step": 3660 }, { "epoch": 3.81, "learning_rate": 9.44963655244029e-06, "loss": 0.005, "step": 3670 }, { "epoch": 3.82, "learning_rate": 8.93042575285566e-06, "loss": 0.0404, "step": 3680 }, { "epoch": 3.83, "learning_rate": 8.411214953271028e-06, "loss": 0.0401, "step": 3690 }, { "epoch": 3.84, "learning_rate": 7.892004153686398e-06, "loss": 0.004, "step": 3700 }, { "epoch": 3.84, "eval_accuracy": 0.9369792455688533, "eval_loss": 0.25872641801834106, "eval_runtime": 106.8562, "eval_samples_per_second": 61.775, "eval_steps_per_second": 7.73, "step": 3700 }, { "epoch": 3.85, "learning_rate": 7.372793354101766e-06, "loss": 0.0411, "step": 3710 }, { "epoch": 3.86, "learning_rate": 6.853582554517134e-06, "loss": 0.0689, "step": 3720 }, { "epoch": 3.87, "learning_rate": 6.3343717549325025e-06, "loss": 0.0428, "step": 3730 }, { "epoch": 3.88, "learning_rate": 5.815160955347872e-06, "loss": 0.0464, "step": 3740 }, { "epoch": 3.89, "learning_rate": 5.29595015576324e-06, "loss": 0.024, "step": 3750 }, { "epoch": 3.9, "learning_rate": 4.776739356178609e-06, "loss": 0.0732, "step": 3760 }, { "epoch": 3.91, "learning_rate": 4.257528556593978e-06, "loss": 0.0386, "step": 3770 }, { "epoch": 3.93, "learning_rate": 3.7383177570093455e-06, "loss": 0.0532, "step": 3780 }, { "epoch": 3.94, "learning_rate": 3.2191069574247146e-06, "loss": 0.0413, "step": 3790 }, { "epoch": 3.95, "learning_rate": 2.699896157840083e-06, "loss": 0.0389, "step": 3800 }, { "epoch": 3.95, "eval_accuracy": 0.9372822299651568, "eval_loss": 0.25349465012550354, "eval_runtime": 110.0172, "eval_samples_per_second": 60.0, "eval_steps_per_second": 7.508, "step": 3800 }, { "epoch": 3.96, "learning_rate": 2.1806853582554518e-06, "loss": 0.0383, "step": 3810 }, { "epoch": 3.97, "learning_rate": 1.6614745586708204e-06, "loss": 0.0169, "step": 3820 }, { "epoch": 3.98, "learning_rate": 1.142263759086189e-06, "loss": 0.0072, "step": 3830 }, { "epoch": 3.99, "learning_rate": 6.230529595015577e-07, "loss": 0.0054, "step": 3840 }, { "epoch": 4.0, "learning_rate": 1.0384215991692627e-07, "loss": 0.0044, "step": 3850 }, { "epoch": 4.0, "step": 3852, "total_flos": 4.774126496175784e+18, "train_loss": 0.1544622457234444, "train_runtime": 5879.4453, "train_samples_per_second": 10.479, "train_steps_per_second": 0.655 } ], "logging_steps": 10, "max_steps": 3852, "num_train_epochs": 4, "save_steps": 100, "total_flos": 4.774126496175784e+18, "trial_name": null, "trial_params": null }