{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.3016596160104132, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 4.9875000000000006e-05, "loss": 4.8021, "step": 10 }, { "epoch": 0.01, "learning_rate": 4.975e-05, "loss": 3.4946, "step": 20 }, { "epoch": 0.01, "learning_rate": 4.962500000000001e-05, "loss": 4.2365, "step": 30 }, { "epoch": 0.01, "learning_rate": 4.9500000000000004e-05, "loss": 3.11, "step": 40 }, { "epoch": 0.02, "learning_rate": 4.937500000000001e-05, "loss": 2.2186, "step": 50 }, { "epoch": 0.02, "learning_rate": 4.9250000000000004e-05, "loss": 1.5203, "step": 60 }, { "epoch": 0.02, "learning_rate": 4.9125e-05, "loss": 1.203, "step": 70 }, { "epoch": 0.03, "learning_rate": 4.9e-05, "loss": 0.9516, "step": 80 }, { "epoch": 0.03, "learning_rate": 4.8875e-05, "loss": 0.6212, "step": 90 }, { "epoch": 0.03, "learning_rate": 4.875e-05, "loss": 0.7066, "step": 100 }, { "epoch": 0.04, "learning_rate": 4.8625e-05, "loss": 0.2459, "step": 110 }, { "epoch": 0.04, "learning_rate": 4.85e-05, "loss": 0.4349, "step": 120 }, { "epoch": 0.04, "learning_rate": 4.8375000000000004e-05, "loss": 0.2229, "step": 130 }, { "epoch": 0.05, "learning_rate": 4.825e-05, "loss": 0.6896, "step": 140 }, { "epoch": 0.05, "learning_rate": 4.8125000000000004e-05, "loss": 0.3415, "step": 150 }, { "epoch": 0.05, "learning_rate": 4.8e-05, "loss": 0.2647, "step": 160 }, { "epoch": 0.06, "learning_rate": 4.7875000000000005e-05, "loss": 0.3146, "step": 170 }, { "epoch": 0.06, "learning_rate": 4.775e-05, "loss": 0.259, "step": 180 }, { "epoch": 0.06, "learning_rate": 4.7625000000000006e-05, "loss": 0.3262, "step": 190 }, { "epoch": 0.07, "learning_rate": 4.75e-05, "loss": 0.2983, "step": 200 }, { "epoch": 0.07, "learning_rate": 4.7375e-05, "loss": 0.3234, "step": 210 }, { "epoch": 0.07, "learning_rate": 4.7249999999999997e-05, "loss": 0.3344, "step": 220 }, { "epoch": 0.07, "learning_rate": 4.7125e-05, "loss": 0.3257, "step": 230 }, { "epoch": 0.08, "learning_rate": 4.7e-05, "loss": 0.3293, "step": 240 }, { "epoch": 0.08, "learning_rate": 4.6875e-05, "loss": 0.1742, "step": 250 }, { "epoch": 0.08, "learning_rate": 4.6750000000000005e-05, "loss": 0.4563, "step": 260 }, { "epoch": 0.09, "learning_rate": 4.6625e-05, "loss": 0.2687, "step": 270 }, { "epoch": 0.09, "learning_rate": 4.6500000000000005e-05, "loss": 0.267, "step": 280 }, { "epoch": 0.09, "learning_rate": 4.6375e-05, "loss": 0.4485, "step": 290 }, { "epoch": 0.1, "learning_rate": 4.6250000000000006e-05, "loss": 0.2024, "step": 300 }, { "epoch": 0.1, "learning_rate": 4.6125e-05, "loss": 0.2136, "step": 310 }, { "epoch": 0.1, "learning_rate": 4.600000000000001e-05, "loss": 0.2776, "step": 320 }, { "epoch": 0.11, "learning_rate": 4.5875000000000004e-05, "loss": 0.3112, "step": 330 }, { "epoch": 0.11, "learning_rate": 4.575e-05, "loss": 0.1997, "step": 340 }, { "epoch": 0.11, "learning_rate": 4.5625e-05, "loss": 0.2204, "step": 350 }, { "epoch": 0.12, "learning_rate": 4.55e-05, "loss": 0.2537, "step": 360 }, { "epoch": 0.12, "learning_rate": 4.5375e-05, "loss": 0.2956, "step": 370 }, { "epoch": 0.12, "learning_rate": 4.525e-05, "loss": 0.3163, "step": 380 }, { "epoch": 0.13, "learning_rate": 4.5125e-05, "loss": 0.3704, "step": 390 }, { "epoch": 0.13, "learning_rate": 4.5e-05, "loss": 0.308, "step": 400 }, { "epoch": 0.13, "learning_rate": 4.4875e-05, "loss": 0.2247, "step": 410 }, { "epoch": 0.14, "learning_rate": 4.4750000000000004e-05, "loss": 0.2089, "step": 420 }, { "epoch": 0.14, "learning_rate": 4.4625e-05, "loss": 0.2159, "step": 430 }, { "epoch": 0.14, "learning_rate": 4.4500000000000004e-05, "loss": 0.253, "step": 440 }, { "epoch": 0.15, "learning_rate": 4.4375e-05, "loss": 0.0905, "step": 450 }, { "epoch": 0.15, "learning_rate": 4.4250000000000005e-05, "loss": 0.3106, "step": 460 }, { "epoch": 0.15, "learning_rate": 4.4125e-05, "loss": 0.1635, "step": 470 }, { "epoch": 0.16, "learning_rate": 4.4000000000000006e-05, "loss": 0.2771, "step": 480 }, { "epoch": 0.16, "learning_rate": 4.3875e-05, "loss": 0.193, "step": 490 }, { "epoch": 0.16, "learning_rate": 4.375e-05, "loss": 0.3506, "step": 500 }, { "epoch": 0.16, "eval_bleu-4": 0.5123334214062311, "eval_rouge-1": 74.69764, "eval_rouge-2": 56.37476000000001, "eval_rouge-l": 71.65215, "eval_runtime": 4.9156, "eval_samples_per_second": 10.172, "eval_steps_per_second": 0.814, "step": 500 }, { "epoch": 0.17, "learning_rate": 4.3625e-05, "loss": 0.2192, "step": 510 }, { "epoch": 0.17, "learning_rate": 4.35e-05, "loss": 0.3669, "step": 520 }, { "epoch": 0.17, "learning_rate": 4.3375000000000004e-05, "loss": 0.1304, "step": 530 }, { "epoch": 0.18, "learning_rate": 4.325e-05, "loss": 0.3289, "step": 540 }, { "epoch": 0.18, "learning_rate": 4.3125000000000005e-05, "loss": 0.3846, "step": 550 }, { "epoch": 0.18, "learning_rate": 4.3e-05, "loss": 0.2141, "step": 560 }, { "epoch": 0.19, "learning_rate": 4.2875000000000005e-05, "loss": 0.1857, "step": 570 }, { "epoch": 0.19, "learning_rate": 4.275e-05, "loss": 0.2101, "step": 580 }, { "epoch": 0.19, "learning_rate": 4.2625000000000006e-05, "loss": 0.2363, "step": 590 }, { "epoch": 0.2, "learning_rate": 4.25e-05, "loss": 0.2324, "step": 600 }, { "epoch": 0.2, "learning_rate": 4.237500000000001e-05, "loss": 0.1082, "step": 610 }, { "epoch": 0.2, "learning_rate": 4.2250000000000004e-05, "loss": 0.2095, "step": 620 }, { "epoch": 0.21, "learning_rate": 4.2125e-05, "loss": 0.3072, "step": 630 }, { "epoch": 0.21, "learning_rate": 4.2e-05, "loss": 0.2708, "step": 640 }, { "epoch": 0.21, "learning_rate": 4.1875e-05, "loss": 0.1592, "step": 650 }, { "epoch": 0.21, "learning_rate": 4.175e-05, "loss": 0.3118, "step": 660 }, { "epoch": 0.22, "learning_rate": 4.1625e-05, "loss": 0.5071, "step": 670 }, { "epoch": 0.22, "learning_rate": 4.15e-05, "loss": 0.3046, "step": 680 }, { "epoch": 0.22, "learning_rate": 4.1375e-05, "loss": 0.0869, "step": 690 }, { "epoch": 0.23, "learning_rate": 4.125e-05, "loss": 0.1031, "step": 700 }, { "epoch": 0.23, "learning_rate": 4.1125000000000004e-05, "loss": 0.4183, "step": 710 }, { "epoch": 0.23, "learning_rate": 4.1e-05, "loss": 0.1242, "step": 720 }, { "epoch": 0.24, "learning_rate": 4.0875000000000004e-05, "loss": 0.2465, "step": 730 }, { "epoch": 0.24, "learning_rate": 4.075e-05, "loss": 0.1623, "step": 740 }, { "epoch": 0.24, "learning_rate": 4.0625000000000005e-05, "loss": 0.2403, "step": 750 }, { "epoch": 0.25, "learning_rate": 4.05e-05, "loss": 0.3134, "step": 760 }, { "epoch": 0.25, "learning_rate": 4.0375e-05, "loss": 0.2647, "step": 770 }, { "epoch": 0.25, "learning_rate": 4.025e-05, "loss": 0.2368, "step": 780 }, { "epoch": 0.26, "learning_rate": 4.0125e-05, "loss": 0.2985, "step": 790 }, { "epoch": 0.26, "learning_rate": 4e-05, "loss": 0.2929, "step": 800 }, { "epoch": 0.26, "learning_rate": 3.9875e-05, "loss": 0.2982, "step": 810 }, { "epoch": 0.27, "learning_rate": 3.9750000000000004e-05, "loss": 0.2775, "step": 820 }, { "epoch": 0.27, "learning_rate": 3.9625e-05, "loss": 0.3314, "step": 830 }, { "epoch": 0.27, "learning_rate": 3.9500000000000005e-05, "loss": 0.1401, "step": 840 }, { "epoch": 0.28, "learning_rate": 3.9375e-05, "loss": 0.2998, "step": 850 }, { "epoch": 0.28, "learning_rate": 3.9250000000000005e-05, "loss": 0.2041, "step": 860 }, { "epoch": 0.28, "learning_rate": 3.9125e-05, "loss": 0.3031, "step": 870 }, { "epoch": 0.29, "learning_rate": 3.9000000000000006e-05, "loss": 0.3725, "step": 880 }, { "epoch": 0.29, "learning_rate": 3.8875e-05, "loss": 0.1381, "step": 890 }, { "epoch": 0.29, "learning_rate": 3.875e-05, "loss": 0.166, "step": 900 }, { "epoch": 0.3, "learning_rate": 3.8625e-05, "loss": 0.242, "step": 910 }, { "epoch": 0.3, "learning_rate": 3.85e-05, "loss": 0.2182, "step": 920 }, { "epoch": 0.3, "learning_rate": 3.8375e-05, "loss": 0.2386, "step": 930 }, { "epoch": 0.31, "learning_rate": 3.825e-05, "loss": 0.0829, "step": 940 }, { "epoch": 0.31, "learning_rate": 3.8125e-05, "loss": 0.1273, "step": 950 }, { "epoch": 0.31, "learning_rate": 3.8e-05, "loss": 0.2222, "step": 960 }, { "epoch": 0.32, "learning_rate": 3.7875e-05, "loss": 0.2664, "step": 970 }, { "epoch": 0.32, "learning_rate": 3.775e-05, "loss": 0.1874, "step": 980 }, { "epoch": 0.32, "learning_rate": 3.7625e-05, "loss": 0.1742, "step": 990 }, { "epoch": 0.33, "learning_rate": 3.7500000000000003e-05, "loss": 0.222, "step": 1000 }, { "epoch": 0.33, "eval_bleu-4": 0.5308189442308736, "eval_rouge-1": 73.44375, "eval_rouge-2": 54.7447, "eval_rouge-l": 71.83562, "eval_runtime": 4.1772, "eval_samples_per_second": 11.97, "eval_steps_per_second": 0.958, "step": 1000 }, { "epoch": 0.33, "learning_rate": 3.737500000000001e-05, "loss": 0.2489, "step": 1010 }, { "epoch": 0.33, "learning_rate": 3.7250000000000004e-05, "loss": 0.2511, "step": 1020 }, { "epoch": 0.34, "learning_rate": 3.7125e-05, "loss": 0.2092, "step": 1030 }, { "epoch": 0.34, "learning_rate": 3.7e-05, "loss": 0.1308, "step": 1040 }, { "epoch": 0.34, "learning_rate": 3.6875e-05, "loss": 0.2085, "step": 1050 }, { "epoch": 0.34, "learning_rate": 3.675e-05, "loss": 0.226, "step": 1060 }, { "epoch": 0.35, "learning_rate": 3.6625e-05, "loss": 0.252, "step": 1070 }, { "epoch": 0.35, "learning_rate": 3.65e-05, "loss": 0.122, "step": 1080 }, { "epoch": 0.35, "learning_rate": 3.6375e-05, "loss": 0.1906, "step": 1090 }, { "epoch": 0.36, "learning_rate": 3.625e-05, "loss": 0.1968, "step": 1100 }, { "epoch": 0.36, "learning_rate": 3.6125000000000004e-05, "loss": 0.1916, "step": 1110 }, { "epoch": 0.36, "learning_rate": 3.6e-05, "loss": 0.4047, "step": 1120 }, { "epoch": 0.37, "learning_rate": 3.5875000000000005e-05, "loss": 0.1004, "step": 1130 }, { "epoch": 0.37, "learning_rate": 3.575e-05, "loss": 0.1582, "step": 1140 }, { "epoch": 0.37, "learning_rate": 3.5625000000000005e-05, "loss": 0.1458, "step": 1150 }, { "epoch": 0.38, "learning_rate": 3.55e-05, "loss": 0.2366, "step": 1160 }, { "epoch": 0.38, "learning_rate": 3.5375e-05, "loss": 0.3445, "step": 1170 }, { "epoch": 0.38, "learning_rate": 3.525e-05, "loss": 0.1609, "step": 1180 }, { "epoch": 0.39, "learning_rate": 3.5125e-05, "loss": 0.2032, "step": 1190 }, { "epoch": 0.39, "learning_rate": 3.5e-05, "loss": 0.2324, "step": 1200 }, { "epoch": 0.39, "learning_rate": 3.4875e-05, "loss": 0.2468, "step": 1210 }, { "epoch": 0.4, "learning_rate": 3.475e-05, "loss": 0.304, "step": 1220 }, { "epoch": 0.4, "learning_rate": 3.4625e-05, "loss": 0.1962, "step": 1230 }, { "epoch": 0.4, "learning_rate": 3.45e-05, "loss": 0.2164, "step": 1240 }, { "epoch": 0.41, "learning_rate": 3.4375e-05, "loss": 0.3032, "step": 1250 }, { "epoch": 0.41, "learning_rate": 3.4250000000000006e-05, "loss": 0.1629, "step": 1260 }, { "epoch": 0.41, "learning_rate": 3.4125e-05, "loss": 0.1357, "step": 1270 }, { "epoch": 0.42, "learning_rate": 3.4000000000000007e-05, "loss": 0.1634, "step": 1280 }, { "epoch": 0.42, "learning_rate": 3.3875000000000003e-05, "loss": 0.1235, "step": 1290 }, { "epoch": 0.42, "learning_rate": 3.375000000000001e-05, "loss": 0.2326, "step": 1300 }, { "epoch": 0.43, "learning_rate": 3.3625000000000004e-05, "loss": 0.1841, "step": 1310 }, { "epoch": 0.43, "learning_rate": 3.35e-05, "loss": 0.1866, "step": 1320 }, { "epoch": 0.43, "learning_rate": 3.3375e-05, "loss": 0.3667, "step": 1330 }, { "epoch": 0.44, "learning_rate": 3.325e-05, "loss": 0.2589, "step": 1340 }, { "epoch": 0.44, "learning_rate": 3.3125e-05, "loss": 0.0811, "step": 1350 }, { "epoch": 0.44, "learning_rate": 3.3e-05, "loss": 0.1739, "step": 1360 }, { "epoch": 0.45, "learning_rate": 3.2875e-05, "loss": 0.2833, "step": 1370 }, { "epoch": 0.45, "learning_rate": 3.275e-05, "loss": 0.1666, "step": 1380 }, { "epoch": 0.45, "learning_rate": 3.2625e-05, "loss": 0.272, "step": 1390 }, { "epoch": 0.46, "learning_rate": 3.2500000000000004e-05, "loss": 0.2286, "step": 1400 }, { "epoch": 0.46, "learning_rate": 3.2375e-05, "loss": 0.1464, "step": 1410 }, { "epoch": 0.46, "learning_rate": 3.2250000000000005e-05, "loss": 0.2486, "step": 1420 }, { "epoch": 0.47, "learning_rate": 3.2125e-05, "loss": 0.0797, "step": 1430 }, { "epoch": 0.47, "learning_rate": 3.2000000000000005e-05, "loss": 0.2126, "step": 1440 }, { "epoch": 0.47, "learning_rate": 3.1875e-05, "loss": 0.2081, "step": 1450 }, { "epoch": 0.48, "learning_rate": 3.175e-05, "loss": 0.2769, "step": 1460 }, { "epoch": 0.48, "learning_rate": 3.1624999999999996e-05, "loss": 0.1976, "step": 1470 }, { "epoch": 0.48, "learning_rate": 3.15e-05, "loss": 0.1582, "step": 1480 }, { "epoch": 0.48, "learning_rate": 3.1375e-05, "loss": 0.1035, "step": 1490 }, { "epoch": 0.49, "learning_rate": 3.125e-05, "loss": 0.1899, "step": 1500 }, { "epoch": 0.49, "eval_bleu-4": 0.5282664478029883, "eval_rouge-1": 73.29392000000001, "eval_rouge-2": 55.69114, "eval_rouge-l": 71.72303, "eval_runtime": 4.6282, "eval_samples_per_second": 10.803, "eval_steps_per_second": 0.864, "step": 1500 }, { "epoch": 0.49, "learning_rate": 3.1125000000000004e-05, "loss": 0.167, "step": 1510 }, { "epoch": 0.49, "learning_rate": 3.1e-05, "loss": 0.1767, "step": 1520 }, { "epoch": 0.5, "learning_rate": 3.0875000000000005e-05, "loss": 0.2311, "step": 1530 }, { "epoch": 0.5, "learning_rate": 3.075e-05, "loss": 0.2156, "step": 1540 }, { "epoch": 0.5, "learning_rate": 3.0625000000000006e-05, "loss": 0.1911, "step": 1550 }, { "epoch": 0.51, "learning_rate": 3.05e-05, "loss": 0.1312, "step": 1560 }, { "epoch": 0.51, "learning_rate": 3.0375000000000003e-05, "loss": 0.1461, "step": 1570 }, { "epoch": 0.51, "learning_rate": 3.025e-05, "loss": 0.2821, "step": 1580 }, { "epoch": 0.52, "learning_rate": 3.0125000000000004e-05, "loss": 0.1689, "step": 1590 }, { "epoch": 0.52, "learning_rate": 3e-05, "loss": 0.1499, "step": 1600 }, { "epoch": 0.52, "learning_rate": 2.9875000000000004e-05, "loss": 0.2317, "step": 1610 }, { "epoch": 0.53, "learning_rate": 2.975e-05, "loss": 0.1639, "step": 1620 }, { "epoch": 0.53, "learning_rate": 2.9625000000000002e-05, "loss": 0.275, "step": 1630 }, { "epoch": 0.53, "learning_rate": 2.95e-05, "loss": 0.2628, "step": 1640 }, { "epoch": 0.54, "learning_rate": 2.9375000000000003e-05, "loss": 0.1617, "step": 1650 }, { "epoch": 0.54, "learning_rate": 2.925e-05, "loss": 0.303, "step": 1660 }, { "epoch": 0.54, "learning_rate": 2.9125000000000003e-05, "loss": 0.0989, "step": 1670 }, { "epoch": 0.55, "learning_rate": 2.9e-05, "loss": 0.0732, "step": 1680 }, { "epoch": 0.55, "learning_rate": 2.8875e-05, "loss": 0.2459, "step": 1690 }, { "epoch": 0.55, "learning_rate": 2.8749999999999997e-05, "loss": 0.2137, "step": 1700 }, { "epoch": 0.56, "learning_rate": 2.8625e-05, "loss": 0.2293, "step": 1710 }, { "epoch": 0.56, "learning_rate": 2.8499999999999998e-05, "loss": 0.1635, "step": 1720 }, { "epoch": 0.56, "learning_rate": 2.8375000000000002e-05, "loss": 0.1743, "step": 1730 }, { "epoch": 0.57, "learning_rate": 2.825e-05, "loss": 0.1878, "step": 1740 }, { "epoch": 0.57, "learning_rate": 2.8125000000000003e-05, "loss": 0.2032, "step": 1750 }, { "epoch": 0.57, "learning_rate": 2.8000000000000003e-05, "loss": 0.2278, "step": 1760 }, { "epoch": 0.58, "learning_rate": 2.7875e-05, "loss": 0.2181, "step": 1770 }, { "epoch": 0.58, "learning_rate": 2.7750000000000004e-05, "loss": 0.17, "step": 1780 }, { "epoch": 0.58, "learning_rate": 2.7625e-05, "loss": 0.2624, "step": 1790 }, { "epoch": 0.59, "learning_rate": 2.7500000000000004e-05, "loss": 0.2703, "step": 1800 }, { "epoch": 0.59, "learning_rate": 2.7375e-05, "loss": 0.27, "step": 1810 }, { "epoch": 0.59, "learning_rate": 2.725e-05, "loss": 0.1659, "step": 1820 }, { "epoch": 0.6, "learning_rate": 2.7125000000000002e-05, "loss": 0.1295, "step": 1830 }, { "epoch": 0.6, "learning_rate": 2.7000000000000002e-05, "loss": 0.3632, "step": 1840 }, { "epoch": 0.6, "learning_rate": 2.6875e-05, "loss": 0.1305, "step": 1850 }, { "epoch": 0.61, "learning_rate": 2.6750000000000003e-05, "loss": 0.287, "step": 1860 }, { "epoch": 0.61, "learning_rate": 2.6625e-05, "loss": 0.1507, "step": 1870 }, { "epoch": 0.61, "learning_rate": 2.6500000000000004e-05, "loss": 0.0894, "step": 1880 }, { "epoch": 0.62, "learning_rate": 2.6375e-05, "loss": 0.1896, "step": 1890 }, { "epoch": 0.62, "learning_rate": 2.625e-05, "loss": 0.1462, "step": 1900 }, { "epoch": 0.62, "learning_rate": 2.6124999999999998e-05, "loss": 0.1062, "step": 1910 }, { "epoch": 0.62, "learning_rate": 2.6000000000000002e-05, "loss": 0.1156, "step": 1920 }, { "epoch": 0.63, "learning_rate": 2.5875e-05, "loss": 0.1539, "step": 1930 }, { "epoch": 0.63, "learning_rate": 2.5750000000000002e-05, "loss": 0.0697, "step": 1940 }, { "epoch": 0.63, "learning_rate": 2.5625e-05, "loss": 0.1109, "step": 1950 }, { "epoch": 0.64, "learning_rate": 2.5500000000000003e-05, "loss": 0.1212, "step": 1960 }, { "epoch": 0.64, "learning_rate": 2.5375e-05, "loss": 0.1903, "step": 1970 }, { "epoch": 0.64, "learning_rate": 2.525e-05, "loss": 0.1561, "step": 1980 }, { "epoch": 0.65, "learning_rate": 2.5124999999999997e-05, "loss": 0.3047, "step": 1990 }, { "epoch": 0.65, "learning_rate": 2.5e-05, "loss": 0.3121, "step": 2000 }, { "epoch": 0.65, "eval_bleu-4": 0.5122339544955379, "eval_rouge-1": 72.57773, "eval_rouge-2": 52.838330000000006, "eval_rouge-l": 70.44752, "eval_runtime": 4.1957, "eval_samples_per_second": 11.917, "eval_steps_per_second": 0.953, "step": 2000 }, { "epoch": 0.65, "learning_rate": 2.4875e-05, "loss": 0.1308, "step": 2010 }, { "epoch": 0.66, "learning_rate": 2.4750000000000002e-05, "loss": 0.241, "step": 2020 }, { "epoch": 0.66, "learning_rate": 2.4625000000000002e-05, "loss": 0.2604, "step": 2030 }, { "epoch": 0.66, "learning_rate": 2.45e-05, "loss": 0.178, "step": 2040 }, { "epoch": 0.67, "learning_rate": 2.4375e-05, "loss": 0.2828, "step": 2050 }, { "epoch": 0.67, "learning_rate": 2.425e-05, "loss": 0.3039, "step": 2060 }, { "epoch": 0.67, "learning_rate": 2.4125e-05, "loss": 0.1629, "step": 2070 }, { "epoch": 0.68, "learning_rate": 2.4e-05, "loss": 0.2903, "step": 2080 }, { "epoch": 0.68, "learning_rate": 2.3875e-05, "loss": 0.2469, "step": 2090 }, { "epoch": 0.68, "learning_rate": 2.375e-05, "loss": 0.1666, "step": 2100 }, { "epoch": 0.69, "learning_rate": 2.3624999999999998e-05, "loss": 0.1642, "step": 2110 }, { "epoch": 0.69, "learning_rate": 2.35e-05, "loss": 0.1691, "step": 2120 }, { "epoch": 0.69, "learning_rate": 2.3375000000000002e-05, "loss": 0.1123, "step": 2130 }, { "epoch": 0.7, "learning_rate": 2.3250000000000003e-05, "loss": 0.2387, "step": 2140 }, { "epoch": 0.7, "learning_rate": 2.3125000000000003e-05, "loss": 0.1778, "step": 2150 }, { "epoch": 0.7, "learning_rate": 2.3000000000000003e-05, "loss": 0.1903, "step": 2160 }, { "epoch": 0.71, "learning_rate": 2.2875e-05, "loss": 0.161, "step": 2170 }, { "epoch": 0.71, "learning_rate": 2.275e-05, "loss": 0.2142, "step": 2180 }, { "epoch": 0.71, "learning_rate": 2.2625e-05, "loss": 0.1828, "step": 2190 }, { "epoch": 0.72, "learning_rate": 2.25e-05, "loss": 0.1311, "step": 2200 }, { "epoch": 0.72, "learning_rate": 2.2375000000000002e-05, "loss": 0.361, "step": 2210 }, { "epoch": 0.72, "learning_rate": 2.2250000000000002e-05, "loss": 0.1018, "step": 2220 }, { "epoch": 0.73, "learning_rate": 2.2125000000000002e-05, "loss": 0.2022, "step": 2230 }, { "epoch": 0.73, "learning_rate": 2.2000000000000003e-05, "loss": 0.2202, "step": 2240 }, { "epoch": 0.73, "learning_rate": 2.1875e-05, "loss": 0.1076, "step": 2250 }, { "epoch": 0.74, "learning_rate": 2.175e-05, "loss": 0.1287, "step": 2260 }, { "epoch": 0.74, "learning_rate": 2.1625e-05, "loss": 0.126, "step": 2270 }, { "epoch": 0.74, "learning_rate": 2.15e-05, "loss": 0.2802, "step": 2280 }, { "epoch": 0.75, "learning_rate": 2.1375e-05, "loss": 0.2538, "step": 2290 }, { "epoch": 0.75, "learning_rate": 2.125e-05, "loss": 0.1371, "step": 2300 }, { "epoch": 0.75, "learning_rate": 2.1125000000000002e-05, "loss": 0.1423, "step": 2310 }, { "epoch": 0.75, "learning_rate": 2.1e-05, "loss": 0.1213, "step": 2320 }, { "epoch": 0.76, "learning_rate": 2.0875e-05, "loss": 0.0962, "step": 2330 }, { "epoch": 0.76, "learning_rate": 2.075e-05, "loss": 0.1903, "step": 2340 }, { "epoch": 0.76, "learning_rate": 2.0625e-05, "loss": 0.25, "step": 2350 }, { "epoch": 0.77, "learning_rate": 2.05e-05, "loss": 0.1043, "step": 2360 }, { "epoch": 0.77, "learning_rate": 2.0375e-05, "loss": 0.2189, "step": 2370 }, { "epoch": 0.77, "learning_rate": 2.025e-05, "loss": 0.2064, "step": 2380 }, { "epoch": 0.78, "learning_rate": 2.0125e-05, "loss": 0.2896, "step": 2390 }, { "epoch": 0.78, "learning_rate": 2e-05, "loss": 0.1534, "step": 2400 }, { "epoch": 0.78, "learning_rate": 1.9875000000000002e-05, "loss": 0.152, "step": 2410 }, { "epoch": 0.79, "learning_rate": 1.9750000000000002e-05, "loss": 0.1111, "step": 2420 }, { "epoch": 0.79, "learning_rate": 1.9625000000000003e-05, "loss": 0.2323, "step": 2430 }, { "epoch": 0.79, "learning_rate": 1.9500000000000003e-05, "loss": 0.2582, "step": 2440 }, { "epoch": 0.8, "learning_rate": 1.9375e-05, "loss": 0.2148, "step": 2450 }, { "epoch": 0.8, "learning_rate": 1.925e-05, "loss": 0.2708, "step": 2460 }, { "epoch": 0.8, "learning_rate": 1.9125e-05, "loss": 0.1505, "step": 2470 }, { "epoch": 0.81, "learning_rate": 1.9e-05, "loss": 0.1196, "step": 2480 }, { "epoch": 0.81, "learning_rate": 1.8875e-05, "loss": 0.2677, "step": 2490 }, { "epoch": 0.81, "learning_rate": 1.8750000000000002e-05, "loss": 0.1215, "step": 2500 }, { "epoch": 0.81, "eval_bleu-4": 0.5122156520784964, "eval_rouge-1": 74.73468000000001, "eval_rouge-2": 53.852579999999996, "eval_rouge-l": 72.49319, "eval_runtime": 4.2013, "eval_samples_per_second": 11.901, "eval_steps_per_second": 0.952, "step": 2500 }, { "epoch": 0.82, "learning_rate": 1.8625000000000002e-05, "loss": 0.1761, "step": 2510 }, { "epoch": 0.82, "learning_rate": 1.85e-05, "loss": 0.1645, "step": 2520 }, { "epoch": 0.82, "learning_rate": 1.8375e-05, "loss": 0.1746, "step": 2530 }, { "epoch": 0.83, "learning_rate": 1.825e-05, "loss": 0.2182, "step": 2540 }, { "epoch": 0.83, "learning_rate": 1.8125e-05, "loss": 0.2265, "step": 2550 }, { "epoch": 0.83, "learning_rate": 1.8e-05, "loss": 0.2604, "step": 2560 }, { "epoch": 0.84, "learning_rate": 1.7875e-05, "loss": 0.2326, "step": 2570 }, { "epoch": 0.84, "learning_rate": 1.775e-05, "loss": 0.1621, "step": 2580 }, { "epoch": 0.84, "learning_rate": 1.7625e-05, "loss": 0.1838, "step": 2590 }, { "epoch": 0.85, "learning_rate": 1.75e-05, "loss": 0.1687, "step": 2600 }, { "epoch": 0.85, "learning_rate": 1.7375e-05, "loss": 0.0636, "step": 2610 }, { "epoch": 0.85, "learning_rate": 1.725e-05, "loss": 0.0831, "step": 2620 }, { "epoch": 0.86, "learning_rate": 1.7125000000000003e-05, "loss": 0.2057, "step": 2630 }, { "epoch": 0.86, "learning_rate": 1.7000000000000003e-05, "loss": 0.1846, "step": 2640 }, { "epoch": 0.86, "learning_rate": 1.6875000000000004e-05, "loss": 0.1197, "step": 2650 }, { "epoch": 0.87, "learning_rate": 1.675e-05, "loss": 0.245, "step": 2660 }, { "epoch": 0.87, "learning_rate": 1.6625e-05, "loss": 0.1317, "step": 2670 }, { "epoch": 0.87, "learning_rate": 1.65e-05, "loss": 0.2095, "step": 2680 }, { "epoch": 0.88, "learning_rate": 1.6375e-05, "loss": 0.11, "step": 2690 }, { "epoch": 0.88, "learning_rate": 1.6250000000000002e-05, "loss": 0.1467, "step": 2700 }, { "epoch": 0.88, "learning_rate": 1.6125000000000002e-05, "loss": 0.3093, "step": 2710 }, { "epoch": 0.89, "learning_rate": 1.6000000000000003e-05, "loss": 0.1817, "step": 2720 }, { "epoch": 0.89, "learning_rate": 1.5875e-05, "loss": 0.2466, "step": 2730 }, { "epoch": 0.89, "learning_rate": 1.575e-05, "loss": 0.1661, "step": 2740 }, { "epoch": 0.89, "learning_rate": 1.5625e-05, "loss": 0.2242, "step": 2750 }, { "epoch": 0.9, "learning_rate": 1.55e-05, "loss": 0.2207, "step": 2760 }, { "epoch": 0.9, "learning_rate": 1.5375e-05, "loss": 0.1961, "step": 2770 }, { "epoch": 0.9, "learning_rate": 1.525e-05, "loss": 0.2673, "step": 2780 }, { "epoch": 0.91, "learning_rate": 1.5125e-05, "loss": 0.1608, "step": 2790 }, { "epoch": 0.91, "learning_rate": 1.5e-05, "loss": 0.0949, "step": 2800 }, { "epoch": 0.91, "learning_rate": 1.4875e-05, "loss": 0.1448, "step": 2810 }, { "epoch": 0.92, "learning_rate": 1.475e-05, "loss": 0.2233, "step": 2820 }, { "epoch": 0.92, "learning_rate": 1.4625e-05, "loss": 0.2646, "step": 2830 }, { "epoch": 0.92, "learning_rate": 1.45e-05, "loss": 0.1473, "step": 2840 }, { "epoch": 0.93, "learning_rate": 1.4374999999999999e-05, "loss": 0.1528, "step": 2850 }, { "epoch": 0.93, "learning_rate": 1.4249999999999999e-05, "loss": 0.3152, "step": 2860 }, { "epoch": 0.93, "learning_rate": 1.4125e-05, "loss": 0.138, "step": 2870 }, { "epoch": 0.94, "learning_rate": 1.4000000000000001e-05, "loss": 0.0929, "step": 2880 }, { "epoch": 0.94, "learning_rate": 1.3875000000000002e-05, "loss": 0.1869, "step": 2890 }, { "epoch": 0.94, "learning_rate": 1.3750000000000002e-05, "loss": 0.2122, "step": 2900 }, { "epoch": 0.95, "learning_rate": 1.3625e-05, "loss": 0.2408, "step": 2910 }, { "epoch": 0.95, "learning_rate": 1.3500000000000001e-05, "loss": 0.3734, "step": 2920 }, { "epoch": 0.95, "learning_rate": 1.3375000000000002e-05, "loss": 0.3088, "step": 2930 }, { "epoch": 0.96, "learning_rate": 1.3250000000000002e-05, "loss": 0.1843, "step": 2940 }, { "epoch": 0.96, "learning_rate": 1.3125e-05, "loss": 0.2538, "step": 2950 }, { "epoch": 0.96, "learning_rate": 1.3000000000000001e-05, "loss": 0.1594, "step": 2960 }, { "epoch": 0.97, "learning_rate": 1.2875000000000001e-05, "loss": 0.1556, "step": 2970 }, { "epoch": 0.97, "learning_rate": 1.2750000000000002e-05, "loss": 0.1398, "step": 2980 }, { "epoch": 0.97, "learning_rate": 1.2625e-05, "loss": 0.2144, "step": 2990 }, { "epoch": 0.98, "learning_rate": 1.25e-05, "loss": 0.1546, "step": 3000 }, { "epoch": 0.98, "eval_bleu-4": 0.5104840369798801, "eval_rouge-1": 73.24027000000001, "eval_rouge-2": 54.12907, "eval_rouge-l": 71.3942, "eval_runtime": 4.0442, "eval_samples_per_second": 12.363, "eval_steps_per_second": 0.989, "step": 3000 }, { "epoch": 0.98, "learning_rate": 1.2375000000000001e-05, "loss": 0.2081, "step": 3010 }, { "epoch": 0.98, "learning_rate": 1.225e-05, "loss": 0.2799, "step": 3020 }, { "epoch": 0.99, "learning_rate": 1.2125e-05, "loss": 0.1146, "step": 3030 }, { "epoch": 0.99, "learning_rate": 1.2e-05, "loss": 0.1702, "step": 3040 }, { "epoch": 0.99, "learning_rate": 1.1875e-05, "loss": 0.1742, "step": 3050 }, { "epoch": 1.0, "learning_rate": 1.175e-05, "loss": 0.1785, "step": 3060 }, { "epoch": 1.0, "learning_rate": 1.1625000000000001e-05, "loss": 0.1969, "step": 3070 }, { "epoch": 1.0, "learning_rate": 1.1500000000000002e-05, "loss": 0.1249, "step": 3080 }, { "epoch": 1.01, "learning_rate": 1.1375e-05, "loss": 0.0861, "step": 3090 }, { "epoch": 1.01, "learning_rate": 1.125e-05, "loss": 0.2002, "step": 3100 }, { "epoch": 1.01, "learning_rate": 1.1125000000000001e-05, "loss": 0.1078, "step": 3110 }, { "epoch": 1.02, "learning_rate": 1.1000000000000001e-05, "loss": 0.1899, "step": 3120 }, { "epoch": 1.02, "learning_rate": 1.0875e-05, "loss": 0.1088, "step": 3130 }, { "epoch": 1.02, "learning_rate": 1.075e-05, "loss": 0.1926, "step": 3140 }, { "epoch": 1.03, "learning_rate": 1.0625e-05, "loss": 0.1961, "step": 3150 }, { "epoch": 1.03, "learning_rate": 1.05e-05, "loss": 0.2562, "step": 3160 }, { "epoch": 1.03, "learning_rate": 1.0375e-05, "loss": 0.0961, "step": 3170 }, { "epoch": 1.03, "learning_rate": 1.025e-05, "loss": 0.1666, "step": 3180 }, { "epoch": 1.04, "learning_rate": 1.0125e-05, "loss": 0.2314, "step": 3190 }, { "epoch": 1.04, "learning_rate": 1e-05, "loss": 0.2981, "step": 3200 }, { "epoch": 1.04, "learning_rate": 9.875000000000001e-06, "loss": 0.1123, "step": 3210 }, { "epoch": 1.05, "learning_rate": 9.750000000000002e-06, "loss": 0.1554, "step": 3220 }, { "epoch": 1.05, "learning_rate": 9.625e-06, "loss": 0.2253, "step": 3230 }, { "epoch": 1.05, "learning_rate": 9.5e-06, "loss": 0.1191, "step": 3240 }, { "epoch": 1.06, "learning_rate": 9.375000000000001e-06, "loss": 0.1305, "step": 3250 }, { "epoch": 1.06, "learning_rate": 9.25e-06, "loss": 0.1272, "step": 3260 }, { "epoch": 1.06, "learning_rate": 9.125e-06, "loss": 0.1163, "step": 3270 }, { "epoch": 1.07, "learning_rate": 9e-06, "loss": 0.1584, "step": 3280 }, { "epoch": 1.07, "learning_rate": 8.875e-06, "loss": 0.0484, "step": 3290 }, { "epoch": 1.07, "learning_rate": 8.75e-06, "loss": 0.192, "step": 3300 }, { "epoch": 1.08, "learning_rate": 8.625e-06, "loss": 0.1396, "step": 3310 }, { "epoch": 1.08, "learning_rate": 8.500000000000002e-06, "loss": 0.2213, "step": 3320 }, { "epoch": 1.08, "learning_rate": 8.375e-06, "loss": 0.1782, "step": 3330 }, { "epoch": 1.09, "learning_rate": 8.25e-06, "loss": 0.2035, "step": 3340 }, { "epoch": 1.09, "learning_rate": 8.125000000000001e-06, "loss": 0.281, "step": 3350 }, { "epoch": 1.09, "learning_rate": 8.000000000000001e-06, "loss": 0.0522, "step": 3360 }, { "epoch": 1.1, "learning_rate": 7.875e-06, "loss": 0.1353, "step": 3370 }, { "epoch": 1.1, "learning_rate": 7.75e-06, "loss": 0.081, "step": 3380 }, { "epoch": 1.1, "learning_rate": 7.625e-06, "loss": 0.1356, "step": 3390 }, { "epoch": 1.11, "learning_rate": 7.5e-06, "loss": 0.1515, "step": 3400 }, { "epoch": 1.11, "learning_rate": 7.375e-06, "loss": 0.1786, "step": 3410 }, { "epoch": 1.11, "learning_rate": 7.25e-06, "loss": 0.0977, "step": 3420 }, { "epoch": 1.12, "learning_rate": 7.1249999999999995e-06, "loss": 0.3118, "step": 3430 }, { "epoch": 1.12, "learning_rate": 7.000000000000001e-06, "loss": 0.2391, "step": 3440 }, { "epoch": 1.12, "learning_rate": 6.875000000000001e-06, "loss": 0.2432, "step": 3450 }, { "epoch": 1.13, "learning_rate": 6.750000000000001e-06, "loss": 0.2216, "step": 3460 }, { "epoch": 1.13, "learning_rate": 6.625000000000001e-06, "loss": 0.1875, "step": 3470 }, { "epoch": 1.13, "learning_rate": 6.5000000000000004e-06, "loss": 0.1712, "step": 3480 }, { "epoch": 1.14, "learning_rate": 6.375000000000001e-06, "loss": 0.1565, "step": 3490 }, { "epoch": 1.14, "learning_rate": 6.25e-06, "loss": 0.2192, "step": 3500 }, { "epoch": 1.14, "eval_bleu-4": 0.4973012355868603, "eval_rouge-1": 73.60392, "eval_rouge-2": 51.70972, "eval_rouge-l": 70.33331, "eval_runtime": 4.1995, "eval_samples_per_second": 11.906, "eval_steps_per_second": 0.952, "step": 3500 }, { "epoch": 1.14, "learning_rate": 6.125e-06, "loss": 0.0656, "step": 3510 }, { "epoch": 1.15, "learning_rate": 6e-06, "loss": 0.1279, "step": 3520 }, { "epoch": 1.15, "learning_rate": 5.875e-06, "loss": 0.2456, "step": 3530 }, { "epoch": 1.15, "learning_rate": 5.750000000000001e-06, "loss": 0.1141, "step": 3540 }, { "epoch": 1.16, "learning_rate": 5.625e-06, "loss": 0.1019, "step": 3550 }, { "epoch": 1.16, "learning_rate": 5.500000000000001e-06, "loss": 0.1476, "step": 3560 }, { "epoch": 1.16, "learning_rate": 5.375e-06, "loss": 0.1546, "step": 3570 }, { "epoch": 1.16, "learning_rate": 5.25e-06, "loss": 0.0519, "step": 3580 }, { "epoch": 1.17, "learning_rate": 5.125e-06, "loss": 0.0889, "step": 3590 }, { "epoch": 1.17, "learning_rate": 5e-06, "loss": 0.1611, "step": 3600 }, { "epoch": 1.17, "learning_rate": 4.875000000000001e-06, "loss": 0.1526, "step": 3610 }, { "epoch": 1.18, "learning_rate": 4.75e-06, "loss": 0.1407, "step": 3620 }, { "epoch": 1.18, "learning_rate": 4.625e-06, "loss": 0.1389, "step": 3630 }, { "epoch": 1.18, "learning_rate": 4.5e-06, "loss": 0.1123, "step": 3640 }, { "epoch": 1.19, "learning_rate": 4.375e-06, "loss": 0.1315, "step": 3650 }, { "epoch": 1.19, "learning_rate": 4.250000000000001e-06, "loss": 0.1828, "step": 3660 }, { "epoch": 1.19, "learning_rate": 4.125e-06, "loss": 0.0759, "step": 3670 }, { "epoch": 1.2, "learning_rate": 4.000000000000001e-06, "loss": 0.1373, "step": 3680 }, { "epoch": 1.2, "learning_rate": 3.875e-06, "loss": 0.0871, "step": 3690 }, { "epoch": 1.2, "learning_rate": 3.75e-06, "loss": 0.1816, "step": 3700 }, { "epoch": 1.21, "learning_rate": 3.625e-06, "loss": 0.1257, "step": 3710 }, { "epoch": 1.21, "learning_rate": 3.5000000000000004e-06, "loss": 0.152, "step": 3720 }, { "epoch": 1.21, "learning_rate": 3.3750000000000003e-06, "loss": 0.2231, "step": 3730 }, { "epoch": 1.22, "learning_rate": 3.2500000000000002e-06, "loss": 0.4279, "step": 3740 }, { "epoch": 1.22, "learning_rate": 3.125e-06, "loss": 0.1672, "step": 3750 }, { "epoch": 1.22, "learning_rate": 3e-06, "loss": 0.1343, "step": 3760 }, { "epoch": 1.23, "learning_rate": 2.8750000000000004e-06, "loss": 0.1901, "step": 3770 }, { "epoch": 1.23, "learning_rate": 2.7500000000000004e-06, "loss": 0.2439, "step": 3780 }, { "epoch": 1.23, "learning_rate": 2.625e-06, "loss": 0.196, "step": 3790 }, { "epoch": 1.24, "learning_rate": 2.5e-06, "loss": 0.2084, "step": 3800 }, { "epoch": 1.24, "learning_rate": 2.375e-06, "loss": 0.3074, "step": 3810 }, { "epoch": 1.24, "learning_rate": 2.25e-06, "loss": 0.117, "step": 3820 }, { "epoch": 1.25, "learning_rate": 2.1250000000000004e-06, "loss": 0.1888, "step": 3830 }, { "epoch": 1.25, "learning_rate": 2.0000000000000003e-06, "loss": 0.1905, "step": 3840 }, { "epoch": 1.25, "learning_rate": 1.875e-06, "loss": 0.1035, "step": 3850 }, { "epoch": 1.26, "learning_rate": 1.7500000000000002e-06, "loss": 0.1589, "step": 3860 }, { "epoch": 1.26, "learning_rate": 1.6250000000000001e-06, "loss": 0.1249, "step": 3870 }, { "epoch": 1.26, "learning_rate": 1.5e-06, "loss": 0.1453, "step": 3880 }, { "epoch": 1.27, "learning_rate": 1.3750000000000002e-06, "loss": 0.1559, "step": 3890 }, { "epoch": 1.27, "learning_rate": 1.25e-06, "loss": 0.0999, "step": 3900 }, { "epoch": 1.27, "learning_rate": 1.125e-06, "loss": 0.1109, "step": 3910 }, { "epoch": 1.28, "learning_rate": 1.0000000000000002e-06, "loss": 0.1272, "step": 3920 }, { "epoch": 1.28, "learning_rate": 8.750000000000001e-07, "loss": 0.1684, "step": 3930 }, { "epoch": 1.28, "learning_rate": 7.5e-07, "loss": 0.0721, "step": 3940 }, { "epoch": 1.29, "learning_rate": 6.25e-07, "loss": 0.0791, "step": 3950 }, { "epoch": 1.29, "learning_rate": 5.000000000000001e-07, "loss": 0.169, "step": 3960 }, { "epoch": 1.29, "learning_rate": 3.75e-07, "loss": 0.1407, "step": 3970 }, { "epoch": 1.3, "learning_rate": 2.5000000000000004e-07, "loss": 0.1715, "step": 3980 }, { "epoch": 1.3, "learning_rate": 1.2500000000000002e-07, "loss": 0.1107, "step": 3990 }, { "epoch": 1.3, "learning_rate": 0.0, "loss": 0.1829, "step": 4000 }, { "epoch": 1.3, "eval_bleu-4": 0.5180444909128761, "eval_rouge-1": 74.50813, "eval_rouge-2": 52.175689999999996, "eval_rouge-l": 71.20649999999999, "eval_runtime": 4.1977, "eval_samples_per_second": 11.911, "eval_steps_per_second": 0.953, "step": 4000 } ], "logging_steps": 10, "max_steps": 4000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 1.0020007388258304e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }