{ "best_metric": 2.000014066696167, "best_model_checkpoint": "./model_tweets_2020_Q2_full/checkpoint-2400000", "epoch": 2.618552224296455, "eval_steps": 8000, "global_step": 2400000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "eval_loss": 2.1043460369110107, "eval_runtime": 841.4118, "eval_samples_per_second": 917.292, "eval_steps_per_second": 57.331, "step": 8000 }, { "epoch": 0.02, "learning_rate": 4.0726666666666665e-07, "loss": 2.2608, "step": 16000 }, { "epoch": 0.02, "eval_loss": 2.093374013900757, "eval_runtime": 839.5651, "eval_samples_per_second": 919.309, "eval_steps_per_second": 57.457, "step": 16000 }, { "epoch": 0.03, "eval_loss": 2.0861904621124268, "eval_runtime": 840.327, "eval_samples_per_second": 918.476, "eval_steps_per_second": 57.405, "step": 24000 }, { "epoch": 0.03, "learning_rate": 4.0453333333333336e-07, "loss": 2.2409, "step": 32000 }, { "epoch": 0.03, "eval_loss": 2.080547332763672, "eval_runtime": 841.1888, "eval_samples_per_second": 917.535, "eval_steps_per_second": 57.346, "step": 32000 }, { "epoch": 0.04, "eval_loss": 2.079263210296631, "eval_runtime": 841.5773, "eval_samples_per_second": 917.111, "eval_steps_per_second": 57.32, "step": 40000 }, { "epoch": 0.05, "learning_rate": 4.018e-07, "loss": 2.2278, "step": 48000 }, { "epoch": 0.05, "eval_loss": 2.071790933609009, "eval_runtime": 841.5993, "eval_samples_per_second": 917.087, "eval_steps_per_second": 57.318, "step": 48000 }, { "epoch": 0.06, "eval_loss": 2.0752639770507812, "eval_runtime": 841.4195, "eval_samples_per_second": 917.283, "eval_steps_per_second": 57.33, "step": 56000 }, { "epoch": 0.07, "learning_rate": 3.9906666666666667e-07, "loss": 2.2059, "step": 64000 }, { "epoch": 0.07, "eval_loss": 2.0668046474456787, "eval_runtime": 844.6529, "eval_samples_per_second": 913.772, "eval_steps_per_second": 57.111, "step": 64000 }, { "epoch": 0.08, "eval_loss": 2.0657169818878174, "eval_runtime": 844.5291, "eval_samples_per_second": 913.906, "eval_steps_per_second": 57.119, "step": 72000 }, { "epoch": 0.09, "learning_rate": 3.963333333333333e-07, "loss": 2.1997, "step": 80000 }, { "epoch": 0.09, "eval_loss": 2.062004566192627, "eval_runtime": 845.6772, "eval_samples_per_second": 912.665, "eval_steps_per_second": 57.042, "step": 80000 }, { "epoch": 0.1, "eval_loss": 2.0553247928619385, "eval_runtime": 846.4058, "eval_samples_per_second": 911.879, "eval_steps_per_second": 56.993, "step": 88000 }, { "epoch": 0.1, "learning_rate": 3.936e-07, "loss": 2.1988, "step": 96000 }, { "epoch": 0.1, "eval_loss": 2.0569465160369873, "eval_runtime": 841.5807, "eval_samples_per_second": 917.108, "eval_steps_per_second": 57.32, "step": 96000 }, { "epoch": 0.11, "eval_loss": 2.052541971206665, "eval_runtime": 845.3721, "eval_samples_per_second": 912.994, "eval_steps_per_second": 57.062, "step": 104000 }, { "epoch": 0.12, "learning_rate": 3.908666666666667e-07, "loss": 2.1861, "step": 112000 }, { "epoch": 0.12, "eval_loss": 2.05564284324646, "eval_runtime": 847.9385, "eval_samples_per_second": 910.231, "eval_steps_per_second": 56.89, "step": 112000 }, { "epoch": 0.13, "eval_loss": 2.04929256439209, "eval_runtime": 842.77, "eval_samples_per_second": 915.813, "eval_steps_per_second": 57.239, "step": 120000 }, { "epoch": 0.14, "learning_rate": 3.8813333333333334e-07, "loss": 2.1823, "step": 128000 }, { "epoch": 0.14, "eval_loss": 2.0508854389190674, "eval_runtime": 846.1188, "eval_samples_per_second": 912.189, "eval_steps_per_second": 57.012, "step": 128000 }, { "epoch": 0.15, "eval_loss": 2.0460989475250244, "eval_runtime": 845.0745, "eval_samples_per_second": 913.316, "eval_steps_per_second": 57.083, "step": 136000 }, { "epoch": 0.16, "learning_rate": 3.854e-07, "loss": 2.1851, "step": 144000 }, { "epoch": 0.16, "eval_loss": 2.0476059913635254, "eval_runtime": 844.5648, "eval_samples_per_second": 913.867, "eval_steps_per_second": 57.117, "step": 144000 }, { "epoch": 0.17, "eval_loss": 2.04502010345459, "eval_runtime": 845.4437, "eval_samples_per_second": 912.917, "eval_steps_per_second": 57.058, "step": 152000 }, { "epoch": 0.17, "learning_rate": 3.8266666666666665e-07, "loss": 2.1862, "step": 160000 }, { "epoch": 0.17, "eval_loss": 2.046872854232788, "eval_runtime": 843.2469, "eval_samples_per_second": 915.295, "eval_steps_per_second": 57.206, "step": 160000 }, { "epoch": 0.18, "eval_loss": 2.0441744327545166, "eval_runtime": 845.6614, "eval_samples_per_second": 912.682, "eval_steps_per_second": 57.043, "step": 168000 }, { "epoch": 0.19, "learning_rate": 3.799333333333333e-07, "loss": 2.1741, "step": 176000 }, { "epoch": 0.19, "eval_loss": 2.0456435680389404, "eval_runtime": 847.979, "eval_samples_per_second": 910.188, "eval_steps_per_second": 56.887, "step": 176000 }, { "epoch": 0.2, "eval_loss": 2.0441741943359375, "eval_runtime": 846.0243, "eval_samples_per_second": 912.291, "eval_steps_per_second": 57.018, "step": 184000 }, { "epoch": 0.21, "learning_rate": 3.772e-07, "loss": 2.181, "step": 192000 }, { "epoch": 0.21, "eval_loss": 2.040196418762207, "eval_runtime": 851.9304, "eval_samples_per_second": 905.966, "eval_steps_per_second": 56.623, "step": 192000 }, { "epoch": 0.22, "eval_loss": 2.0422918796539307, "eval_runtime": 847.5458, "eval_samples_per_second": 910.653, "eval_steps_per_second": 56.916, "step": 200000 }, { "epoch": 0.23, "learning_rate": 3.7446666666666667e-07, "loss": 2.1692, "step": 208000 }, { "epoch": 0.23, "eval_loss": 2.041342258453369, "eval_runtime": 847.0952, "eval_samples_per_second": 911.137, "eval_steps_per_second": 56.946, "step": 208000 }, { "epoch": 0.24, "eval_loss": 2.0448198318481445, "eval_runtime": 846.694, "eval_samples_per_second": 911.569, "eval_steps_per_second": 56.973, "step": 216000 }, { "epoch": 0.24, "learning_rate": 3.7173333333333333e-07, "loss": 2.1678, "step": 224000 }, { "epoch": 0.24, "eval_loss": 2.0417792797088623, "eval_runtime": 846.2034, "eval_samples_per_second": 912.098, "eval_steps_per_second": 57.006, "step": 224000 }, { "epoch": 0.25, "eval_loss": 2.041692018508911, "eval_runtime": 848.6147, "eval_samples_per_second": 909.506, "eval_steps_per_second": 56.844, "step": 232000 }, { "epoch": 0.26, "learning_rate": 3.69e-07, "loss": 2.1756, "step": 240000 }, { "epoch": 0.26, "eval_loss": 2.034193754196167, "eval_runtime": 847.0585, "eval_samples_per_second": 911.177, "eval_steps_per_second": 56.949, "step": 240000 }, { "epoch": 0.27, "eval_loss": 2.037684202194214, "eval_runtime": 846.2239, "eval_samples_per_second": 912.075, "eval_steps_per_second": 57.005, "step": 248000 }, { "epoch": 0.28, "learning_rate": 3.6626666666666664e-07, "loss": 2.1752, "step": 256000 }, { "epoch": 0.28, "eval_loss": 2.0381019115448, "eval_runtime": 846.7119, "eval_samples_per_second": 911.55, "eval_steps_per_second": 56.972, "step": 256000 }, { "epoch": 0.29, "eval_loss": 2.035405158996582, "eval_runtime": 851.4653, "eval_samples_per_second": 906.461, "eval_steps_per_second": 56.654, "step": 264000 }, { "epoch": 0.3, "learning_rate": 3.6353333333333335e-07, "loss": 2.1673, "step": 272000 }, { "epoch": 0.3, "eval_loss": 2.038097858428955, "eval_runtime": 846.955, "eval_samples_per_second": 911.288, "eval_steps_per_second": 56.956, "step": 272000 }, { "epoch": 0.31, "eval_loss": 2.0375349521636963, "eval_runtime": 846.7581, "eval_samples_per_second": 911.5, "eval_steps_per_second": 56.969, "step": 280000 }, { "epoch": 0.31, "learning_rate": 3.608e-07, "loss": 2.1585, "step": 288000 }, { "epoch": 0.31, "eval_loss": 2.033590078353882, "eval_runtime": 848.1336, "eval_samples_per_second": 910.022, "eval_steps_per_second": 56.877, "step": 288000 }, { "epoch": 0.32, "eval_loss": 2.0344314575195312, "eval_runtime": 847.2304, "eval_samples_per_second": 910.992, "eval_steps_per_second": 56.937, "step": 296000 }, { "epoch": 0.33, "learning_rate": 3.5806666666666666e-07, "loss": 2.1703, "step": 304000 }, { "epoch": 0.33, "eval_loss": 2.034810781478882, "eval_runtime": 846.3544, "eval_samples_per_second": 911.935, "eval_steps_per_second": 56.996, "step": 304000 }, { "epoch": 0.34, "eval_loss": 2.0329954624176025, "eval_runtime": 847.3997, "eval_samples_per_second": 910.81, "eval_steps_per_second": 56.926, "step": 312000 }, { "epoch": 0.35, "learning_rate": 3.553333333333333e-07, "loss": 2.1667, "step": 320000 }, { "epoch": 0.35, "eval_loss": 2.0352213382720947, "eval_runtime": 846.3586, "eval_samples_per_second": 911.93, "eval_steps_per_second": 56.996, "step": 320000 }, { "epoch": 0.36, "eval_loss": 2.0359089374542236, "eval_runtime": 848.8487, "eval_samples_per_second": 909.255, "eval_steps_per_second": 56.829, "step": 328000 }, { "epoch": 0.37, "learning_rate": 3.5259999999999997e-07, "loss": 2.1649, "step": 336000 }, { "epoch": 0.37, "eval_loss": 2.031733512878418, "eval_runtime": 848.6246, "eval_samples_per_second": 909.495, "eval_steps_per_second": 56.844, "step": 336000 }, { "epoch": 0.38, "eval_loss": 2.0314059257507324, "eval_runtime": 851.4761, "eval_samples_per_second": 906.449, "eval_steps_per_second": 56.653, "step": 344000 }, { "epoch": 0.38, "learning_rate": 3.498666666666667e-07, "loss": 2.1564, "step": 352000 }, { "epoch": 0.38, "eval_loss": 2.030597686767578, "eval_runtime": 850.4287, "eval_samples_per_second": 907.566, "eval_steps_per_second": 56.723, "step": 352000 }, { "epoch": 0.39, "eval_loss": 2.029878616333008, "eval_runtime": 850.6967, "eval_samples_per_second": 907.28, "eval_steps_per_second": 56.705, "step": 360000 }, { "epoch": 0.4, "learning_rate": 3.4713333333333333e-07, "loss": 2.161, "step": 368000 }, { "epoch": 0.4, "eval_loss": 2.0317320823669434, "eval_runtime": 851.1347, "eval_samples_per_second": 906.813, "eval_steps_per_second": 56.676, "step": 368000 }, { "epoch": 0.41, "eval_loss": 2.032505989074707, "eval_runtime": 854.4271, "eval_samples_per_second": 903.319, "eval_steps_per_second": 56.458, "step": 376000 }, { "epoch": 0.42, "learning_rate": 3.444e-07, "loss": 2.1551, "step": 384000 }, { "epoch": 0.42, "eval_loss": 2.0273916721343994, "eval_runtime": 850.6274, "eval_samples_per_second": 907.354, "eval_steps_per_second": 56.71, "step": 384000 }, { "epoch": 0.43, "eval_loss": 2.0281741619110107, "eval_runtime": 850.1523, "eval_samples_per_second": 907.861, "eval_steps_per_second": 56.742, "step": 392000 }, { "epoch": 0.44, "learning_rate": 3.416666666666667e-07, "loss": 2.1602, "step": 400000 }, { "epoch": 0.44, "eval_loss": 2.0300543308258057, "eval_runtime": 852.8839, "eval_samples_per_second": 904.953, "eval_steps_per_second": 56.56, "step": 400000 }, { "epoch": 0.45, "eval_loss": 2.0302786827087402, "eval_runtime": 854.7636, "eval_samples_per_second": 902.963, "eval_steps_per_second": 56.435, "step": 408000 }, { "epoch": 0.45, "learning_rate": 3.3893333333333335e-07, "loss": 2.1581, "step": 416000 }, { "epoch": 0.45, "eval_loss": 2.026031732559204, "eval_runtime": 852.2087, "eval_samples_per_second": 905.67, "eval_steps_per_second": 56.605, "step": 416000 }, { "epoch": 0.46, "eval_loss": 2.0248208045959473, "eval_runtime": 850.4117, "eval_samples_per_second": 907.584, "eval_steps_per_second": 56.724, "step": 424000 }, { "epoch": 0.47, "learning_rate": 3.3619999999999995e-07, "loss": 2.1494, "step": 432000 }, { "epoch": 0.47, "eval_loss": 2.026501178741455, "eval_runtime": 848.7671, "eval_samples_per_second": 909.343, "eval_steps_per_second": 56.834, "step": 432000 }, { "epoch": 0.48, "eval_loss": 2.0246880054473877, "eval_runtime": 849.7267, "eval_samples_per_second": 908.316, "eval_steps_per_second": 56.77, "step": 440000 }, { "epoch": 0.49, "learning_rate": 3.3346666666666666e-07, "loss": 2.1508, "step": 448000 }, { "epoch": 0.49, "eval_loss": 2.0231027603149414, "eval_runtime": 849.0484, "eval_samples_per_second": 909.041, "eval_steps_per_second": 56.815, "step": 448000 }, { "epoch": 0.5, "eval_loss": 2.0276315212249756, "eval_runtime": 849.4168, "eval_samples_per_second": 908.647, "eval_steps_per_second": 56.791, "step": 456000 }, { "epoch": 0.51, "learning_rate": 3.307333333333333e-07, "loss": 2.153, "step": 464000 }, { "epoch": 0.51, "eval_loss": 2.0275754928588867, "eval_runtime": 848.4629, "eval_samples_per_second": 909.669, "eval_steps_per_second": 56.855, "step": 464000 }, { "epoch": 0.51, "eval_loss": 2.0241763591766357, "eval_runtime": 849.6091, "eval_samples_per_second": 908.441, "eval_steps_per_second": 56.778, "step": 472000 }, { "epoch": 0.52, "learning_rate": 3.28e-07, "loss": 2.1489, "step": 480000 }, { "epoch": 0.52, "eval_loss": 2.0259480476379395, "eval_runtime": 849.4664, "eval_samples_per_second": 908.594, "eval_steps_per_second": 56.787, "step": 480000 }, { "epoch": 0.53, "eval_loss": 2.025740623474121, "eval_runtime": 850.1732, "eval_samples_per_second": 907.839, "eval_steps_per_second": 56.74, "step": 488000 }, { "epoch": 0.54, "learning_rate": 3.252666666666667e-07, "loss": 2.1468, "step": 496000 }, { "epoch": 0.54, "eval_loss": 2.027461528778076, "eval_runtime": 850.2923, "eval_samples_per_second": 907.711, "eval_steps_per_second": 56.732, "step": 496000 }, { "epoch": 0.55, "eval_loss": 2.030271053314209, "eval_runtime": 851.4114, "eval_samples_per_second": 906.518, "eval_steps_per_second": 56.658, "step": 504000 }, { "epoch": 0.56, "learning_rate": 3.2253333333333334e-07, "loss": 2.1446, "step": 512000 }, { "epoch": 0.56, "eval_loss": 2.0248193740844727, "eval_runtime": 852.1182, "eval_samples_per_second": 905.766, "eval_steps_per_second": 56.611, "step": 512000 }, { "epoch": 0.57, "eval_loss": 2.0285604000091553, "eval_runtime": 849.8013, "eval_samples_per_second": 908.236, "eval_steps_per_second": 56.765, "step": 520000 }, { "epoch": 0.58, "learning_rate": 3.198e-07, "loss": 2.1409, "step": 528000 }, { "epoch": 0.58, "eval_loss": 2.0211498737335205, "eval_runtime": 855.0597, "eval_samples_per_second": 902.65, "eval_steps_per_second": 56.416, "step": 528000 }, { "epoch": 0.58, "eval_loss": 2.0204012393951416, "eval_runtime": 856.0145, "eval_samples_per_second": 901.644, "eval_steps_per_second": 56.353, "step": 536000 }, { "epoch": 0.59, "learning_rate": 3.1706666666666665e-07, "loss": 2.1536, "step": 544000 }, { "epoch": 0.59, "eval_loss": 2.0198850631713867, "eval_runtime": 856.7067, "eval_samples_per_second": 900.915, "eval_steps_per_second": 56.307, "step": 544000 }, { "epoch": 0.6, "eval_loss": 2.0281307697296143, "eval_runtime": 867.0343, "eval_samples_per_second": 890.184, "eval_steps_per_second": 55.637, "step": 552000 }, { "epoch": 0.61, "learning_rate": 3.1433333333333336e-07, "loss": 2.1416, "step": 560000 }, { "epoch": 0.61, "eval_loss": 2.0237483978271484, "eval_runtime": 866.1166, "eval_samples_per_second": 891.127, "eval_steps_per_second": 55.696, "step": 560000 }, { "epoch": 0.62, "eval_loss": 2.0231337547302246, "eval_runtime": 863.3507, "eval_samples_per_second": 893.982, "eval_steps_per_second": 55.874, "step": 568000 }, { "epoch": 0.63, "learning_rate": 3.116e-07, "loss": 2.1502, "step": 576000 }, { "epoch": 0.63, "eval_loss": 2.0205323696136475, "eval_runtime": 857.8171, "eval_samples_per_second": 899.749, "eval_steps_per_second": 56.235, "step": 576000 }, { "epoch": 0.64, "eval_loss": 2.021655559539795, "eval_runtime": 853.6943, "eval_samples_per_second": 904.094, "eval_steps_per_second": 56.506, "step": 584000 }, { "epoch": 0.65, "learning_rate": 3.0886666666666667e-07, "loss": 2.1424, "step": 592000 }, { "epoch": 0.65, "eval_loss": 2.024162769317627, "eval_runtime": 861.2895, "eval_samples_per_second": 896.121, "eval_steps_per_second": 56.008, "step": 592000 }, { "epoch": 0.65, "eval_loss": 2.0237643718719482, "eval_runtime": 859.5317, "eval_samples_per_second": 897.954, "eval_steps_per_second": 56.122, "step": 600000 }, { "epoch": 0.66, "learning_rate": 3.061333333333333e-07, "loss": 2.1469, "step": 608000 }, { "epoch": 0.66, "eval_loss": 2.0191547870635986, "eval_runtime": 855.9495, "eval_samples_per_second": 901.712, "eval_steps_per_second": 56.357, "step": 608000 }, { "epoch": 0.67, "eval_loss": 2.024866819381714, "eval_runtime": 857.0469, "eval_samples_per_second": 900.557, "eval_steps_per_second": 56.285, "step": 616000 }, { "epoch": 0.68, "learning_rate": 3.034e-07, "loss": 2.145, "step": 624000 }, { "epoch": 0.68, "eval_loss": 2.0195770263671875, "eval_runtime": 858.8544, "eval_samples_per_second": 898.662, "eval_steps_per_second": 56.167, "step": 624000 }, { "epoch": 0.69, "eval_loss": 2.022365093231201, "eval_runtime": 854.0414, "eval_samples_per_second": 903.727, "eval_steps_per_second": 56.483, "step": 632000 }, { "epoch": 0.7, "learning_rate": 3.0066666666666663e-07, "loss": 2.1503, "step": 640000 }, { "epoch": 0.7, "eval_loss": 2.0216493606567383, "eval_runtime": 854.8203, "eval_samples_per_second": 902.903, "eval_steps_per_second": 56.432, "step": 640000 }, { "epoch": 0.71, "eval_loss": 2.022836208343506, "eval_runtime": 857.6145, "eval_samples_per_second": 899.962, "eval_steps_per_second": 56.248, "step": 648000 }, { "epoch": 0.72, "learning_rate": 2.9793333333333334e-07, "loss": 2.1355, "step": 656000 }, { "epoch": 0.72, "eval_loss": 2.019666910171509, "eval_runtime": 859.7029, "eval_samples_per_second": 897.775, "eval_steps_per_second": 56.111, "step": 656000 }, { "epoch": 0.72, "eval_loss": 2.0240182876586914, "eval_runtime": 858.0715, "eval_samples_per_second": 899.482, "eval_steps_per_second": 56.218, "step": 664000 }, { "epoch": 0.73, "learning_rate": 2.952e-07, "loss": 2.1392, "step": 672000 }, { "epoch": 0.73, "eval_loss": 2.0232093334198, "eval_runtime": 856.593, "eval_samples_per_second": 901.035, "eval_steps_per_second": 56.315, "step": 672000 }, { "epoch": 0.74, "eval_loss": 2.020932912826538, "eval_runtime": 858.8309, "eval_samples_per_second": 898.687, "eval_steps_per_second": 56.168, "step": 680000 }, { "epoch": 0.75, "learning_rate": 2.9246666666666665e-07, "loss": 2.1378, "step": 688000 }, { "epoch": 0.75, "eval_loss": 2.0219063758850098, "eval_runtime": 860.0126, "eval_samples_per_second": 897.452, "eval_steps_per_second": 56.091, "step": 688000 }, { "epoch": 0.76, "eval_loss": 2.019192695617676, "eval_runtime": 861.8149, "eval_samples_per_second": 895.575, "eval_steps_per_second": 55.974, "step": 696000 }, { "epoch": 0.77, "learning_rate": 2.897333333333333e-07, "loss": 2.1446, "step": 704000 }, { "epoch": 0.77, "eval_loss": 2.0194740295410156, "eval_runtime": 857.8914, "eval_samples_per_second": 899.671, "eval_steps_per_second": 56.23, "step": 704000 }, { "epoch": 0.78, "eval_loss": 2.01971173286438, "eval_runtime": 857.8638, "eval_samples_per_second": 899.7, "eval_steps_per_second": 56.232, "step": 712000 }, { "epoch": 0.79, "learning_rate": 2.8699999999999996e-07, "loss": 2.1351, "step": 720000 }, { "epoch": 0.79, "eval_loss": 2.0183634757995605, "eval_runtime": 857.8713, "eval_samples_per_second": 899.692, "eval_steps_per_second": 56.231, "step": 720000 }, { "epoch": 0.79, "eval_loss": 2.0162270069122314, "eval_runtime": 857.9238, "eval_samples_per_second": 899.637, "eval_steps_per_second": 56.228, "step": 728000 }, { "epoch": 0.8, "learning_rate": 2.8426666666666667e-07, "loss": 2.1437, "step": 736000 }, { "epoch": 0.8, "eval_loss": 2.015068531036377, "eval_runtime": 857.7851, "eval_samples_per_second": 899.783, "eval_steps_per_second": 56.237, "step": 736000 }, { "epoch": 0.81, "eval_loss": 2.0202245712280273, "eval_runtime": 857.6732, "eval_samples_per_second": 899.9, "eval_steps_per_second": 56.244, "step": 744000 }, { "epoch": 0.82, "learning_rate": 2.815333333333333e-07, "loss": 2.1249, "step": 752000 }, { "epoch": 0.82, "eval_loss": 2.0169003009796143, "eval_runtime": 860.8823, "eval_samples_per_second": 896.545, "eval_steps_per_second": 56.034, "step": 752000 }, { "epoch": 0.83, "eval_loss": 2.018857002258301, "eval_runtime": 856.9399, "eval_samples_per_second": 900.67, "eval_steps_per_second": 56.292, "step": 760000 }, { "epoch": 0.84, "learning_rate": 2.7880000000000003e-07, "loss": 2.1355, "step": 768000 }, { "epoch": 0.84, "eval_loss": 2.022115707397461, "eval_runtime": 860.0914, "eval_samples_per_second": 897.37, "eval_steps_per_second": 56.086, "step": 768000 }, { "epoch": 0.85, "eval_loss": 2.0194284915924072, "eval_runtime": 858.1451, "eval_samples_per_second": 899.405, "eval_steps_per_second": 56.213, "step": 776000 }, { "epoch": 0.86, "learning_rate": 2.7606666666666664e-07, "loss": 2.1387, "step": 784000 }, { "epoch": 0.86, "eval_loss": 2.018942356109619, "eval_runtime": 862.7177, "eval_samples_per_second": 894.638, "eval_steps_per_second": 55.915, "step": 784000 }, { "epoch": 0.86, "eval_loss": 2.016535520553589, "eval_runtime": 858.1148, "eval_samples_per_second": 899.437, "eval_steps_per_second": 56.215, "step": 792000 }, { "epoch": 0.87, "learning_rate": 2.733333333333333e-07, "loss": 2.1334, "step": 800000 }, { "epoch": 0.87, "eval_loss": 2.0169451236724854, "eval_runtime": 860.3041, "eval_samples_per_second": 897.148, "eval_steps_per_second": 56.072, "step": 800000 }, { "epoch": 0.88, "eval_loss": 2.0188918113708496, "eval_runtime": 861.004, "eval_samples_per_second": 896.419, "eval_steps_per_second": 56.026, "step": 808000 }, { "epoch": 0.89, "learning_rate": 2.706e-07, "loss": 2.137, "step": 816000 }, { "epoch": 0.89, "eval_loss": 2.016237258911133, "eval_runtime": 862.2544, "eval_samples_per_second": 895.119, "eval_steps_per_second": 55.945, "step": 816000 }, { "epoch": 0.9, "eval_loss": 2.0168325901031494, "eval_runtime": 860.8877, "eval_samples_per_second": 896.54, "eval_steps_per_second": 56.034, "step": 824000 }, { "epoch": 0.91, "learning_rate": 2.6786666666666666e-07, "loss": 2.1331, "step": 832000 }, { "epoch": 0.91, "eval_loss": 2.0192737579345703, "eval_runtime": 859.4597, "eval_samples_per_second": 898.029, "eval_steps_per_second": 56.127, "step": 832000 }, { "epoch": 0.92, "eval_loss": 2.016619920730591, "eval_runtime": 863.1851, "eval_samples_per_second": 894.153, "eval_steps_per_second": 55.885, "step": 840000 }, { "epoch": 0.93, "learning_rate": 2.651333333333333e-07, "loss": 2.1293, "step": 848000 }, { "epoch": 0.93, "eval_loss": 2.013720989227295, "eval_runtime": 863.4541, "eval_samples_per_second": 893.875, "eval_steps_per_second": 55.867, "step": 848000 }, { "epoch": 0.93, "eval_loss": 2.018291711807251, "eval_runtime": 877.742, "eval_samples_per_second": 879.324, "eval_steps_per_second": 54.958, "step": 856000 }, { "epoch": 0.94, "learning_rate": 2.624e-07, "loss": 2.1358, "step": 864000 }, { "epoch": 0.94, "eval_loss": 2.018421173095703, "eval_runtime": 873.6563, "eval_samples_per_second": 883.437, "eval_steps_per_second": 55.215, "step": 864000 }, { "epoch": 0.95, "eval_loss": 2.017104387283325, "eval_runtime": 874.261, "eval_samples_per_second": 882.826, "eval_steps_per_second": 55.177, "step": 872000 }, { "epoch": 0.96, "learning_rate": 2.596666666666667e-07, "loss": 2.1296, "step": 880000 }, { "epoch": 0.96, "eval_loss": 2.0179190635681152, "eval_runtime": 874.7051, "eval_samples_per_second": 882.377, "eval_steps_per_second": 55.149, "step": 880000 }, { "epoch": 0.97, "eval_loss": 2.015188455581665, "eval_runtime": 875.6595, "eval_samples_per_second": 881.416, "eval_steps_per_second": 55.089, "step": 888000 }, { "epoch": 0.98, "learning_rate": 2.5693333333333333e-07, "loss": 2.1319, "step": 896000 }, { "epoch": 0.98, "eval_loss": 2.0173678398132324, "eval_runtime": 877.4749, "eval_samples_per_second": 879.592, "eval_steps_per_second": 54.975, "step": 896000 }, { "epoch": 0.99, "eval_loss": 2.020580291748047, "eval_runtime": 874.219, "eval_samples_per_second": 882.868, "eval_steps_per_second": 55.18, "step": 904000 }, { "epoch": 1.0, "learning_rate": 2.542e-07, "loss": 2.1344, "step": 912000 }, { "epoch": 1.0, "eval_loss": 2.0178616046905518, "eval_runtime": 871.4372, "eval_samples_per_second": 885.686, "eval_steps_per_second": 55.356, "step": 912000 }, { "epoch": 1.0, "eval_loss": 2.0153729915618896, "eval_runtime": 874.8229, "eval_samples_per_second": 882.259, "eval_steps_per_second": 55.141, "step": 920000 }, { "epoch": 1.01, "learning_rate": 2.5146666666666664e-07, "loss": 2.1352, "step": 928000 }, { "epoch": 1.01, "eval_loss": 2.018483877182007, "eval_runtime": 876.0163, "eval_samples_per_second": 881.057, "eval_steps_per_second": 55.066, "step": 928000 }, { "epoch": 1.02, "eval_loss": 2.016976833343506, "eval_runtime": 878.2619, "eval_samples_per_second": 878.804, "eval_steps_per_second": 54.926, "step": 936000 }, { "epoch": 1.03, "learning_rate": 2.4873333333333335e-07, "loss": 2.1336, "step": 944000 }, { "epoch": 1.03, "eval_loss": 2.016388416290283, "eval_runtime": 877.6593, "eval_samples_per_second": 879.407, "eval_steps_per_second": 54.963, "step": 944000 }, { "epoch": 1.04, "eval_loss": 2.013742208480835, "eval_runtime": 871.0407, "eval_samples_per_second": 886.09, "eval_steps_per_second": 55.381, "step": 952000 }, { "epoch": 1.05, "learning_rate": 2.46e-07, "loss": 2.1315, "step": 960000 }, { "epoch": 1.05, "eval_loss": 2.0176327228546143, "eval_runtime": 877.004, "eval_samples_per_second": 880.064, "eval_steps_per_second": 55.004, "step": 960000 }, { "epoch": 1.06, "eval_loss": 2.0155346393585205, "eval_runtime": 872.5922, "eval_samples_per_second": 884.514, "eval_steps_per_second": 55.282, "step": 968000 }, { "epoch": 1.06, "learning_rate": 2.4326666666666666e-07, "loss": 2.1255, "step": 976000 }, { "epoch": 1.06, "eval_loss": 2.014533281326294, "eval_runtime": 871.4139, "eval_samples_per_second": 885.71, "eval_steps_per_second": 55.357, "step": 976000 }, { "epoch": 1.07, "eval_loss": 2.023314952850342, "eval_runtime": 879.3224, "eval_samples_per_second": 877.744, "eval_steps_per_second": 54.859, "step": 984000 }, { "epoch": 1.08, "learning_rate": 2.405333333333333e-07, "loss": 2.1249, "step": 992000 }, { "epoch": 1.08, "eval_loss": 2.0147762298583984, "eval_runtime": 866.8225, "eval_samples_per_second": 890.401, "eval_steps_per_second": 55.65, "step": 992000 }, { "epoch": 1.09, "eval_loss": 2.016249895095825, "eval_runtime": 867.6683, "eval_samples_per_second": 889.533, "eval_steps_per_second": 55.596, "step": 1000000 }, { "epoch": 1.1, "learning_rate": 2.3779999999999997e-07, "loss": 2.123, "step": 1008000 }, { "epoch": 1.1, "eval_loss": 2.017381191253662, "eval_runtime": 868.2141, "eval_samples_per_second": 888.974, "eval_steps_per_second": 55.561, "step": 1008000 }, { "epoch": 1.11, "eval_loss": 2.015009880065918, "eval_runtime": 865.5792, "eval_samples_per_second": 891.68, "eval_steps_per_second": 55.73, "step": 1016000 }, { "epoch": 1.12, "learning_rate": 2.3506666666666668e-07, "loss": 2.1263, "step": 1024000 }, { "epoch": 1.12, "eval_loss": 2.0160863399505615, "eval_runtime": 869.2474, "eval_samples_per_second": 887.917, "eval_steps_per_second": 55.495, "step": 1024000 }, { "epoch": 1.13, "eval_loss": 2.0128889083862305, "eval_runtime": 866.9502, "eval_samples_per_second": 890.27, "eval_steps_per_second": 55.642, "step": 1032000 }, { "epoch": 1.13, "learning_rate": 2.3233333333333334e-07, "loss": 2.1232, "step": 1040000 }, { "epoch": 1.13, "eval_loss": 2.0166754722595215, "eval_runtime": 901.7962, "eval_samples_per_second": 855.87, "eval_steps_per_second": 53.492, "step": 1040000 }, { "epoch": 1.14, "eval_loss": 2.012477397918701, "eval_runtime": 911.6669, "eval_samples_per_second": 846.603, "eval_steps_per_second": 52.913, "step": 1048000 }, { "epoch": 1.15, "learning_rate": 2.2960000000000002e-07, "loss": 2.1168, "step": 1056000 }, { "epoch": 1.15, "eval_loss": 2.0113391876220703, "eval_runtime": 912.2557, "eval_samples_per_second": 846.057, "eval_steps_per_second": 52.879, "step": 1056000 }, { "epoch": 1.16, "eval_loss": 2.013575792312622, "eval_runtime": 901.3301, "eval_samples_per_second": 856.312, "eval_steps_per_second": 53.52, "step": 1064000 }, { "epoch": 1.17, "learning_rate": 2.2686666666666667e-07, "loss": 2.1307, "step": 1072000 }, { "epoch": 1.17, "eval_loss": 2.014338254928589, "eval_runtime": 891.2807, "eval_samples_per_second": 865.967, "eval_steps_per_second": 54.123, "step": 1072000 }, { "epoch": 1.18, "eval_loss": 2.0166401863098145, "eval_runtime": 886.4005, "eval_samples_per_second": 870.735, "eval_steps_per_second": 54.421, "step": 1080000 }, { "epoch": 1.19, "learning_rate": 2.2413333333333333e-07, "loss": 2.1336, "step": 1088000 }, { "epoch": 1.19, "eval_loss": 2.0103185176849365, "eval_runtime": 886.4458, "eval_samples_per_second": 870.691, "eval_steps_per_second": 54.418, "step": 1088000 }, { "epoch": 1.2, "eval_loss": 2.0129764080047607, "eval_runtime": 890.355, "eval_samples_per_second": 866.868, "eval_steps_per_second": 54.18, "step": 1096000 }, { "epoch": 1.2, "learning_rate": 2.214e-07, "loss": 2.1227, "step": 1104000 }, { "epoch": 1.2, "eval_loss": 2.012451648712158, "eval_runtime": 895.3428, "eval_samples_per_second": 862.039, "eval_steps_per_second": 53.878, "step": 1104000 }, { "epoch": 1.21, "eval_loss": 2.0183231830596924, "eval_runtime": 888.3913, "eval_samples_per_second": 868.784, "eval_steps_per_second": 54.299, "step": 1112000 }, { "epoch": 1.22, "learning_rate": 2.1866666666666667e-07, "loss": 2.1223, "step": 1120000 }, { "epoch": 1.22, "eval_loss": 2.014848470687866, "eval_runtime": 889.5583, "eval_samples_per_second": 867.644, "eval_steps_per_second": 54.228, "step": 1120000 }, { "epoch": 1.23, "eval_loss": 2.0147109031677246, "eval_runtime": 884.3146, "eval_samples_per_second": 872.789, "eval_steps_per_second": 54.55, "step": 1128000 }, { "epoch": 1.24, "learning_rate": 2.1593333333333332e-07, "loss": 2.1289, "step": 1136000 }, { "epoch": 1.24, "eval_loss": 2.0108699798583984, "eval_runtime": 888.3584, "eval_samples_per_second": 868.816, "eval_steps_per_second": 54.301, "step": 1136000 }, { "epoch": 1.25, "eval_loss": 2.0163819789886475, "eval_runtime": 887.4195, "eval_samples_per_second": 869.735, "eval_steps_per_second": 54.359, "step": 1144000 }, { "epoch": 1.26, "learning_rate": 2.132e-07, "loss": 2.1278, "step": 1152000 }, { "epoch": 1.26, "eval_loss": 2.0163345336914062, "eval_runtime": 886.1604, "eval_samples_per_second": 870.971, "eval_steps_per_second": 54.436, "step": 1152000 }, { "epoch": 1.27, "eval_loss": 2.012103319168091, "eval_runtime": 889.5174, "eval_samples_per_second": 867.684, "eval_steps_per_second": 54.231, "step": 1160000 }, { "epoch": 1.27, "learning_rate": 2.1046666666666666e-07, "loss": 2.1261, "step": 1168000 }, { "epoch": 1.27, "eval_loss": 2.011343240737915, "eval_runtime": 890.9332, "eval_samples_per_second": 866.305, "eval_steps_per_second": 54.144, "step": 1168000 }, { "epoch": 1.28, "eval_loss": 2.0137104988098145, "eval_runtime": 883.4659, "eval_samples_per_second": 873.627, "eval_steps_per_second": 54.602, "step": 1176000 }, { "epoch": 1.29, "learning_rate": 2.0773333333333334e-07, "loss": 2.126, "step": 1184000 }, { "epoch": 1.29, "eval_loss": 2.015174627304077, "eval_runtime": 885.9678, "eval_samples_per_second": 871.16, "eval_steps_per_second": 54.448, "step": 1184000 }, { "epoch": 1.3, "eval_loss": 2.010411500930786, "eval_runtime": 888.6748, "eval_samples_per_second": 868.507, "eval_steps_per_second": 54.282, "step": 1192000 }, { "epoch": 1.31, "learning_rate": 2.05e-07, "loss": 2.1235, "step": 1200000 }, { "epoch": 1.31, "eval_loss": 2.013165235519409, "eval_runtime": 888.6503, "eval_samples_per_second": 868.531, "eval_steps_per_second": 54.283, "step": 1200000 }, { "epoch": 1.32, "eval_loss": 2.0113847255706787, "eval_runtime": 884.261, "eval_samples_per_second": 872.842, "eval_steps_per_second": 54.553, "step": 1208000 }, { "epoch": 1.33, "learning_rate": 2.0226666666666668e-07, "loss": 2.1229, "step": 1216000 }, { "epoch": 1.33, "eval_loss": 2.010532855987549, "eval_runtime": 887.5065, "eval_samples_per_second": 869.65, "eval_steps_per_second": 54.353, "step": 1216000 }, { "epoch": 1.34, "eval_loss": 2.0130858421325684, "eval_runtime": 881.1399, "eval_samples_per_second": 875.934, "eval_steps_per_second": 54.746, "step": 1224000 }, { "epoch": 1.34, "learning_rate": 1.9953333333333333e-07, "loss": 2.1213, "step": 1232000 }, { "epoch": 1.34, "eval_loss": 2.0141072273254395, "eval_runtime": 882.2467, "eval_samples_per_second": 874.835, "eval_steps_per_second": 54.677, "step": 1232000 }, { "epoch": 1.35, "eval_loss": 2.010868549346924, "eval_runtime": 881.7078, "eval_samples_per_second": 875.369, "eval_steps_per_second": 54.711, "step": 1240000 }, { "epoch": 1.36, "learning_rate": 1.968e-07, "loss": 2.1185, "step": 1248000 }, { "epoch": 1.36, "eval_loss": 2.0129363536834717, "eval_runtime": 886.2455, "eval_samples_per_second": 870.887, "eval_steps_per_second": 54.431, "step": 1248000 }, { "epoch": 1.37, "eval_loss": 2.011003017425537, "eval_runtime": 888.1974, "eval_samples_per_second": 868.974, "eval_steps_per_second": 54.311, "step": 1256000 }, { "epoch": 1.38, "learning_rate": 1.9406666666666667e-07, "loss": 2.131, "step": 1264000 }, { "epoch": 1.38, "eval_loss": 2.01228928565979, "eval_runtime": 884.9282, "eval_samples_per_second": 872.184, "eval_steps_per_second": 54.512, "step": 1264000 }, { "epoch": 1.39, "eval_loss": 2.0104737281799316, "eval_runtime": 881.1611, "eval_samples_per_second": 875.912, "eval_steps_per_second": 54.745, "step": 1272000 }, { "epoch": 1.4, "learning_rate": 1.9133333333333333e-07, "loss": 2.1141, "step": 1280000 }, { "epoch": 1.4, "eval_loss": 2.010425090789795, "eval_runtime": 882.3806, "eval_samples_per_second": 874.702, "eval_steps_per_second": 54.669, "step": 1280000 }, { "epoch": 1.41, "eval_loss": 2.015007734298706, "eval_runtime": 879.3909, "eval_samples_per_second": 877.676, "eval_steps_per_second": 54.855, "step": 1288000 }, { "epoch": 1.41, "learning_rate": 1.886e-07, "loss": 2.1219, "step": 1296000 }, { "epoch": 1.41, "eval_loss": 2.0161073207855225, "eval_runtime": 879.4904, "eval_samples_per_second": 877.576, "eval_steps_per_second": 54.849, "step": 1296000 }, { "epoch": 1.42, "eval_loss": 2.00930118560791, "eval_runtime": 882.5935, "eval_samples_per_second": 874.491, "eval_steps_per_second": 54.656, "step": 1304000 }, { "epoch": 1.43, "learning_rate": 1.8586666666666666e-07, "loss": 2.1203, "step": 1312000 }, { "epoch": 1.43, "eval_loss": 2.0104291439056396, "eval_runtime": 882.9969, "eval_samples_per_second": 874.091, "eval_steps_per_second": 54.631, "step": 1312000 }, { "epoch": 1.44, "eval_loss": 2.0144429206848145, "eval_runtime": 878.5955, "eval_samples_per_second": 878.47, "eval_steps_per_second": 54.905, "step": 1320000 }, { "epoch": 1.45, "learning_rate": 1.8313333333333332e-07, "loss": 2.1264, "step": 1328000 }, { "epoch": 1.45, "eval_loss": 2.0084986686706543, "eval_runtime": 878.8817, "eval_samples_per_second": 878.184, "eval_steps_per_second": 54.887, "step": 1328000 }, { "epoch": 1.46, "eval_loss": 2.0118672847747803, "eval_runtime": 880.8514, "eval_samples_per_second": 876.22, "eval_steps_per_second": 54.764, "step": 1336000 }, { "epoch": 1.47, "learning_rate": 1.804e-07, "loss": 2.1194, "step": 1344000 }, { "epoch": 1.47, "eval_loss": 2.011784076690674, "eval_runtime": 878.874, "eval_samples_per_second": 878.192, "eval_steps_per_second": 54.887, "step": 1344000 }, { "epoch": 1.48, "eval_loss": 2.0109827518463135, "eval_runtime": 893.715, "eval_samples_per_second": 863.609, "eval_steps_per_second": 53.976, "step": 1352000 }, { "epoch": 1.48, "learning_rate": 1.7766666666666666e-07, "loss": 2.117, "step": 1360000 }, { "epoch": 1.48, "eval_loss": 2.014660596847534, "eval_runtime": 915.8924, "eval_samples_per_second": 842.697, "eval_steps_per_second": 52.669, "step": 1360000 }, { "epoch": 1.49, "eval_loss": 2.013535261154175, "eval_runtime": 909.1816, "eval_samples_per_second": 848.917, "eval_steps_per_second": 53.058, "step": 1368000 }, { "epoch": 1.5, "learning_rate": 1.7493333333333334e-07, "loss": 2.1311, "step": 1376000 }, { "epoch": 1.5, "eval_loss": 2.0076611042022705, "eval_runtime": 909.3083, "eval_samples_per_second": 848.799, "eval_steps_per_second": 53.05, "step": 1376000 }, { "epoch": 1.51, "eval_loss": 2.006574869155884, "eval_runtime": 904.8344, "eval_samples_per_second": 852.996, "eval_steps_per_second": 53.313, "step": 1384000 }, { "epoch": 1.52, "learning_rate": 1.722e-07, "loss": 2.1215, "step": 1392000 }, { "epoch": 1.52, "eval_loss": 2.008929967880249, "eval_runtime": 903.4488, "eval_samples_per_second": 854.304, "eval_steps_per_second": 53.394, "step": 1392000 }, { "epoch": 1.53, "eval_loss": 2.0118260383605957, "eval_runtime": 913.6278, "eval_samples_per_second": 844.786, "eval_steps_per_second": 52.799, "step": 1400000 }, { "epoch": 1.54, "learning_rate": 1.6946666666666668e-07, "loss": 2.1185, "step": 1408000 }, { "epoch": 1.54, "eval_loss": 2.0105414390563965, "eval_runtime": 907.6551, "eval_samples_per_second": 850.345, "eval_steps_per_second": 53.147, "step": 1408000 }, { "epoch": 1.54, "eval_loss": 2.012268304824829, "eval_runtime": 903.9952, "eval_samples_per_second": 853.788, "eval_steps_per_second": 53.362, "step": 1416000 }, { "epoch": 1.55, "learning_rate": 1.6673333333333333e-07, "loss": 2.1284, "step": 1424000 }, { "epoch": 1.55, "eval_loss": 2.0133912563323975, "eval_runtime": 910.6028, "eval_samples_per_second": 847.592, "eval_steps_per_second": 52.975, "step": 1424000 }, { "epoch": 1.56, "eval_loss": 2.009307861328125, "eval_runtime": 904.2587, "eval_samples_per_second": 853.539, "eval_steps_per_second": 53.346, "step": 1432000 }, { "epoch": 1.57, "learning_rate": 1.64e-07, "loss": 2.1174, "step": 1440000 }, { "epoch": 1.57, "eval_loss": 2.0101728439331055, "eval_runtime": 912.2693, "eval_samples_per_second": 846.044, "eval_steps_per_second": 52.878, "step": 1440000 }, { "epoch": 1.58, "eval_loss": 2.00759220123291, "eval_runtime": 910.2393, "eval_samples_per_second": 847.931, "eval_steps_per_second": 52.996, "step": 1448000 }, { "epoch": 1.59, "learning_rate": 1.6126666666666667e-07, "loss": 2.1108, "step": 1456000 }, { "epoch": 1.59, "eval_loss": 2.00740909576416, "eval_runtime": 914.6796, "eval_samples_per_second": 843.815, "eval_steps_per_second": 52.739, "step": 1456000 }, { "epoch": 1.6, "eval_loss": 2.007056474685669, "eval_runtime": 908.0025, "eval_samples_per_second": 850.02, "eval_steps_per_second": 53.127, "step": 1464000 }, { "epoch": 1.61, "learning_rate": 1.5853333333333332e-07, "loss": 2.1252, "step": 1472000 }, { "epoch": 1.61, "eval_loss": 2.0092082023620605, "eval_runtime": 905.6872, "eval_samples_per_second": 852.193, "eval_steps_per_second": 53.262, "step": 1472000 }, { "epoch": 1.61, "eval_loss": 2.007967233657837, "eval_runtime": 910.9272, "eval_samples_per_second": 847.291, "eval_steps_per_second": 52.956, "step": 1480000 }, { "epoch": 1.62, "learning_rate": 1.558e-07, "loss": 2.121, "step": 1488000 }, { "epoch": 1.62, "eval_loss": 2.0052874088287354, "eval_runtime": 908.8472, "eval_samples_per_second": 849.23, "eval_steps_per_second": 53.077, "step": 1488000 }, { "epoch": 1.63, "eval_loss": 2.0071661472320557, "eval_runtime": 907.693, "eval_samples_per_second": 850.31, "eval_steps_per_second": 53.145, "step": 1496000 }, { "epoch": 1.64, "learning_rate": 1.5306666666666666e-07, "loss": 2.1178, "step": 1504000 }, { "epoch": 1.64, "eval_loss": 2.0059070587158203, "eval_runtime": 908.356, "eval_samples_per_second": 849.689, "eval_steps_per_second": 53.106, "step": 1504000 }, { "epoch": 1.65, "eval_loss": 2.00836443901062, "eval_runtime": 908.0246, "eval_samples_per_second": 849.999, "eval_steps_per_second": 53.125, "step": 1512000 }, { "epoch": 1.66, "learning_rate": 1.5033333333333332e-07, "loss": 2.1154, "step": 1520000 }, { "epoch": 1.66, "eval_loss": 2.0105550289154053, "eval_runtime": 903.6608, "eval_samples_per_second": 854.104, "eval_steps_per_second": 53.382, "step": 1520000 }, { "epoch": 1.67, "eval_loss": 2.0116729736328125, "eval_runtime": 909.1515, "eval_samples_per_second": 848.945, "eval_steps_per_second": 53.059, "step": 1528000 }, { "epoch": 1.68, "learning_rate": 1.476e-07, "loss": 2.1214, "step": 1536000 }, { "epoch": 1.68, "eval_loss": 2.006955146789551, "eval_runtime": 907.2355, "eval_samples_per_second": 850.738, "eval_steps_per_second": 53.171, "step": 1536000 }, { "epoch": 1.68, "eval_loss": 2.0078775882720947, "eval_runtime": 908.5609, "eval_samples_per_second": 849.497, "eval_steps_per_second": 53.094, "step": 1544000 }, { "epoch": 1.69, "learning_rate": 1.4486666666666665e-07, "loss": 2.1175, "step": 1552000 }, { "epoch": 1.69, "eval_loss": 2.0101876258850098, "eval_runtime": 901.3076, "eval_samples_per_second": 856.334, "eval_steps_per_second": 53.521, "step": 1552000 }, { "epoch": 1.7, "eval_loss": 2.009697675704956, "eval_runtime": 906.1011, "eval_samples_per_second": 851.803, "eval_steps_per_second": 53.238, "step": 1560000 }, { "epoch": 1.71, "learning_rate": 1.4213333333333334e-07, "loss": 2.1206, "step": 1568000 }, { "epoch": 1.71, "eval_loss": 2.0092358589172363, "eval_runtime": 901.2376, "eval_samples_per_second": 856.4, "eval_steps_per_second": 53.525, "step": 1568000 }, { "epoch": 1.72, "eval_loss": 2.005527973175049, "eval_runtime": 896.3075, "eval_samples_per_second": 861.111, "eval_steps_per_second": 53.82, "step": 1576000 }, { "epoch": 1.73, "learning_rate": 1.3940000000000002e-07, "loss": 2.1302, "step": 1584000 }, { "epoch": 1.73, "eval_loss": 2.008502244949341, "eval_runtime": 899.3251, "eval_samples_per_second": 858.221, "eval_steps_per_second": 53.639, "step": 1584000 }, { "epoch": 1.74, "eval_loss": 2.0109806060791016, "eval_runtime": 906.7205, "eval_samples_per_second": 851.222, "eval_steps_per_second": 53.202, "step": 1592000 }, { "epoch": 1.75, "learning_rate": 1.3666666666666665e-07, "loss": 2.1177, "step": 1600000 }, { "epoch": 1.75, "eval_loss": 2.006521701812744, "eval_runtime": 898.4764, "eval_samples_per_second": 859.032, "eval_steps_per_second": 53.69, "step": 1600000 }, { "epoch": 1.75, "eval_loss": 2.0131704807281494, "eval_runtime": 906.0839, "eval_samples_per_second": 851.82, "eval_steps_per_second": 53.239, "step": 1608000 }, { "epoch": 1.76, "learning_rate": 1.3393333333333333e-07, "loss": 2.1101, "step": 1616000 }, { "epoch": 1.76, "eval_loss": 2.0085511207580566, "eval_runtime": 896.2709, "eval_samples_per_second": 861.146, "eval_steps_per_second": 53.822, "step": 1616000 }, { "epoch": 1.77, "eval_loss": 2.0077245235443115, "eval_runtime": 897.3988, "eval_samples_per_second": 860.064, "eval_steps_per_second": 53.754, "step": 1624000 }, { "epoch": 1.78, "learning_rate": 1.312e-07, "loss": 2.1194, "step": 1632000 }, { "epoch": 1.78, "eval_loss": 2.008148431777954, "eval_runtime": 896.5575, "eval_samples_per_second": 860.871, "eval_steps_per_second": 53.805, "step": 1632000 }, { "epoch": 1.79, "eval_loss": 2.008798122406006, "eval_runtime": 897.2787, "eval_samples_per_second": 860.179, "eval_steps_per_second": 53.761, "step": 1640000 }, { "epoch": 1.8, "learning_rate": 1.2846666666666667e-07, "loss": 2.1167, "step": 1648000 }, { "epoch": 1.8, "eval_loss": 2.002239942550659, "eval_runtime": 893.5655, "eval_samples_per_second": 863.753, "eval_steps_per_second": 53.985, "step": 1648000 }, { "epoch": 1.81, "eval_loss": 2.007662296295166, "eval_runtime": 895.7141, "eval_samples_per_second": 861.681, "eval_steps_per_second": 53.855, "step": 1656000 }, { "epoch": 1.82, "learning_rate": 1.2573333333333332e-07, "loss": 2.1083, "step": 1664000 }, { "epoch": 1.82, "eval_loss": 2.0065953731536865, "eval_runtime": 890.9713, "eval_samples_per_second": 866.268, "eval_steps_per_second": 54.142, "step": 1664000 }, { "epoch": 1.82, "eval_loss": 2.0137040615081787, "eval_runtime": 885.7627, "eval_samples_per_second": 871.362, "eval_steps_per_second": 54.46, "step": 1672000 }, { "epoch": 1.83, "learning_rate": 1.23e-07, "loss": 2.1232, "step": 1680000 }, { "epoch": 1.83, "eval_loss": 2.0067014694213867, "eval_runtime": 890.51, "eval_samples_per_second": 866.717, "eval_steps_per_second": 54.17, "step": 1680000 }, { "epoch": 1.84, "eval_loss": 2.0039150714874268, "eval_runtime": 889.3586, "eval_samples_per_second": 867.839, "eval_steps_per_second": 54.24, "step": 1688000 }, { "epoch": 1.85, "learning_rate": 1.2026666666666666e-07, "loss": 2.1212, "step": 1696000 }, { "epoch": 1.85, "eval_loss": 2.008970022201538, "eval_runtime": 893.785, "eval_samples_per_second": 863.541, "eval_steps_per_second": 53.972, "step": 1696000 }, { "epoch": 1.86, "eval_loss": 2.0079498291015625, "eval_runtime": 882.5613, "eval_samples_per_second": 874.523, "eval_steps_per_second": 54.658, "step": 1704000 }, { "epoch": 1.87, "learning_rate": 1.1753333333333334e-07, "loss": 2.1246, "step": 1712000 }, { "epoch": 1.87, "eval_loss": 2.0082814693450928, "eval_runtime": 886.133, "eval_samples_per_second": 870.998, "eval_steps_per_second": 54.438, "step": 1712000 }, { "epoch": 1.88, "eval_loss": 2.003898859024048, "eval_runtime": 887.1853, "eval_samples_per_second": 869.965, "eval_steps_per_second": 54.373, "step": 1720000 }, { "epoch": 1.89, "learning_rate": 1.1480000000000001e-07, "loss": 2.1129, "step": 1728000 }, { "epoch": 1.89, "eval_loss": 2.0069074630737305, "eval_runtime": 891.3907, "eval_samples_per_second": 865.86, "eval_steps_per_second": 54.117, "step": 1728000 }, { "epoch": 1.89, "eval_loss": 2.007922410964966, "eval_runtime": 884.1175, "eval_samples_per_second": 872.984, "eval_steps_per_second": 54.562, "step": 1736000 }, { "epoch": 1.9, "learning_rate": 1.1206666666666666e-07, "loss": 2.1209, "step": 1744000 }, { "epoch": 1.9, "eval_loss": 2.00584077835083, "eval_runtime": 888.6359, "eval_samples_per_second": 868.545, "eval_steps_per_second": 54.284, "step": 1744000 }, { "epoch": 1.91, "eval_loss": 2.0071957111358643, "eval_runtime": 891.8674, "eval_samples_per_second": 865.398, "eval_steps_per_second": 54.088, "step": 1752000 }, { "epoch": 1.92, "learning_rate": 1.0933333333333333e-07, "loss": 2.1209, "step": 1760000 }, { "epoch": 1.92, "eval_loss": 2.0067615509033203, "eval_runtime": 884.8141, "eval_samples_per_second": 872.296, "eval_steps_per_second": 54.519, "step": 1760000 }, { "epoch": 1.93, "eval_loss": 2.0078628063201904, "eval_runtime": 888.3025, "eval_samples_per_second": 868.871, "eval_steps_per_second": 54.305, "step": 1768000 }, { "epoch": 1.94, "learning_rate": 1.066e-07, "loss": 2.1184, "step": 1776000 }, { "epoch": 1.94, "eval_loss": 2.0036442279815674, "eval_runtime": 887.5766, "eval_samples_per_second": 869.581, "eval_steps_per_second": 54.349, "step": 1776000 }, { "epoch": 1.95, "eval_loss": 2.0064985752105713, "eval_runtime": 890.3705, "eval_samples_per_second": 866.853, "eval_steps_per_second": 54.179, "step": 1784000 }, { "epoch": 1.96, "learning_rate": 1.0386666666666667e-07, "loss": 2.1065, "step": 1792000 }, { "epoch": 1.96, "eval_loss": 2.007737159729004, "eval_runtime": 889.1985, "eval_samples_per_second": 867.995, "eval_steps_per_second": 54.25, "step": 1792000 }, { "epoch": 1.96, "eval_loss": 2.006197452545166, "eval_runtime": 889.8901, "eval_samples_per_second": 867.321, "eval_steps_per_second": 54.208, "step": 1800000 }, { "epoch": 1.97, "learning_rate": 1.0113333333333334e-07, "loss": 2.109, "step": 1808000 }, { "epoch": 1.97, "eval_loss": 2.0090434551239014, "eval_runtime": 888.3297, "eval_samples_per_second": 868.844, "eval_steps_per_second": 54.303, "step": 1808000 }, { "epoch": 1.98, "eval_loss": 2.012356758117676, "eval_runtime": 893.3256, "eval_samples_per_second": 863.985, "eval_steps_per_second": 53.999, "step": 1816000 }, { "epoch": 1.99, "learning_rate": 9.84e-08, "loss": 2.1081, "step": 1824000 }, { "epoch": 1.99, "eval_loss": 2.0065596103668213, "eval_runtime": 893.6122, "eval_samples_per_second": 863.708, "eval_steps_per_second": 53.982, "step": 1824000 }, { "epoch": 2.0, "eval_loss": 2.008080005645752, "eval_runtime": 891.4247, "eval_samples_per_second": 865.828, "eval_steps_per_second": 54.115, "step": 1832000 }, { "epoch": 2.01, "learning_rate": 9.566666666666666e-08, "loss": 2.1151, "step": 1840000 }, { "epoch": 2.01, "eval_loss": 2.008512258529663, "eval_runtime": 884.9554, "eval_samples_per_second": 872.157, "eval_steps_per_second": 54.51, "step": 1840000 }, { "epoch": 2.02, "eval_loss": 2.0054173469543457, "eval_runtime": 886.9049, "eval_samples_per_second": 870.24, "eval_steps_per_second": 54.39, "step": 1848000 }, { "epoch": 2.03, "learning_rate": 9.293333333333333e-08, "loss": 2.1178, "step": 1856000 }, { "epoch": 2.03, "eval_loss": 2.005777359008789, "eval_runtime": 886.5315, "eval_samples_per_second": 870.606, "eval_steps_per_second": 54.413, "step": 1856000 }, { "epoch": 2.03, "eval_loss": 2.0048415660858154, "eval_runtime": 893.5519, "eval_samples_per_second": 863.766, "eval_steps_per_second": 53.986, "step": 1864000 }, { "epoch": 2.04, "learning_rate": 9.02e-08, "loss": 2.1035, "step": 1872000 }, { "epoch": 2.04, "eval_loss": 2.004007339477539, "eval_runtime": 890.5358, "eval_samples_per_second": 866.692, "eval_steps_per_second": 54.169, "step": 1872000 }, { "epoch": 2.05, "eval_loss": 2.0059244632720947, "eval_runtime": 887.0437, "eval_samples_per_second": 870.104, "eval_steps_per_second": 54.382, "step": 1880000 }, { "epoch": 2.06, "learning_rate": 8.746666666666667e-08, "loss": 2.1197, "step": 1888000 }, { "epoch": 2.06, "eval_loss": 2.0071017742156982, "eval_runtime": 889.191, "eval_samples_per_second": 868.003, "eval_steps_per_second": 54.25, "step": 1888000 }, { "epoch": 2.07, "eval_loss": 2.005682945251465, "eval_runtime": 888.8818, "eval_samples_per_second": 868.304, "eval_steps_per_second": 54.269, "step": 1896000 }, { "epoch": 2.08, "learning_rate": 8.473333333333334e-08, "loss": 2.1143, "step": 1904000 }, { "epoch": 2.08, "eval_loss": 2.005943536758423, "eval_runtime": 884.5437, "eval_samples_per_second": 872.563, "eval_steps_per_second": 54.535, "step": 1904000 }, { "epoch": 2.09, "eval_loss": 2.0042991638183594, "eval_runtime": 884.1715, "eval_samples_per_second": 872.93, "eval_steps_per_second": 54.558, "step": 1912000 }, { "epoch": 2.09, "learning_rate": 8.2e-08, "loss": 2.1082, "step": 1920000 }, { "epoch": 2.09, "eval_loss": 2.0067648887634277, "eval_runtime": 885.4828, "eval_samples_per_second": 871.637, "eval_steps_per_second": 54.478, "step": 1920000 }, { "epoch": 2.1, "eval_loss": 2.0057313442230225, "eval_runtime": 887.8665, "eval_samples_per_second": 869.297, "eval_steps_per_second": 54.331, "step": 1928000 }, { "epoch": 2.11, "learning_rate": 7.926666666666666e-08, "loss": 2.1202, "step": 1936000 }, { "epoch": 2.11, "eval_loss": 2.007241725921631, "eval_runtime": 885.5971, "eval_samples_per_second": 871.525, "eval_steps_per_second": 54.471, "step": 1936000 }, { "epoch": 2.12, "eval_loss": 2.0057430267333984, "eval_runtime": 888.4045, "eval_samples_per_second": 868.771, "eval_steps_per_second": 54.298, "step": 1944000 }, { "epoch": 2.13, "learning_rate": 7.653333333333333e-08, "loss": 2.1138, "step": 1952000 }, { "epoch": 2.13, "eval_loss": 2.0051097869873047, "eval_runtime": 889.7536, "eval_samples_per_second": 867.454, "eval_steps_per_second": 54.216, "step": 1952000 }, { "epoch": 2.14, "eval_loss": 2.008528709411621, "eval_runtime": 887.8548, "eval_samples_per_second": 869.309, "eval_steps_per_second": 54.332, "step": 1960000 }, { "epoch": 2.15, "learning_rate": 7.38e-08, "loss": 2.1082, "step": 1968000 }, { "epoch": 2.15, "eval_loss": 2.007629871368408, "eval_runtime": 886.2101, "eval_samples_per_second": 870.922, "eval_steps_per_second": 54.433, "step": 1968000 }, { "epoch": 2.16, "eval_loss": 2.0076658725738525, "eval_runtime": 886.4111, "eval_samples_per_second": 870.725, "eval_steps_per_second": 54.421, "step": 1976000 }, { "epoch": 2.16, "learning_rate": 7.106666666666667e-08, "loss": 2.1084, "step": 1984000 }, { "epoch": 2.16, "eval_loss": 2.001997470855713, "eval_runtime": 885.1567, "eval_samples_per_second": 871.959, "eval_steps_per_second": 54.498, "step": 1984000 }, { "epoch": 2.17, "eval_loss": 2.005009651184082, "eval_runtime": 889.5629, "eval_samples_per_second": 867.64, "eval_steps_per_second": 54.228, "step": 1992000 }, { "epoch": 2.18, "learning_rate": 6.833333333333332e-08, "loss": 2.1151, "step": 2000000 }, { "epoch": 2.18, "eval_loss": 2.0065817832946777, "eval_runtime": 885.7641, "eval_samples_per_second": 871.361, "eval_steps_per_second": 54.46, "step": 2000000 }, { "epoch": 2.19, "eval_loss": 2.003136396408081, "eval_runtime": 886.578, "eval_samples_per_second": 870.561, "eval_steps_per_second": 54.41, "step": 2008000 }, { "epoch": 2.2, "learning_rate": 6.56e-08, "loss": 2.1141, "step": 2016000 }, { "epoch": 2.2, "eval_loss": 2.0128238201141357, "eval_runtime": 891.0219, "eval_samples_per_second": 866.219, "eval_steps_per_second": 54.139, "step": 2016000 }, { "epoch": 2.21, "eval_loss": 2.0021839141845703, "eval_runtime": 895.8435, "eval_samples_per_second": 861.557, "eval_steps_per_second": 53.848, "step": 2024000 }, { "epoch": 2.22, "learning_rate": 6.286666666666666e-08, "loss": 2.1129, "step": 2032000 }, { "epoch": 2.22, "eval_loss": 2.0065131187438965, "eval_runtime": 890.2528, "eval_samples_per_second": 866.967, "eval_steps_per_second": 54.186, "step": 2032000 }, { "epoch": 2.23, "eval_loss": 2.005363941192627, "eval_runtime": 890.9681, "eval_samples_per_second": 866.271, "eval_steps_per_second": 54.142, "step": 2040000 }, { "epoch": 2.23, "learning_rate": 6.013333333333333e-08, "loss": 2.1164, "step": 2048000 }, { "epoch": 2.23, "eval_loss": 2.0038933753967285, "eval_runtime": 892.3995, "eval_samples_per_second": 864.882, "eval_steps_per_second": 54.055, "step": 2048000 }, { "epoch": 2.24, "eval_loss": 2.003117561340332, "eval_runtime": 894.495, "eval_samples_per_second": 862.856, "eval_steps_per_second": 53.929, "step": 2056000 }, { "epoch": 2.25, "learning_rate": 5.7400000000000004e-08, "loss": 2.1121, "step": 2064000 }, { "epoch": 2.25, "eval_loss": 2.0101029872894287, "eval_runtime": 886.6646, "eval_samples_per_second": 870.476, "eval_steps_per_second": 54.405, "step": 2064000 }, { "epoch": 2.26, "eval_loss": 2.0098650455474854, "eval_runtime": 887.3882, "eval_samples_per_second": 869.766, "eval_steps_per_second": 54.361, "step": 2072000 }, { "epoch": 2.27, "learning_rate": 5.4666666666666666e-08, "loss": 2.1071, "step": 2080000 }, { "epoch": 2.27, "eval_loss": 2.0041701793670654, "eval_runtime": 891.5578, "eval_samples_per_second": 865.698, "eval_steps_per_second": 54.106, "step": 2080000 }, { "epoch": 2.28, "eval_loss": 2.0030367374420166, "eval_runtime": 886.7055, "eval_samples_per_second": 870.436, "eval_steps_per_second": 54.403, "step": 2088000 }, { "epoch": 2.29, "learning_rate": 5.1933333333333335e-08, "loss": 2.1094, "step": 2096000 }, { "epoch": 2.29, "eval_loss": 2.00482439994812, "eval_runtime": 887.8886, "eval_samples_per_second": 869.276, "eval_steps_per_second": 54.33, "step": 2096000 }, { "epoch": 2.3, "eval_loss": 2.004595994949341, "eval_runtime": 887.4455, "eval_samples_per_second": 869.71, "eval_steps_per_second": 54.357, "step": 2104000 }, { "epoch": 2.3, "learning_rate": 4.92e-08, "loss": 2.1017, "step": 2112000 }, { "epoch": 2.3, "eval_loss": 2.0038633346557617, "eval_runtime": 888.4121, "eval_samples_per_second": 868.764, "eval_steps_per_second": 54.298, "step": 2112000 }, { "epoch": 2.31, "eval_loss": 2.0011472702026367, "eval_runtime": 889.7748, "eval_samples_per_second": 867.433, "eval_steps_per_second": 54.215, "step": 2120000 }, { "epoch": 2.32, "learning_rate": 4.6466666666666666e-08, "loss": 2.1124, "step": 2128000 }, { "epoch": 2.32, "eval_loss": 2.007091522216797, "eval_runtime": 892.2658, "eval_samples_per_second": 865.011, "eval_steps_per_second": 54.063, "step": 2128000 }, { "epoch": 2.33, "eval_loss": 2.0060718059539795, "eval_runtime": 887.502, "eval_samples_per_second": 869.654, "eval_steps_per_second": 54.354, "step": 2136000 }, { "epoch": 2.34, "learning_rate": 4.3733333333333335e-08, "loss": 2.1064, "step": 2144000 }, { "epoch": 2.34, "eval_loss": 2.0040297508239746, "eval_runtime": 888.8512, "eval_samples_per_second": 868.334, "eval_steps_per_second": 54.271, "step": 2144000 }, { "epoch": 2.35, "eval_loss": 2.007528066635132, "eval_runtime": 895.8909, "eval_samples_per_second": 861.511, "eval_steps_per_second": 53.845, "step": 2152000 }, { "epoch": 2.36, "learning_rate": 4.1e-08, "loss": 2.115, "step": 2160000 }, { "epoch": 2.36, "eval_loss": 2.0025811195373535, "eval_runtime": 894.6822, "eval_samples_per_second": 862.675, "eval_steps_per_second": 53.917, "step": 2160000 }, { "epoch": 2.37, "eval_loss": 2.006788492202759, "eval_runtime": 885.9111, "eval_samples_per_second": 871.216, "eval_steps_per_second": 54.451, "step": 2168000 }, { "epoch": 2.37, "learning_rate": 3.8266666666666665e-08, "loss": 2.114, "step": 2176000 }, { "epoch": 2.37, "eval_loss": 2.006558418273926, "eval_runtime": 889.8092, "eval_samples_per_second": 867.399, "eval_steps_per_second": 54.213, "step": 2176000 }, { "epoch": 2.38, "eval_loss": 2.0079538822174072, "eval_runtime": 889.2248, "eval_samples_per_second": 867.97, "eval_steps_per_second": 54.248, "step": 2184000 }, { "epoch": 2.39, "learning_rate": 3.5533333333333334e-08, "loss": 2.1171, "step": 2192000 }, { "epoch": 2.39, "eval_loss": 2.0031957626342773, "eval_runtime": 891.062, "eval_samples_per_second": 866.18, "eval_steps_per_second": 54.137, "step": 2192000 }, { "epoch": 2.4, "eval_loss": 2.0036396980285645, "eval_runtime": 889.4858, "eval_samples_per_second": 867.715, "eval_steps_per_second": 54.232, "step": 2200000 }, { "epoch": 2.41, "learning_rate": 3.28e-08, "loss": 2.1119, "step": 2208000 }, { "epoch": 2.41, "eval_loss": 2.004848003387451, "eval_runtime": 890.2659, "eval_samples_per_second": 866.954, "eval_steps_per_second": 54.185, "step": 2208000 }, { "epoch": 2.42, "eval_loss": 2.0058629512786865, "eval_runtime": 890.6135, "eval_samples_per_second": 866.616, "eval_steps_per_second": 54.164, "step": 2216000 }, { "epoch": 2.43, "learning_rate": 3.0066666666666665e-08, "loss": 2.1097, "step": 2224000 }, { "epoch": 2.43, "eval_loss": 2.005845546722412, "eval_runtime": 889.9256, "eval_samples_per_second": 867.286, "eval_steps_per_second": 54.206, "step": 2224000 }, { "epoch": 2.44, "eval_loss": 2.004934310913086, "eval_runtime": 893.1468, "eval_samples_per_second": 864.158, "eval_steps_per_second": 54.01, "step": 2232000 }, { "epoch": 2.44, "learning_rate": 2.7333333333333333e-08, "loss": 2.1091, "step": 2240000 }, { "epoch": 2.44, "eval_loss": 2.005760669708252, "eval_runtime": 893.6832, "eval_samples_per_second": 863.639, "eval_steps_per_second": 53.978, "step": 2240000 }, { "epoch": 2.45, "eval_loss": 2.0032405853271484, "eval_runtime": 894.8171, "eval_samples_per_second": 862.545, "eval_steps_per_second": 53.909, "step": 2248000 }, { "epoch": 2.46, "learning_rate": 2.46e-08, "loss": 2.1107, "step": 2256000 }, { "epoch": 2.46, "eval_loss": 2.00769305229187, "eval_runtime": 893.4774, "eval_samples_per_second": 863.838, "eval_steps_per_second": 53.99, "step": 2256000 }, { "epoch": 2.47, "eval_loss": 2.0032243728637695, "eval_runtime": 893.6019, "eval_samples_per_second": 863.718, "eval_steps_per_second": 53.983, "step": 2264000 }, { "epoch": 2.48, "learning_rate": 2.1866666666666667e-08, "loss": 2.1126, "step": 2272000 }, { "epoch": 2.48, "eval_loss": 2.0055274963378906, "eval_runtime": 891.7304, "eval_samples_per_second": 865.531, "eval_steps_per_second": 54.096, "step": 2272000 }, { "epoch": 2.49, "eval_loss": 2.002612590789795, "eval_runtime": 892.1014, "eval_samples_per_second": 865.171, "eval_steps_per_second": 54.073, "step": 2280000 }, { "epoch": 2.5, "learning_rate": 1.9133333333333333e-08, "loss": 2.1173, "step": 2288000 }, { "epoch": 2.5, "eval_loss": 2.0062429904937744, "eval_runtime": 891.9249, "eval_samples_per_second": 865.342, "eval_steps_per_second": 54.084, "step": 2288000 }, { "epoch": 2.51, "eval_loss": 2.003859043121338, "eval_runtime": 892.8008, "eval_samples_per_second": 864.493, "eval_steps_per_second": 54.031, "step": 2296000 }, { "epoch": 2.51, "learning_rate": 1.64e-08, "loss": 2.114, "step": 2304000 }, { "epoch": 2.51, "eval_loss": 2.006359100341797, "eval_runtime": 891.1547, "eval_samples_per_second": 866.09, "eval_steps_per_second": 54.131, "step": 2304000 }, { "epoch": 2.52, "eval_loss": 2.0113308429718018, "eval_runtime": 890.136, "eval_samples_per_second": 867.081, "eval_steps_per_second": 54.193, "step": 2312000 }, { "epoch": 2.53, "learning_rate": 1.3666666666666667e-08, "loss": 2.1131, "step": 2320000 }, { "epoch": 2.53, "eval_loss": 2.0065314769744873, "eval_runtime": 890.6924, "eval_samples_per_second": 866.539, "eval_steps_per_second": 54.159, "step": 2320000 }, { "epoch": 2.54, "eval_loss": 2.0098392963409424, "eval_runtime": 892.2668, "eval_samples_per_second": 865.01, "eval_steps_per_second": 54.063, "step": 2328000 }, { "epoch": 2.55, "learning_rate": 1.0933333333333334e-08, "loss": 2.1045, "step": 2336000 }, { "epoch": 2.55, "eval_loss": 2.0060501098632812, "eval_runtime": 891.9301, "eval_samples_per_second": 865.337, "eval_steps_per_second": 54.084, "step": 2336000 }, { "epoch": 2.56, "eval_loss": 2.006572961807251, "eval_runtime": 894.7549, "eval_samples_per_second": 862.605, "eval_steps_per_second": 53.913, "step": 2344000 }, { "epoch": 2.57, "learning_rate": 8.2e-09, "loss": 2.1144, "step": 2352000 }, { "epoch": 2.57, "eval_loss": 2.006028175354004, "eval_runtime": 899.347, "eval_samples_per_second": 858.2, "eval_steps_per_second": 53.638, "step": 2352000 }, { "epoch": 2.57, "eval_loss": 2.00589656829834, "eval_runtime": 893.5452, "eval_samples_per_second": 863.773, "eval_steps_per_second": 53.986, "step": 2360000 }, { "epoch": 2.58, "learning_rate": 5.466666666666667e-09, "loss": 2.1086, "step": 2368000 }, { "epoch": 2.58, "eval_loss": 2.0038540363311768, "eval_runtime": 893.2561, "eval_samples_per_second": 864.052, "eval_steps_per_second": 54.004, "step": 2368000 }, { "epoch": 2.59, "eval_loss": 2.0076115131378174, "eval_runtime": 895.0756, "eval_samples_per_second": 862.296, "eval_steps_per_second": 53.894, "step": 2376000 }, { "epoch": 2.6, "learning_rate": 2.7333333333333334e-09, "loss": 2.1058, "step": 2384000 }, { "epoch": 2.6, "eval_loss": 2.0035552978515625, "eval_runtime": 895.3228, "eval_samples_per_second": 862.058, "eval_steps_per_second": 53.879, "step": 2384000 }, { "epoch": 2.61, "eval_loss": 2.0077223777770996, "eval_runtime": 896.1834, "eval_samples_per_second": 861.23, "eval_steps_per_second": 53.827, "step": 2392000 }, { "epoch": 2.62, "learning_rate": 0.0, "loss": 2.1112, "step": 2400000 }, { "epoch": 2.62, "eval_loss": 2.000014066696167, "eval_runtime": 893.9091, "eval_samples_per_second": 863.421, "eval_steps_per_second": 53.964, "step": 2400000 }, { "epoch": 2.62, "step": 2400000, "total_flos": 7.571300080769916e+17, "train_loss": 2.133689431966146, "train_runtime": 416842.919, "train_samples_per_second": 92.121, "train_steps_per_second": 5.758 } ], "logging_steps": 16000, "max_steps": 2400000, "num_train_epochs": 3, "save_steps": 32000, "total_flos": 7.571300080769916e+17, "trial_name": null, "trial_params": null }