{ "best_metric": null, "best_model_checkpoint": null, "epoch": 41.666666666666664, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.28, "learning_rate": 1.0000000000000002e-06, "loss": 13.9056, "step": 10 }, { "epoch": 0.56, "learning_rate": 2.0000000000000003e-06, "loss": 14.53, "step": 20 }, { "epoch": 0.83, "learning_rate": 3e-06, "loss": 14.6503, "step": 30 }, { "epoch": 1.11, "learning_rate": 4.000000000000001e-06, "loss": 13.8701, "step": 40 }, { "epoch": 1.39, "learning_rate": 5e-06, "loss": 12.7866, "step": 50 }, { "epoch": 1.67, "learning_rate": 6e-06, "loss": 12.0329, "step": 60 }, { "epoch": 1.94, "learning_rate": 7.000000000000001e-06, "loss": 11.6797, "step": 70 }, { "epoch": 2.22, "learning_rate": 8.000000000000001e-06, "loss": 10.1652, "step": 80 }, { "epoch": 2.5, "learning_rate": 9e-06, "loss": 8.5016, "step": 90 }, { "epoch": 2.78, "learning_rate": 1e-05, "loss": 7.3051, "step": 100 }, { "epoch": 3.06, "learning_rate": 1.1000000000000001e-05, "loss": 6.5266, "step": 110 }, { "epoch": 3.33, "learning_rate": 1.2e-05, "loss": 4.9159, "step": 120 }, { "epoch": 3.61, "learning_rate": 1.3000000000000001e-05, "loss": 4.1691, "step": 130 }, { "epoch": 3.89, "learning_rate": 1.4000000000000001e-05, "loss": 2.9038, "step": 140 }, { "epoch": 4.17, "learning_rate": 1.5e-05, "loss": 2.2194, "step": 150 }, { "epoch": 4.44, "learning_rate": 1.6000000000000003e-05, "loss": 1.97, "step": 160 }, { "epoch": 4.72, "learning_rate": 1.7000000000000003e-05, "loss": 1.4839, "step": 170 }, { "epoch": 5.0, "learning_rate": 1.8e-05, "loss": 1.2807, "step": 180 }, { "epoch": 5.28, "learning_rate": 1.9e-05, "loss": 1.0124, "step": 190 }, { "epoch": 5.56, "learning_rate": 2e-05, "loss": 0.6787, "step": 200 }, { "epoch": 5.83, "learning_rate": 2.1e-05, "loss": 0.5985, "step": 210 }, { "epoch": 6.11, "learning_rate": 2.2000000000000003e-05, "loss": 0.463, "step": 220 }, { "epoch": 6.39, "learning_rate": 2.3000000000000003e-05, "loss": 0.4044, "step": 230 }, { "epoch": 6.67, "learning_rate": 2.4e-05, "loss": 0.3232, "step": 240 }, { "epoch": 6.94, "learning_rate": 2.5e-05, "loss": 0.3017, "step": 250 }, { "epoch": 7.22, "learning_rate": 2.6000000000000002e-05, "loss": 0.2714, "step": 260 }, { "epoch": 7.5, "learning_rate": 2.7000000000000002e-05, "loss": 0.2657, "step": 270 }, { "epoch": 7.78, "learning_rate": 2.8000000000000003e-05, "loss": 0.1915, "step": 280 }, { "epoch": 8.06, "learning_rate": 2.9e-05, "loss": 0.1732, "step": 290 }, { "epoch": 8.33, "learning_rate": 3e-05, "loss": 0.1623, "step": 300 }, { "epoch": 8.61, "learning_rate": 3.1e-05, "loss": 0.1401, "step": 310 }, { "epoch": 8.89, "learning_rate": 3.2000000000000005e-05, "loss": 0.1644, "step": 320 }, { "epoch": 9.17, "learning_rate": 3.3e-05, "loss": 0.1303, "step": 330 }, { "epoch": 9.44, "learning_rate": 3.4000000000000007e-05, "loss": 0.1083, "step": 340 }, { "epoch": 9.72, "learning_rate": 3.5e-05, "loss": 0.1249, "step": 350 }, { "epoch": 10.0, "learning_rate": 3.6e-05, "loss": 0.1037, "step": 360 }, { "epoch": 10.28, "learning_rate": 3.7e-05, "loss": 0.1048, "step": 370 }, { "epoch": 10.56, "learning_rate": 3.8e-05, "loss": 0.0925, "step": 380 }, { "epoch": 10.83, "learning_rate": 3.9000000000000006e-05, "loss": 0.0976, "step": 390 }, { "epoch": 11.11, "learning_rate": 4e-05, "loss": 0.0796, "step": 400 }, { "epoch": 11.39, "learning_rate": 4.1e-05, "loss": 0.0811, "step": 410 }, { "epoch": 11.67, "learning_rate": 4.2e-05, "loss": 0.0789, "step": 420 }, { "epoch": 11.94, "learning_rate": 4.3e-05, "loss": 0.0788, "step": 430 }, { "epoch": 12.22, "learning_rate": 4.4000000000000006e-05, "loss": 0.0674, "step": 440 }, { "epoch": 12.5, "learning_rate": 4.5e-05, "loss": 0.0632, "step": 450 }, { "epoch": 12.78, "learning_rate": 4.600000000000001e-05, "loss": 0.0779, "step": 460 }, { "epoch": 13.06, "learning_rate": 4.7e-05, "loss": 0.0596, "step": 470 }, { "epoch": 13.33, "learning_rate": 4.8e-05, "loss": 0.0568, "step": 480 }, { "epoch": 13.61, "learning_rate": 4.9e-05, "loss": 0.0647, "step": 490 }, { "epoch": 13.89, "learning_rate": 5e-05, "loss": 0.0574, "step": 500 }, { "epoch": 14.17, "learning_rate": 4.961538461538462e-05, "loss": 0.0584, "step": 510 }, { "epoch": 14.44, "learning_rate": 4.923076923076924e-05, "loss": 0.0468, "step": 520 }, { "epoch": 14.72, "learning_rate": 4.884615384615385e-05, "loss": 0.055, "step": 530 }, { "epoch": 15.0, "learning_rate": 4.846153846153846e-05, "loss": 0.05, "step": 540 }, { "epoch": 15.28, "learning_rate": 4.8076923076923084e-05, "loss": 0.0502, "step": 550 }, { "epoch": 15.56, "learning_rate": 4.76923076923077e-05, "loss": 0.048, "step": 560 }, { "epoch": 15.83, "learning_rate": 4.730769230769231e-05, "loss": 0.0465, "step": 570 }, { "epoch": 16.11, "learning_rate": 4.692307692307693e-05, "loss": 0.0477, "step": 580 }, { "epoch": 16.39, "learning_rate": 4.653846153846154e-05, "loss": 0.0451, "step": 590 }, { "epoch": 16.67, "learning_rate": 4.615384615384616e-05, "loss": 0.0345, "step": 600 }, { "epoch": 16.94, "learning_rate": 4.576923076923077e-05, "loss": 0.0443, "step": 610 }, { "epoch": 17.22, "learning_rate": 4.538461538461539e-05, "loss": 0.034, "step": 620 }, { "epoch": 17.5, "learning_rate": 4.5e-05, "loss": 0.0422, "step": 630 }, { "epoch": 17.78, "learning_rate": 4.461538461538462e-05, "loss": 0.0385, "step": 640 }, { "epoch": 18.06, "learning_rate": 4.423076923076923e-05, "loss": 0.0339, "step": 650 }, { "epoch": 18.33, "learning_rate": 4.384615384615385e-05, "loss": 0.0322, "step": 660 }, { "epoch": 18.61, "learning_rate": 4.346153846153846e-05, "loss": 0.0319, "step": 670 }, { "epoch": 18.89, "learning_rate": 4.3076923076923084e-05, "loss": 0.0358, "step": 680 }, { "epoch": 19.17, "learning_rate": 4.269230769230769e-05, "loss": 0.0365, "step": 690 }, { "epoch": 19.44, "learning_rate": 4.230769230769231e-05, "loss": 0.0349, "step": 700 }, { "epoch": 19.72, "learning_rate": 4.192307692307693e-05, "loss": 0.034, "step": 710 }, { "epoch": 20.0, "learning_rate": 4.1538461538461544e-05, "loss": 0.031, "step": 720 }, { "epoch": 20.28, "learning_rate": 4.115384615384615e-05, "loss": 0.0394, "step": 730 }, { "epoch": 20.56, "learning_rate": 4.0769230769230773e-05, "loss": 0.0266, "step": 740 }, { "epoch": 20.83, "learning_rate": 4.038461538461539e-05, "loss": 0.0274, "step": 750 }, { "epoch": 21.11, "learning_rate": 4e-05, "loss": 0.0227, "step": 760 }, { "epoch": 21.39, "learning_rate": 3.961538461538462e-05, "loss": 0.0299, "step": 770 }, { "epoch": 21.67, "learning_rate": 3.923076923076923e-05, "loss": 0.0262, "step": 780 }, { "epoch": 21.94, "learning_rate": 3.884615384615385e-05, "loss": 0.0218, "step": 790 }, { "epoch": 22.22, "learning_rate": 3.846153846153846e-05, "loss": 0.0253, "step": 800 }, { "epoch": 22.5, "learning_rate": 3.807692307692308e-05, "loss": 0.0336, "step": 810 }, { "epoch": 22.78, "learning_rate": 3.769230769230769e-05, "loss": 0.0288, "step": 820 }, { "epoch": 23.06, "learning_rate": 3.730769230769231e-05, "loss": 0.0233, "step": 830 }, { "epoch": 23.33, "learning_rate": 3.692307692307693e-05, "loss": 0.0276, "step": 840 }, { "epoch": 23.61, "learning_rate": 3.653846153846154e-05, "loss": 0.0184, "step": 850 }, { "epoch": 23.89, "learning_rate": 3.615384615384615e-05, "loss": 0.029, "step": 860 }, { "epoch": 24.17, "learning_rate": 3.5769230769230774e-05, "loss": 0.0197, "step": 870 }, { "epoch": 24.44, "learning_rate": 3.538461538461539e-05, "loss": 0.0224, "step": 880 }, { "epoch": 24.72, "learning_rate": 3.5e-05, "loss": 0.0248, "step": 890 }, { "epoch": 25.0, "learning_rate": 3.461538461538462e-05, "loss": 0.0286, "step": 900 }, { "epoch": 25.28, "learning_rate": 3.4230769230769234e-05, "loss": 0.0241, "step": 910 }, { "epoch": 25.56, "learning_rate": 3.384615384615385e-05, "loss": 0.0217, "step": 920 }, { "epoch": 25.83, "learning_rate": 3.346153846153846e-05, "loss": 0.021, "step": 930 }, { "epoch": 26.11, "learning_rate": 3.307692307692308e-05, "loss": 0.0229, "step": 940 }, { "epoch": 26.39, "learning_rate": 3.269230769230769e-05, "loss": 0.0246, "step": 950 }, { "epoch": 26.67, "learning_rate": 3.230769230769231e-05, "loss": 0.0238, "step": 960 }, { "epoch": 26.94, "learning_rate": 3.192307692307692e-05, "loss": 0.0216, "step": 970 }, { "epoch": 27.22, "learning_rate": 3.153846153846154e-05, "loss": 0.0257, "step": 980 }, { "epoch": 27.5, "learning_rate": 3.115384615384615e-05, "loss": 0.0193, "step": 990 }, { "epoch": 27.78, "learning_rate": 3.0769230769230774e-05, "loss": 0.017, "step": 1000 }, { "epoch": 28.06, "learning_rate": 3.0384615384615382e-05, "loss": 0.0176, "step": 1010 }, { "epoch": 28.33, "learning_rate": 3e-05, "loss": 0.0227, "step": 1020 }, { "epoch": 28.61, "learning_rate": 2.9615384615384616e-05, "loss": 0.0185, "step": 1030 }, { "epoch": 28.89, "learning_rate": 2.9230769230769234e-05, "loss": 0.0206, "step": 1040 }, { "epoch": 29.17, "learning_rate": 2.8846153846153845e-05, "loss": 0.0194, "step": 1050 }, { "epoch": 29.44, "learning_rate": 2.846153846153846e-05, "loss": 0.0229, "step": 1060 }, { "epoch": 29.72, "learning_rate": 2.807692307692308e-05, "loss": 0.0186, "step": 1070 }, { "epoch": 30.0, "learning_rate": 2.7692307692307694e-05, "loss": 0.015, "step": 1080 }, { "epoch": 30.28, "learning_rate": 2.7307692307692305e-05, "loss": 0.0198, "step": 1090 }, { "epoch": 30.56, "learning_rate": 2.6923076923076923e-05, "loss": 0.0162, "step": 1100 }, { "epoch": 30.83, "learning_rate": 2.6538461538461538e-05, "loss": 0.0163, "step": 1110 }, { "epoch": 31.11, "learning_rate": 2.6153846153846157e-05, "loss": 0.0257, "step": 1120 }, { "epoch": 31.39, "learning_rate": 2.5769230769230768e-05, "loss": 0.017, "step": 1130 }, { "epoch": 31.67, "learning_rate": 2.5384615384615383e-05, "loss": 0.0131, "step": 1140 }, { "epoch": 31.94, "learning_rate": 2.5e-05, "loss": 0.0224, "step": 1150 }, { "epoch": 32.22, "learning_rate": 2.461538461538462e-05, "loss": 0.0218, "step": 1160 }, { "epoch": 32.5, "learning_rate": 2.423076923076923e-05, "loss": 0.014, "step": 1170 }, { "epoch": 32.78, "learning_rate": 2.384615384615385e-05, "loss": 0.0165, "step": 1180 }, { "epoch": 33.06, "learning_rate": 2.3461538461538464e-05, "loss": 0.017, "step": 1190 }, { "epoch": 33.33, "learning_rate": 2.307692307692308e-05, "loss": 0.0177, "step": 1200 }, { "epoch": 33.61, "learning_rate": 2.2692307692307694e-05, "loss": 0.015, "step": 1210 }, { "epoch": 33.89, "learning_rate": 2.230769230769231e-05, "loss": 0.0185, "step": 1220 }, { "epoch": 34.17, "learning_rate": 2.1923076923076924e-05, "loss": 0.0201, "step": 1230 }, { "epoch": 34.44, "learning_rate": 2.1538461538461542e-05, "loss": 0.0134, "step": 1240 }, { "epoch": 34.72, "learning_rate": 2.1153846153846154e-05, "loss": 0.019, "step": 1250 }, { "epoch": 35.0, "learning_rate": 2.0769230769230772e-05, "loss": 0.0156, "step": 1260 }, { "epoch": 35.28, "learning_rate": 2.0384615384615387e-05, "loss": 0.0171, "step": 1270 }, { "epoch": 35.56, "learning_rate": 2e-05, "loss": 0.0114, "step": 1280 }, { "epoch": 35.83, "learning_rate": 1.9615384615384617e-05, "loss": 0.0208, "step": 1290 }, { "epoch": 36.11, "learning_rate": 1.923076923076923e-05, "loss": 0.0145, "step": 1300 }, { "epoch": 36.39, "learning_rate": 1.8846153846153846e-05, "loss": 0.0124, "step": 1310 }, { "epoch": 36.67, "learning_rate": 1.8461538461538465e-05, "loss": 0.0197, "step": 1320 }, { "epoch": 36.94, "learning_rate": 1.8076923076923076e-05, "loss": 0.018, "step": 1330 }, { "epoch": 37.22, "learning_rate": 1.7692307692307694e-05, "loss": 0.0218, "step": 1340 }, { "epoch": 37.5, "learning_rate": 1.730769230769231e-05, "loss": 0.0111, "step": 1350 }, { "epoch": 37.78, "learning_rate": 1.6923076923076924e-05, "loss": 0.0124, "step": 1360 }, { "epoch": 38.06, "learning_rate": 1.653846153846154e-05, "loss": 0.0154, "step": 1370 }, { "epoch": 38.33, "learning_rate": 1.6153846153846154e-05, "loss": 0.0196, "step": 1380 }, { "epoch": 38.61, "learning_rate": 1.576923076923077e-05, "loss": 0.0123, "step": 1390 }, { "epoch": 38.89, "learning_rate": 1.5384615384615387e-05, "loss": 0.0128, "step": 1400 }, { "epoch": 39.17, "learning_rate": 1.5e-05, "loss": 0.0234, "step": 1410 }, { "epoch": 39.44, "learning_rate": 1.4615384615384617e-05, "loss": 0.0123, "step": 1420 }, { "epoch": 39.72, "learning_rate": 1.423076923076923e-05, "loss": 0.0121, "step": 1430 }, { "epoch": 40.0, "learning_rate": 1.3846153846153847e-05, "loss": 0.0167, "step": 1440 }, { "epoch": 40.28, "learning_rate": 1.3461538461538462e-05, "loss": 0.0146, "step": 1450 }, { "epoch": 40.56, "learning_rate": 1.3076923076923078e-05, "loss": 0.0205, "step": 1460 }, { "epoch": 40.83, "learning_rate": 1.2692307692307691e-05, "loss": 0.0125, "step": 1470 }, { "epoch": 41.11, "learning_rate": 1.230769230769231e-05, "loss": 0.0104, "step": 1480 }, { "epoch": 41.39, "learning_rate": 1.1923076923076925e-05, "loss": 0.021, "step": 1490 }, { "epoch": 41.67, "learning_rate": 1.153846153846154e-05, "loss": 0.0121, "step": 1500 } ], "logging_steps": 10, "max_steps": 1800, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "total_flos": 812050808832000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }