{ "best_metric": null, "best_model_checkpoint": null, "epoch": 13.88888888888889, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.28, "learning_rate": 1.0000000000000002e-06, "loss": 13.9056, "step": 10 }, { "epoch": 0.56, "learning_rate": 2.0000000000000003e-06, "loss": 14.53, "step": 20 }, { "epoch": 0.83, "learning_rate": 3e-06, "loss": 14.6503, "step": 30 }, { "epoch": 1.11, "learning_rate": 4.000000000000001e-06, "loss": 13.8701, "step": 40 }, { "epoch": 1.39, "learning_rate": 5e-06, "loss": 12.7866, "step": 50 }, { "epoch": 1.67, "learning_rate": 6e-06, "loss": 12.0329, "step": 60 }, { "epoch": 1.94, "learning_rate": 7.000000000000001e-06, "loss": 11.6797, "step": 70 }, { "epoch": 2.22, "learning_rate": 8.000000000000001e-06, "loss": 10.1652, "step": 80 }, { "epoch": 2.5, "learning_rate": 9e-06, "loss": 8.5016, "step": 90 }, { "epoch": 2.78, "learning_rate": 1e-05, "loss": 7.3051, "step": 100 }, { "epoch": 3.06, "learning_rate": 1.1000000000000001e-05, "loss": 6.5266, "step": 110 }, { "epoch": 3.33, "learning_rate": 1.2e-05, "loss": 4.9159, "step": 120 }, { "epoch": 3.61, "learning_rate": 1.3000000000000001e-05, "loss": 4.1691, "step": 130 }, { "epoch": 3.89, "learning_rate": 1.4000000000000001e-05, "loss": 2.9038, "step": 140 }, { "epoch": 4.17, "learning_rate": 1.5e-05, "loss": 2.2194, "step": 150 }, { "epoch": 4.44, "learning_rate": 1.6000000000000003e-05, "loss": 1.97, "step": 160 }, { "epoch": 4.72, "learning_rate": 1.7000000000000003e-05, "loss": 1.4839, "step": 170 }, { "epoch": 5.0, "learning_rate": 1.8e-05, "loss": 1.2807, "step": 180 }, { "epoch": 5.28, "learning_rate": 1.9e-05, "loss": 1.0124, "step": 190 }, { "epoch": 5.56, "learning_rate": 2e-05, "loss": 0.6787, "step": 200 }, { "epoch": 5.83, "learning_rate": 2.1e-05, "loss": 0.5985, "step": 210 }, { "epoch": 6.11, "learning_rate": 2.2000000000000003e-05, "loss": 0.463, "step": 220 }, { "epoch": 6.39, "learning_rate": 2.3000000000000003e-05, "loss": 0.4044, "step": 230 }, { "epoch": 6.67, "learning_rate": 2.4e-05, "loss": 0.3232, "step": 240 }, { "epoch": 6.94, "learning_rate": 2.5e-05, "loss": 0.3017, "step": 250 }, { "epoch": 7.22, "learning_rate": 2.6000000000000002e-05, "loss": 0.2714, "step": 260 }, { "epoch": 7.5, "learning_rate": 2.7000000000000002e-05, "loss": 0.2657, "step": 270 }, { "epoch": 7.78, "learning_rate": 2.8000000000000003e-05, "loss": 0.1915, "step": 280 }, { "epoch": 8.06, "learning_rate": 2.9e-05, "loss": 0.1732, "step": 290 }, { "epoch": 8.33, "learning_rate": 3e-05, "loss": 0.1623, "step": 300 }, { "epoch": 8.61, "learning_rate": 3.1e-05, "loss": 0.1401, "step": 310 }, { "epoch": 8.89, "learning_rate": 3.2000000000000005e-05, "loss": 0.1644, "step": 320 }, { "epoch": 9.17, "learning_rate": 3.3e-05, "loss": 0.1303, "step": 330 }, { "epoch": 9.44, "learning_rate": 3.4000000000000007e-05, "loss": 0.1083, "step": 340 }, { "epoch": 9.72, "learning_rate": 3.5e-05, "loss": 0.1249, "step": 350 }, { "epoch": 10.0, "learning_rate": 3.6e-05, "loss": 0.1037, "step": 360 }, { "epoch": 10.28, "learning_rate": 3.7e-05, "loss": 0.1048, "step": 370 }, { "epoch": 10.56, "learning_rate": 3.8e-05, "loss": 0.0925, "step": 380 }, { "epoch": 10.83, "learning_rate": 3.9000000000000006e-05, "loss": 0.0976, "step": 390 }, { "epoch": 11.11, "learning_rate": 4e-05, "loss": 0.0796, "step": 400 }, { "epoch": 11.39, "learning_rate": 4.1e-05, "loss": 0.0811, "step": 410 }, { "epoch": 11.67, "learning_rate": 4.2e-05, "loss": 0.0789, "step": 420 }, { "epoch": 11.94, "learning_rate": 4.3e-05, "loss": 0.0788, "step": 430 }, { "epoch": 12.22, "learning_rate": 4.4000000000000006e-05, "loss": 0.0674, "step": 440 }, { "epoch": 12.5, "learning_rate": 4.5e-05, "loss": 0.0632, "step": 450 }, { "epoch": 12.78, "learning_rate": 4.600000000000001e-05, "loss": 0.0779, "step": 460 }, { "epoch": 13.06, "learning_rate": 4.7e-05, "loss": 0.0596, "step": 470 }, { "epoch": 13.33, "learning_rate": 4.8e-05, "loss": 0.0568, "step": 480 }, { "epoch": 13.61, "learning_rate": 4.9e-05, "loss": 0.0647, "step": 490 }, { "epoch": 13.89, "learning_rate": 5e-05, "loss": 0.0574, "step": 500 } ], "logging_steps": 10, "max_steps": 1800, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "total_flos": 270683602944000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }