{ "best_metric": 1.3614939451217651, "best_model_checkpoint": "outputs/checkpoint-488", "epoch": 13.992831541218639, "eval_steps": 500, "global_step": 488, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5734767025089605, "grad_norm": 1.2556052207946777, "learning_rate": 4.000000000000001e-06, "loss": 2.9206, "step": 20 }, { "epoch": 0.974910394265233, "eval_loss": 2.7616007328033447, "eval_runtime": 12.2966, "eval_samples_per_second": 30.252, "eval_steps_per_second": 3.822, "step": 34 }, { "epoch": 1.146953405017921, "grad_norm": 1.4213184118270874, "learning_rate": 8.000000000000001e-06, "loss": 2.8502, "step": 40 }, { "epoch": 1.7204301075268817, "grad_norm": 1.9485336542129517, "learning_rate": 1.2e-05, "loss": 2.6654, "step": 60 }, { "epoch": 1.978494623655914, "eval_loss": 2.177459478378296, "eval_runtime": 12.2962, "eval_samples_per_second": 30.253, "eval_steps_per_second": 3.822, "step": 69 }, { "epoch": 2.293906810035842, "grad_norm": 0.9282850623130798, "learning_rate": 1.6000000000000003e-05, "loss": 2.2325, "step": 80 }, { "epoch": 2.867383512544803, "grad_norm": 0.6272071003913879, "learning_rate": 2e-05, "loss": 1.9127, "step": 100 }, { "epoch": 2.982078853046595, "eval_loss": 1.8282170295715332, "eval_runtime": 12.2659, "eval_samples_per_second": 30.328, "eval_steps_per_second": 3.832, "step": 104 }, { "epoch": 3.4408602150537635, "grad_norm": 0.6469711065292358, "learning_rate": 1.9882804237803487e-05, "loss": 1.833, "step": 120 }, { "epoch": 3.985663082437276, "eval_loss": 1.7144227027893066, "eval_runtime": 12.2541, "eval_samples_per_second": 30.357, "eval_steps_per_second": 3.835, "step": 139 }, { "epoch": 4.014336917562724, "grad_norm": 0.7177844643592834, "learning_rate": 1.9533963920549307e-05, "loss": 1.7656, "step": 140 }, { "epoch": 4.587813620071684, "grad_norm": 0.7868255972862244, "learning_rate": 1.8961655569610557e-05, "loss": 1.7057, "step": 160 }, { "epoch": 4.989247311827957, "eval_loss": 1.6268185377120972, "eval_runtime": 12.2554, "eval_samples_per_second": 30.354, "eval_steps_per_second": 3.835, "step": 174 }, { "epoch": 5.161290322580645, "grad_norm": 0.7630313038825989, "learning_rate": 1.8179293607667177e-05, "loss": 1.6298, "step": 180 }, { "epoch": 5.734767025089606, "grad_norm": 0.8428413271903992, "learning_rate": 1.720521593600787e-05, "loss": 1.5832, "step": 200 }, { "epoch": 5.992831541218638, "eval_loss": 1.5533965826034546, "eval_runtime": 12.2514, "eval_samples_per_second": 30.364, "eval_steps_per_second": 3.836, "step": 209 }, { "epoch": 6.308243727598566, "grad_norm": 0.9982613921165466, "learning_rate": 1.6062254109666383e-05, "loss": 1.5144, "step": 220 }, { "epoch": 6.881720430107527, "grad_norm": 0.988570511341095, "learning_rate": 1.477719818512263e-05, "loss": 1.4884, "step": 240 }, { "epoch": 6.996415770609319, "eval_loss": 1.4935128688812256, "eval_runtime": 12.2503, "eval_samples_per_second": 30.367, "eval_steps_per_second": 3.837, "step": 244 }, { "epoch": 7.455197132616488, "grad_norm": 0.9964269399642944, "learning_rate": 1.3380168784085028e-05, "loss": 1.4513, "step": 260 }, { "epoch": 8.0, "eval_loss": 1.4478424787521362, "eval_runtime": 12.2711, "eval_samples_per_second": 30.315, "eval_steps_per_second": 3.83, "step": 279 }, { "epoch": 8.028673835125447, "grad_norm": 1.040726900100708, "learning_rate": 1.1903911091646684e-05, "loss": 1.3805, "step": 280 }, { "epoch": 8.602150537634408, "grad_norm": 1.3048664331436157, "learning_rate": 1.0383027336900356e-05, "loss": 1.3677, "step": 300 }, { "epoch": 8.974910394265233, "eval_loss": 1.4132319688796997, "eval_runtime": 12.2651, "eval_samples_per_second": 30.33, "eval_steps_per_second": 3.832, "step": 313 }, { "epoch": 9.175627240143369, "grad_norm": 1.1397103071212769, "learning_rate": 8.853165746015997e-06, "loss": 1.3506, "step": 320 }, { "epoch": 9.74910394265233, "grad_norm": 1.4186557531356812, "learning_rate": 7.350184978033386e-06, "loss": 1.3173, "step": 340 }, { "epoch": 9.978494623655914, "eval_loss": 1.3905054330825806, "eval_runtime": 12.26, "eval_samples_per_second": 30.343, "eval_steps_per_second": 3.834, "step": 348 }, { "epoch": 10.32258064516129, "grad_norm": 1.2632255554199219, "learning_rate": 5.9093136282866014e-06, "loss": 1.3071, "step": 360 }, { "epoch": 10.89605734767025, "grad_norm": 1.3758857250213623, "learning_rate": 4.56432449998779e-06, "loss": 1.2863, "step": 380 }, { "epoch": 10.982078853046595, "eval_loss": 1.375404953956604, "eval_runtime": 12.2609, "eval_samples_per_second": 30.34, "eval_steps_per_second": 3.833, "step": 383 }, { "epoch": 11.469534050179211, "grad_norm": 1.365923285484314, "learning_rate": 3.3467429983443477e-06, "loss": 1.2785, "step": 400 }, { "epoch": 11.985663082437275, "eval_loss": 1.366598129272461, "eval_runtime": 12.2562, "eval_samples_per_second": 30.352, "eval_steps_per_second": 3.835, "step": 418 }, { "epoch": 12.043010752688172, "grad_norm": 1.328456163406372, "learning_rate": 2.2851082017805704e-06, "loss": 1.2641, "step": 420 }, { "epoch": 12.616487455197133, "grad_norm": 1.3476920127868652, "learning_rate": 1.4043039301279904e-06, "loss": 1.261, "step": 440 }, { "epoch": 12.989247311827956, "eval_loss": 1.3626736402511597, "eval_runtime": 12.2637, "eval_samples_per_second": 30.333, "eval_steps_per_second": 3.832, "step": 453 }, { "epoch": 13.189964157706093, "grad_norm": 1.4261298179626465, "learning_rate": 7.249754889790539e-07, "loss": 1.2505, "step": 460 }, { "epoch": 13.763440860215054, "grad_norm": 1.580108642578125, "learning_rate": 2.6304576122221035e-07, "loss": 1.2546, "step": 480 }, { "epoch": 13.992831541218639, "eval_loss": 1.3614939451217651, "eval_runtime": 12.2686, "eval_samples_per_second": 30.321, "eval_steps_per_second": 3.831, "step": 488 } ], "logging_steps": 20, "max_steps": 510, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 500, "total_flos": 2.5689245217644544e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }