{
  "best_metric": 1.361409306526184,
  "best_model_checkpoint": "outputs/checkpoint-510",
  "epoch": 14.623655913978494,
  "eval_steps": 500,
  "global_step": 510,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.5734767025089605,
      "grad_norm": 1.2556052207946777,
      "learning_rate": 4.000000000000001e-06,
      "loss": 2.9206,
      "step": 20
    },
    {
      "epoch": 0.974910394265233,
      "eval_loss": 2.7616007328033447,
      "eval_runtime": 12.2966,
      "eval_samples_per_second": 30.252,
      "eval_steps_per_second": 3.822,
      "step": 34
    },
    {
      "epoch": 1.146953405017921,
      "grad_norm": 1.4213184118270874,
      "learning_rate": 8.000000000000001e-06,
      "loss": 2.8502,
      "step": 40
    },
    {
      "epoch": 1.7204301075268817,
      "grad_norm": 1.9485336542129517,
      "learning_rate": 1.2e-05,
      "loss": 2.6654,
      "step": 60
    },
    {
      "epoch": 1.978494623655914,
      "eval_loss": 2.177459478378296,
      "eval_runtime": 12.2962,
      "eval_samples_per_second": 30.253,
      "eval_steps_per_second": 3.822,
      "step": 69
    },
    {
      "epoch": 2.293906810035842,
      "grad_norm": 0.9282850623130798,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 2.2325,
      "step": 80
    },
    {
      "epoch": 2.867383512544803,
      "grad_norm": 0.6272071003913879,
      "learning_rate": 2e-05,
      "loss": 1.9127,
      "step": 100
    },
    {
      "epoch": 2.982078853046595,
      "eval_loss": 1.8282170295715332,
      "eval_runtime": 12.2659,
      "eval_samples_per_second": 30.328,
      "eval_steps_per_second": 3.832,
      "step": 104
    },
    {
      "epoch": 3.4408602150537635,
      "grad_norm": 0.6469711065292358,
      "learning_rate": 1.9882804237803487e-05,
      "loss": 1.833,
      "step": 120
    },
    {
      "epoch": 3.985663082437276,
      "eval_loss": 1.7144227027893066,
      "eval_runtime": 12.2541,
      "eval_samples_per_second": 30.357,
      "eval_steps_per_second": 3.835,
      "step": 139
    },
    {
      "epoch": 4.014336917562724,
      "grad_norm": 0.7177844643592834,
      "learning_rate": 1.9533963920549307e-05,
      "loss": 1.7656,
      "step": 140
    },
    {
      "epoch": 4.587813620071684,
      "grad_norm": 0.7868255972862244,
      "learning_rate": 1.8961655569610557e-05,
      "loss": 1.7057,
      "step": 160
    },
    {
      "epoch": 4.989247311827957,
      "eval_loss": 1.6268185377120972,
      "eval_runtime": 12.2554,
      "eval_samples_per_second": 30.354,
      "eval_steps_per_second": 3.835,
      "step": 174
    },
    {
      "epoch": 5.161290322580645,
      "grad_norm": 0.7630313038825989,
      "learning_rate": 1.8179293607667177e-05,
      "loss": 1.6298,
      "step": 180
    },
    {
      "epoch": 5.734767025089606,
      "grad_norm": 0.8428413271903992,
      "learning_rate": 1.720521593600787e-05,
      "loss": 1.5832,
      "step": 200
    },
    {
      "epoch": 5.992831541218638,
      "eval_loss": 1.5533965826034546,
      "eval_runtime": 12.2514,
      "eval_samples_per_second": 30.364,
      "eval_steps_per_second": 3.836,
      "step": 209
    },
    {
      "epoch": 6.308243727598566,
      "grad_norm": 0.9982613921165466,
      "learning_rate": 1.6062254109666383e-05,
      "loss": 1.5144,
      "step": 220
    },
    {
      "epoch": 6.881720430107527,
      "grad_norm": 0.988570511341095,
      "learning_rate": 1.477719818512263e-05,
      "loss": 1.4884,
      "step": 240
    },
    {
      "epoch": 6.996415770609319,
      "eval_loss": 1.4935128688812256,
      "eval_runtime": 12.2503,
      "eval_samples_per_second": 30.367,
      "eval_steps_per_second": 3.837,
      "step": 244
    },
    {
      "epoch": 7.455197132616488,
      "grad_norm": 0.9964269399642944,
      "learning_rate": 1.3380168784085028e-05,
      "loss": 1.4513,
      "step": 260
    },
    {
      "epoch": 8.0,
      "eval_loss": 1.4478424787521362,
      "eval_runtime": 12.2711,
      "eval_samples_per_second": 30.315,
      "eval_steps_per_second": 3.83,
      "step": 279
    },
    {
      "epoch": 8.028673835125447,
      "grad_norm": 1.040726900100708,
      "learning_rate": 1.1903911091646684e-05,
      "loss": 1.3805,
      "step": 280
    },
    {
      "epoch": 8.602150537634408,
      "grad_norm": 1.3048664331436157,
      "learning_rate": 1.0383027336900356e-05,
      "loss": 1.3677,
      "step": 300
    },
    {
      "epoch": 8.974910394265233,
      "eval_loss": 1.4132319688796997,
      "eval_runtime": 12.2651,
      "eval_samples_per_second": 30.33,
      "eval_steps_per_second": 3.832,
      "step": 313
    },
    {
      "epoch": 9.175627240143369,
      "grad_norm": 1.1397103071212769,
      "learning_rate": 8.853165746015997e-06,
      "loss": 1.3506,
      "step": 320
    },
    {
      "epoch": 9.74910394265233,
      "grad_norm": 1.4186557531356812,
      "learning_rate": 7.350184978033386e-06,
      "loss": 1.3173,
      "step": 340
    },
    {
      "epoch": 9.978494623655914,
      "eval_loss": 1.3905054330825806,
      "eval_runtime": 12.26,
      "eval_samples_per_second": 30.343,
      "eval_steps_per_second": 3.834,
      "step": 348
    },
    {
      "epoch": 10.32258064516129,
      "grad_norm": 1.2632255554199219,
      "learning_rate": 5.9093136282866014e-06,
      "loss": 1.3071,
      "step": 360
    },
    {
      "epoch": 10.89605734767025,
      "grad_norm": 1.3758857250213623,
      "learning_rate": 4.56432449998779e-06,
      "loss": 1.2863,
      "step": 380
    },
    {
      "epoch": 10.982078853046595,
      "eval_loss": 1.375404953956604,
      "eval_runtime": 12.2609,
      "eval_samples_per_second": 30.34,
      "eval_steps_per_second": 3.833,
      "step": 383
    },
    {
      "epoch": 11.469534050179211,
      "grad_norm": 1.365923285484314,
      "learning_rate": 3.3467429983443477e-06,
      "loss": 1.2785,
      "step": 400
    },
    {
      "epoch": 11.985663082437275,
      "eval_loss": 1.366598129272461,
      "eval_runtime": 12.2562,
      "eval_samples_per_second": 30.352,
      "eval_steps_per_second": 3.835,
      "step": 418
    },
    {
      "epoch": 12.043010752688172,
      "grad_norm": 1.328456163406372,
      "learning_rate": 2.2851082017805704e-06,
      "loss": 1.2641,
      "step": 420
    },
    {
      "epoch": 12.616487455197133,
      "grad_norm": 1.3476920127868652,
      "learning_rate": 1.4043039301279904e-06,
      "loss": 1.261,
      "step": 440
    },
    {
      "epoch": 12.989247311827956,
      "eval_loss": 1.3626736402511597,
      "eval_runtime": 12.2637,
      "eval_samples_per_second": 30.333,
      "eval_steps_per_second": 3.832,
      "step": 453
    },
    {
      "epoch": 13.189964157706093,
      "grad_norm": 1.4261298179626465,
      "learning_rate": 7.249754889790539e-07,
      "loss": 1.2505,
      "step": 460
    },
    {
      "epoch": 13.763440860215054,
      "grad_norm": 1.580108642578125,
      "learning_rate": 2.6304576122221035e-07,
      "loss": 1.2546,
      "step": 480
    },
    {
      "epoch": 13.992831541218639,
      "eval_loss": 1.3614939451217651,
      "eval_runtime": 12.2686,
      "eval_samples_per_second": 30.321,
      "eval_steps_per_second": 3.831,
      "step": 488
    },
    {
      "epoch": 14.336917562724015,
      "grad_norm": 1.3021036386489868,
      "learning_rate": 2.9341988162595593e-08,
      "loss": 1.27,
      "step": 500
    },
    {
      "epoch": 14.623655913978494,
      "eval_loss": 1.361409306526184,
      "eval_runtime": 12.1836,
      "eval_samples_per_second": 30.533,
      "eval_steps_per_second": 3.858,
      "step": 510
    }
  ],
  "logging_steps": 20,
  "max_steps": 510,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 15,
  "save_steps": 500,
  "total_flos": 2.684450537417933e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}