|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 50, |
|
"global_step": 2000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 7.943530082702637, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 1.8114, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 1.1615628004074097, |
|
"eval_runtime": 3.153, |
|
"eval_samples_per_second": 49.476, |
|
"eval_steps_per_second": 2.537, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 5.800487518310547, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.9446, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"eval_loss": 0.8041068315505981, |
|
"eval_runtime": 3.0734, |
|
"eval_samples_per_second": 50.759, |
|
"eval_steps_per_second": 2.603, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 3.3142590522766113, |
|
"learning_rate": 3e-06, |
|
"loss": 0.829, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"eval_loss": 0.7686946392059326, |
|
"eval_runtime": 3.0993, |
|
"eval_samples_per_second": 50.334, |
|
"eval_steps_per_second": 2.581, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.635634422302246, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.8045, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 0.754519522190094, |
|
"eval_runtime": 3.0961, |
|
"eval_samples_per_second": 50.386, |
|
"eval_steps_per_second": 2.584, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 3.970125913619995, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7957, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 0.7530429363250732, |
|
"eval_runtime": 3.0913, |
|
"eval_samples_per_second": 50.465, |
|
"eval_steps_per_second": 2.588, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 3.469130516052246, |
|
"learning_rate": 6e-06, |
|
"loss": 0.7833, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"eval_loss": 0.7474463582038879, |
|
"eval_runtime": 3.0973, |
|
"eval_samples_per_second": 50.366, |
|
"eval_steps_per_second": 2.583, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 2.752779722213745, |
|
"learning_rate": 7e-06, |
|
"loss": 0.8005, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"eval_loss": 0.7482121586799622, |
|
"eval_runtime": 3.0919, |
|
"eval_samples_per_second": 50.454, |
|
"eval_steps_per_second": 2.587, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 2.9983832836151123, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.771, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 0.7438180446624756, |
|
"eval_runtime": 3.0859, |
|
"eval_samples_per_second": 50.553, |
|
"eval_steps_per_second": 2.592, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 2.574100971221924, |
|
"learning_rate": 9e-06, |
|
"loss": 0.7449, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"eval_loss": 0.7498941421508789, |
|
"eval_runtime": 3.0388, |
|
"eval_samples_per_second": 51.336, |
|
"eval_steps_per_second": 2.633, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 2.4326021671295166, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7533, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 0.7558138966560364, |
|
"eval_runtime": 3.0822, |
|
"eval_samples_per_second": 50.614, |
|
"eval_steps_per_second": 2.596, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 2.791947364807129, |
|
"learning_rate": 1.1000000000000001e-05, |
|
"loss": 0.8062, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"eval_loss": 0.7553198337554932, |
|
"eval_runtime": 3.0823, |
|
"eval_samples_per_second": 50.611, |
|
"eval_steps_per_second": 2.595, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 2.409931182861328, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.8354, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_loss": 0.7591729760169983, |
|
"eval_runtime": 3.0925, |
|
"eval_samples_per_second": 50.444, |
|
"eval_steps_per_second": 2.587, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 2.654364585876465, |
|
"learning_rate": 1.3000000000000001e-05, |
|
"loss": 0.8084, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"eval_loss": 0.7605798244476318, |
|
"eval_runtime": 3.0938, |
|
"eval_samples_per_second": 50.423, |
|
"eval_steps_per_second": 2.586, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 2.8715052604675293, |
|
"learning_rate": 1.4e-05, |
|
"loss": 0.7464, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"eval_loss": 0.768295168876648, |
|
"eval_runtime": 3.0904, |
|
"eval_samples_per_second": 50.479, |
|
"eval_steps_per_second": 2.589, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 2.547102451324463, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 0.7652, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 0.7770843505859375, |
|
"eval_runtime": 3.0851, |
|
"eval_samples_per_second": 50.566, |
|
"eval_steps_per_second": 2.593, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 2.6396355628967285, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.7807, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 0.7774565815925598, |
|
"eval_runtime": 3.0842, |
|
"eval_samples_per_second": 50.581, |
|
"eval_steps_per_second": 2.594, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 2.243042230606079, |
|
"learning_rate": 1.7e-05, |
|
"loss": 0.8038, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"eval_loss": 0.778019905090332, |
|
"eval_runtime": 3.0739, |
|
"eval_samples_per_second": 50.75, |
|
"eval_steps_per_second": 2.603, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 2.5405020713806152, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.8292, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"eval_loss": 0.7810414433479309, |
|
"eval_runtime": 3.0805, |
|
"eval_samples_per_second": 50.641, |
|
"eval_steps_per_second": 2.597, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.9373252391815186, |
|
"learning_rate": 1.9e-05, |
|
"loss": 0.8057, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"eval_loss": 0.7795991897583008, |
|
"eval_runtime": 3.0813, |
|
"eval_samples_per_second": 50.627, |
|
"eval_steps_per_second": 2.596, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.0142874717712402, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8265, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.7865928411483765, |
|
"eval_runtime": 3.0922, |
|
"eval_samples_per_second": 50.449, |
|
"eval_steps_per_second": 2.587, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 2.4737377166748047, |
|
"learning_rate": 1.9998476951563914e-05, |
|
"loss": 0.567, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"eval_loss": 0.8242304921150208, |
|
"eval_runtime": 3.0914, |
|
"eval_samples_per_second": 50.462, |
|
"eval_steps_per_second": 2.588, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 1.9739576578140259, |
|
"learning_rate": 1.999390827019096e-05, |
|
"loss": 0.5251, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"eval_loss": 0.8305041193962097, |
|
"eval_runtime": 3.0703, |
|
"eval_samples_per_second": 50.81, |
|
"eval_steps_per_second": 2.606, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 3.079801321029663, |
|
"learning_rate": 1.9986295347545738e-05, |
|
"loss": 0.5836, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"eval_loss": 0.8230717778205872, |
|
"eval_runtime": 3.0905, |
|
"eval_samples_per_second": 50.478, |
|
"eval_steps_per_second": 2.589, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 2.380758047103882, |
|
"learning_rate": 1.9975640502598243e-05, |
|
"loss": 0.5404, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"eval_loss": 0.8192301392555237, |
|
"eval_runtime": 3.0886, |
|
"eval_samples_per_second": 50.508, |
|
"eval_steps_per_second": 2.59, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 2.047494649887085, |
|
"learning_rate": 1.9961946980917457e-05, |
|
"loss": 0.5448, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_loss": 0.8297872543334961, |
|
"eval_runtime": 3.0696, |
|
"eval_samples_per_second": 50.821, |
|
"eval_steps_per_second": 2.606, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 2.4927003383636475, |
|
"learning_rate": 1.9945218953682736e-05, |
|
"loss": 0.5505, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"eval_loss": 0.8312389850616455, |
|
"eval_runtime": 3.0792, |
|
"eval_samples_per_second": 50.662, |
|
"eval_steps_per_second": 2.598, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 3.409341812133789, |
|
"learning_rate": 1.9925461516413224e-05, |
|
"loss": 0.5726, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"eval_loss": 0.8322041630744934, |
|
"eval_runtime": 3.0883, |
|
"eval_samples_per_second": 50.514, |
|
"eval_steps_per_second": 2.59, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 1.5658016204833984, |
|
"learning_rate": 1.9902680687415704e-05, |
|
"loss": 0.5687, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"eval_loss": 0.8255723118782043, |
|
"eval_runtime": 3.0902, |
|
"eval_samples_per_second": 50.481, |
|
"eval_steps_per_second": 2.589, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 2.2273943424224854, |
|
"learning_rate": 1.9876883405951378e-05, |
|
"loss": 0.5768, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"eval_loss": 0.8274821639060974, |
|
"eval_runtime": 3.0903, |
|
"eval_samples_per_second": 50.48, |
|
"eval_steps_per_second": 2.589, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 1.8873156309127808, |
|
"learning_rate": 1.9848077530122083e-05, |
|
"loss": 0.5437, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"eval_loss": 0.8350476026535034, |
|
"eval_runtime": 3.0892, |
|
"eval_samples_per_second": 50.498, |
|
"eval_steps_per_second": 2.59, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 1.8373135328292847, |
|
"learning_rate": 1.9816271834476642e-05, |
|
"loss": 0.569, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"eval_loss": 0.8110507726669312, |
|
"eval_runtime": 3.096, |
|
"eval_samples_per_second": 50.388, |
|
"eval_steps_per_second": 2.584, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 2.555246591567993, |
|
"learning_rate": 1.9781476007338058e-05, |
|
"loss": 0.5672, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"eval_loss": 0.8229615688323975, |
|
"eval_runtime": 3.1033, |
|
"eval_samples_per_second": 50.269, |
|
"eval_steps_per_second": 2.578, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 2.032984972000122, |
|
"learning_rate": 1.9743700647852356e-05, |
|
"loss": 0.5709, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"eval_loss": 0.8179122805595398, |
|
"eval_runtime": 3.0703, |
|
"eval_samples_per_second": 50.81, |
|
"eval_steps_per_second": 2.606, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 2.0338752269744873, |
|
"learning_rate": 1.9702957262759964e-05, |
|
"loss": 0.5945, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"eval_loss": 0.8166740536689758, |
|
"eval_runtime": 3.0576, |
|
"eval_samples_per_second": 51.021, |
|
"eval_steps_per_second": 2.616, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 2.7427613735198975, |
|
"learning_rate": 1.9659258262890683e-05, |
|
"loss": 0.5756, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"eval_loss": 0.8103773593902588, |
|
"eval_runtime": 3.0935, |
|
"eval_samples_per_second": 50.429, |
|
"eval_steps_per_second": 2.586, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 2.876005172729492, |
|
"learning_rate": 1.961261695938319e-05, |
|
"loss": 0.5701, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"eval_loss": 0.8226655125617981, |
|
"eval_runtime": 3.099, |
|
"eval_samples_per_second": 50.338, |
|
"eval_steps_per_second": 2.581, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 2.338442087173462, |
|
"learning_rate": 1.9563047559630356e-05, |
|
"loss": 0.5765, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"eval_loss": 0.8165319561958313, |
|
"eval_runtime": 3.0865, |
|
"eval_samples_per_second": 50.543, |
|
"eval_steps_per_second": 2.592, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 1.9952592849731445, |
|
"learning_rate": 1.9510565162951538e-05, |
|
"loss": 0.6023, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"eval_loss": 0.8059903979301453, |
|
"eval_runtime": 3.057, |
|
"eval_samples_per_second": 51.03, |
|
"eval_steps_per_second": 2.617, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 2.47471284866333, |
|
"learning_rate": 1.945518575599317e-05, |
|
"loss": 0.5544, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"eval_loss": 0.8095174431800842, |
|
"eval_runtime": 3.1028, |
|
"eval_samples_per_second": 50.277, |
|
"eval_steps_per_second": 2.578, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 2.391505002975464, |
|
"learning_rate": 1.9396926207859085e-05, |
|
"loss": 0.5932, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.8160108923912048, |
|
"eval_runtime": 3.0868, |
|
"eval_samples_per_second": 50.537, |
|
"eval_steps_per_second": 2.592, |
|
"step": 2000 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 10000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 400, |
|
"total_flos": 1.140965468316631e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|