|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 6.153846153846154,
  "eval_steps": 50,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.15384615384615385, "grad_norm": 25.85894203186035, "learning_rate": 3.0769230769230774e-06, "loss": 1.0987, "step": 50 },
    { "epoch": 0.15384615384615385, "eval_loss": 0.7039205431938171, "eval_runtime": 1.7807, "eval_samples_per_second": 64.02, "eval_steps_per_second": 3.369, "step": 50 },
    { "epoch": 0.3076923076923077, "grad_norm": 10.326680183410645, "learning_rate": 6.153846153846155e-06, "loss": 0.6388, "step": 100 },
    { "epoch": 0.3076923076923077, "eval_loss": 0.6586376428604126, "eval_runtime": 1.795, "eval_samples_per_second": 63.511, "eval_steps_per_second": 3.343, "step": 100 },
    { "epoch": 0.46153846153846156, "grad_norm": 2.875251531600952, "learning_rate": 9.230769230769232e-06, "loss": 0.6463, "step": 150 },
    { "epoch": 0.46153846153846156, "eval_loss": 0.6574278473854065, "eval_runtime": 1.7356, "eval_samples_per_second": 65.682, "eval_steps_per_second": 3.457, "step": 150 },
    { "epoch": 0.6153846153846154, "grad_norm": 2.7418408393859863, "learning_rate": 1.230769230769231e-05, "loss": 0.6721, "step": 200 },
    { "epoch": 0.6153846153846154, "eval_loss": 0.6629524827003479, "eval_runtime": 1.7437, "eval_samples_per_second": 65.378, "eval_steps_per_second": 3.441, "step": 200 },
    { "epoch": 0.7692307692307693, "grad_norm": 1.8382577896118164, "learning_rate": 1.5384615384615387e-05, "loss": 0.6288, "step": 250 },
    { "epoch": 0.7692307692307693, "eval_loss": 0.673968493938446, "eval_runtime": 1.8191, "eval_samples_per_second": 62.667, "eval_steps_per_second": 3.298, "step": 250 },
    { "epoch": 0.9230769230769231, "grad_norm": 2.8036742210388184, "learning_rate": 1.8461538461538465e-05, "loss": 0.7276, "step": 300 },
    { "epoch": 0.9230769230769231, "eval_loss": 0.6912825107574463, "eval_runtime": 1.7707, "eval_samples_per_second": 64.38, "eval_steps_per_second": 3.388, "step": 300 },
    { "epoch": 1.0769230769230769, "grad_norm": 1.469930648803711, "learning_rate": 1.9996395276708856e-05, "loss": 0.5365, "step": 350 },
    { "epoch": 1.0769230769230769, "eval_loss": 0.7528238296508789, "eval_runtime": 1.7853, "eval_samples_per_second": 63.856, "eval_steps_per_second": 3.361, "step": 350 },
    { "epoch": 1.2307692307692308, "grad_norm": 2.5635602474212646, "learning_rate": 1.9967573081342103e-05, "loss": 0.4279, "step": 400 },
    { "epoch": 1.2307692307692308, "eval_loss": 0.7624219655990601, "eval_runtime": 1.8133, "eval_samples_per_second": 62.869, "eval_steps_per_second": 3.309, "step": 400 },
    { "epoch": 1.3846153846153846, "grad_norm": 1.6066973209381104, "learning_rate": 1.9910011792459086e-05, "loss": 0.42, "step": 450 },
    { "epoch": 1.3846153846153846, "eval_loss": 0.7675647139549255, "eval_runtime": 1.7503, "eval_samples_per_second": 65.132, "eval_steps_per_second": 3.428, "step": 450 },
    { "epoch": 1.5384615384615383, "grad_norm": 2.782292604446411, "learning_rate": 1.9823877374156647e-05, "loss": 0.4284, "step": 500 },
    { "epoch": 1.5384615384615383, "eval_loss": 0.7663527131080627, "eval_runtime": 1.7731, "eval_samples_per_second": 64.293, "eval_steps_per_second": 3.384, "step": 500 },
    { "epoch": 1.6923076923076923, "grad_norm": 1.985113263130188, "learning_rate": 1.9709418174260523e-05, "loss": 0.4615, "step": 550 },
    { "epoch": 1.6923076923076923, "eval_loss": 0.7588106989860535, "eval_runtime": 1.743, "eval_samples_per_second": 65.406, "eval_steps_per_second": 3.442, "step": 550 },
    { "epoch": 1.8461538461538463, "grad_norm": 3.593240261077881, "learning_rate": 1.9566964208274254e-05, "loss": 0.4493, "step": 600 },
    { "epoch": 1.8461538461538463, "eval_loss": 0.7685819864273071, "eval_runtime": 1.7745, "eval_samples_per_second": 64.242, "eval_steps_per_second": 3.381, "step": 600 },
    { "epoch": 2.0, "grad_norm": 2.46645450592041, "learning_rate": 1.9396926207859085e-05, "loss": 0.4103, "step": 650 },
    { "epoch": 2.0, "eval_loss": 0.7486200928688049, "eval_runtime": 1.7602, "eval_samples_per_second": 64.766, "eval_steps_per_second": 3.409, "step": 650 },
    { "epoch": 2.1538461538461537, "grad_norm": 2.113468647003174, "learning_rate": 1.9199794436588244e-05, "loss": 0.2133, "step": 700 },
    { "epoch": 2.1538461538461537, "eval_loss": 0.8937916159629822, "eval_runtime": 1.7444, "eval_samples_per_second": 65.35, "eval_steps_per_second": 3.439, "step": 700 },
    { "epoch": 2.3076923076923075, "grad_norm": 1.9733953475952148, "learning_rate": 1.8976137276390145e-05, "loss": 0.2181, "step": 750 },
    { "epoch": 2.3076923076923075, "eval_loss": 0.8633579611778259, "eval_runtime": 1.7477, "eval_samples_per_second": 65.228, "eval_steps_per_second": 3.433, "step": 750 },
    { "epoch": 2.4615384615384617, "grad_norm": 1.7347004413604736, "learning_rate": 1.8726599588756144e-05, "loss": 0.2005, "step": 800 },
    { "epoch": 2.4615384615384617, "eval_loss": 0.8777443766593933, "eval_runtime": 1.7381, "eval_samples_per_second": 65.588, "eval_steps_per_second": 3.452, "step": 800 },
    { "epoch": 2.6153846153846154, "grad_norm": 1.5702624320983887, "learning_rate": 1.845190085543795e-05, "loss": 0.2157, "step": 850 },
    { "epoch": 2.6153846153846154, "eval_loss": 0.8784948587417603, "eval_runtime": 1.7673, "eval_samples_per_second": 64.505, "eval_steps_per_second": 3.395, "step": 850 },
    { "epoch": 2.769230769230769, "grad_norm": 2.3403327465057373, "learning_rate": 1.8152833103995443e-05, "loss": 0.2115, "step": 900 },
    { "epoch": 2.769230769230769, "eval_loss": 0.8863385915756226, "eval_runtime": 1.7696, "eval_samples_per_second": 64.421, "eval_steps_per_second": 3.391, "step": 900 },
    { "epoch": 2.9230769230769234, "grad_norm": 1.9724828004837036, "learning_rate": 1.7830258624176224e-05, "loss": 0.2225, "step": 950 },
    { "epoch": 2.9230769230769234, "eval_loss": 0.8796689510345459, "eval_runtime": 1.7943, "eval_samples_per_second": 63.534, "eval_steps_per_second": 3.344, "step": 950 },
    { "epoch": 3.076923076923077, "grad_norm": 1.9262828826904297, "learning_rate": 1.7485107481711014e-05, "loss": 0.1693, "step": 1000 },
    { "epoch": 3.076923076923077, "eval_loss": 0.9404221177101135, "eval_runtime": 1.8296, "eval_samples_per_second": 62.309, "eval_steps_per_second": 3.279, "step": 1000 },
    { "epoch": 3.230769230769231, "grad_norm": 1.1772091388702393, "learning_rate": 1.7118374836693407e-05, "loss": 0.1256, "step": 1050 },
    { "epoch": 3.230769230769231, "eval_loss": 0.9360042810440063, "eval_runtime": 1.8904, "eval_samples_per_second": 60.305, "eval_steps_per_second": 3.174, "step": 1050 },
    { "epoch": 3.3846153846153846, "grad_norm": 1.739654302597046, "learning_rate": 1.67311180742757e-05, "loss": 0.125, "step": 1100 },
    { "epoch": 3.3846153846153846, "eval_loss": 0.9563117623329163, "eval_runtime": 1.7652, "eval_samples_per_second": 64.581, "eval_steps_per_second": 3.399, "step": 1100 },
    { "epoch": 3.5384615384615383, "grad_norm": 1.3015375137329102, "learning_rate": 1.6324453755953772e-05, "loss": 0.1281, "step": 1150 },
    { "epoch": 3.5384615384615383, "eval_loss": 0.9608204960823059, "eval_runtime": 1.7445, "eval_samples_per_second": 65.347, "eval_steps_per_second": 3.439, "step": 1150 },
    { "epoch": 3.6923076923076925, "grad_norm": 2.1856415271759033, "learning_rate": 1.5899554400231233e-05, "loss": 0.1264, "step": 1200 },
    { "epoch": 3.6923076923076925, "eval_loss": 0.9677081108093262, "eval_runtime": 1.7748, "eval_samples_per_second": 64.233, "eval_steps_per_second": 3.381, "step": 1200 },
    { "epoch": 3.8461538461538463, "grad_norm": 1.8629130125045776, "learning_rate": 1.5457645101945046e-05, "loss": 0.1267, "step": 1250 },
    { "epoch": 3.8461538461538463, "eval_loss": 0.9708028435707092, "eval_runtime": 1.7595, "eval_samples_per_second": 64.791, "eval_steps_per_second": 3.41, "step": 1250 },
    { "epoch": 4.0, "grad_norm": 2.2338545322418213, "learning_rate": 1.5000000000000002e-05, "loss": 0.1324, "step": 1300 },
    { "epoch": 4.0, "eval_loss": 0.9690905809402466, "eval_runtime": 1.7742, "eval_samples_per_second": 64.255, "eval_steps_per_second": 3.382, "step": 1300 },
    { "epoch": 4.153846153846154, "grad_norm": 1.040000557899475, "learning_rate": 1.4527938603696376e-05, "loss": 0.0931, "step": 1350 },
    { "epoch": 4.153846153846154, "eval_loss": 0.9888765811920166, "eval_runtime": 1.7454, "eval_samples_per_second": 65.315, "eval_steps_per_second": 3.438, "step": 1350 },
    { "epoch": 4.3076923076923075, "grad_norm": 0.49024778604507446, "learning_rate": 1.404282198824305e-05, "loss": 0.0829, "step": 1400 },
    { "epoch": 4.3076923076923075, "eval_loss": 1.0040687322616577, "eval_runtime": 1.798, "eval_samples_per_second": 63.406, "eval_steps_per_second": 3.337, "step": 1400 },
    { "epoch": 4.461538461538462, "grad_norm": 1.1068203449249268, "learning_rate": 1.3546048870425356e-05, "loss": 0.0908, "step": 1450 },
    { "epoch": 4.461538461538462, "eval_loss": 1.0206270217895508, "eval_runtime": 1.7439, "eval_samples_per_second": 65.37, "eval_steps_per_second": 3.441, "step": 1450 },
    { "epoch": 4.615384615384615, "grad_norm": 1.7601001262664795, "learning_rate": 1.303905157574247e-05, "loss": 0.0845, "step": 1500 },
    { "epoch": 4.615384615384615, "eval_loss": 1.026615023612976, "eval_runtime": 1.7715, "eval_samples_per_second": 64.352, "eval_steps_per_second": 3.387, "step": 1500 },
    { "epoch": 4.769230769230769, "grad_norm": 1.9115608930587769, "learning_rate": 1.2523291908642219e-05, "loss": 0.087, "step": 1550 },
    { "epoch": 4.769230769230769, "eval_loss": 1.03285551071167, "eval_runtime": 1.7503, "eval_samples_per_second": 65.13, "eval_steps_per_second": 3.428, "step": 1550 },
    { "epoch": 4.923076923076923, "grad_norm": 0.8985757231712341, "learning_rate": 1.2000256937760446e-05, "loss": 0.0833, "step": 1600 },
    { "epoch": 4.923076923076923, "eval_loss": 1.0319923162460327, "eval_runtime": 1.8324, "eval_samples_per_second": 62.213, "eval_steps_per_second": 3.274, "step": 1600 },
    { "epoch": 5.076923076923077, "grad_norm": 0.524580717086792, "learning_rate": 1.1471454708317163e-05, "loss": 0.0758, "step": 1650 },
    { "epoch": 5.076923076923077, "eval_loss": 1.0695827007293701, "eval_runtime": 1.8442, "eval_samples_per_second": 61.814, "eval_steps_per_second": 3.253, "step": 1650 },
    { "epoch": 5.230769230769231, "grad_norm": 0.7894753813743591, "learning_rate": 1.0938409894031793e-05, "loss": 0.0645, "step": 1700 },
    { "epoch": 5.230769230769231, "eval_loss": 1.0913734436035156, "eval_runtime": 1.77, "eval_samples_per_second": 64.406, "eval_steps_per_second": 3.39, "step": 1700 },
    { "epoch": 5.384615384615385, "grad_norm": 0.9538184404373169, "learning_rate": 1.0402659401094154e-05, "loss": 0.0586, "step": 1750 },
    { "epoch": 5.384615384615385, "eval_loss": 1.0830014944076538, "eval_runtime": 1.7555, "eval_samples_per_second": 64.939, "eval_steps_per_second": 3.418, "step": 1750 },
    { "epoch": 5.538461538461538, "grad_norm": 0.7752998471260071, "learning_rate": 9.865747936866027e-06, "loss": 0.0611, "step": 1800 },
    { "epoch": 5.538461538461538, "eval_loss": 1.0868384838104248, "eval_runtime": 1.762, "eval_samples_per_second": 64.699, "eval_steps_per_second": 3.405, "step": 1800 },
    { "epoch": 5.6923076923076925, "grad_norm": 1.0308915376663208, "learning_rate": 9.329223556089976e-06, "loss": 0.0675, "step": 1850 },
    { "epoch": 5.6923076923076925, "eval_loss": 1.0818272829055786, "eval_runtime": 1.7338, "eval_samples_per_second": 65.753, "eval_steps_per_second": 3.461, "step": 1850 },
    { "epoch": 5.846153846153846, "grad_norm": 0.47693803906440735, "learning_rate": 8.79463319744677e-06, "loss": 0.0631, "step": 1900 },
    { "epoch": 5.846153846153846, "eval_loss": 1.0780550241470337, "eval_runtime": 1.778, "eval_samples_per_second": 64.118, "eval_steps_per_second": 3.375, "step": 1900 },
    { "epoch": 6.0, "grad_norm": 0.3759777247905731, "learning_rate": 8.263518223330698e-06, "loss": 0.0609, "step": 1950 },
    { "epoch": 6.0, "eval_loss": 1.0708911418914795, "eval_runtime": 1.7659, "eval_samples_per_second": 64.557, "eval_steps_per_second": 3.398, "step": 1950 },
    { "epoch": 6.153846153846154, "grad_norm": 0.34113025665283203, "learning_rate": 7.73740997570278e-06, "loss": 0.0473, "step": 2000 },
    { "epoch": 6.153846153846154, "eval_loss": 1.1129051446914673, "eval_runtime": 1.7485, "eval_samples_per_second": 65.199, "eval_steps_per_second": 3.432, "step": 2000 }
  ],
  "logging_steps": 50,
  "max_steps": 3250,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 1000,
  "total_flos": 1.223215769756631e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
|
|