{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 27270,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.18335166850018336,
      "grad_norm": 3.5391387939453125,
      "learning_rate": 4.908324165749908e-05,
      "loss": 2.4276,
      "num_input_tokens_seen": 1750768,
      "step": 500
    },
    {
      "epoch": 0.3667033370003667,
      "grad_norm": 3.4214749336242676,
      "learning_rate": 4.816648331499817e-05,
      "loss": 2.2532,
      "num_input_tokens_seen": 3485632,
      "step": 1000
    },
    {
      "epoch": 0.5500550055005501,
      "grad_norm": 3.533691167831421,
      "learning_rate": 4.724972497249725e-05,
      "loss": 2.1894,
      "num_input_tokens_seen": 5230688,
      "step": 1500
    },
    {
      "epoch": 0.7334066740007334,
      "grad_norm": 3.7089884281158447,
      "learning_rate": 4.633296662999633e-05,
      "loss": 2.1511,
      "num_input_tokens_seen": 6971344,
      "step": 2000
    },
    {
      "epoch": 0.9167583425009168,
      "grad_norm": 4.088582515716553,
      "learning_rate": 4.541620828749542e-05,
      "loss": 2.1089,
      "num_input_tokens_seen": 8738136,
      "step": 2500
    },
    {
      "epoch": 1.1001100110011002,
      "grad_norm": 4.8249077796936035,
      "learning_rate": 4.449944994499451e-05,
      "loss": 2.0594,
      "num_input_tokens_seen": 10466350,
      "step": 3000
    },
    {
      "epoch": 1.2834616795012834,
      "grad_norm": 3.9551169872283936,
      "learning_rate": 4.358269160249359e-05,
      "loss": 2.0194,
      "num_input_tokens_seen": 12222070,
      "step": 3500
    },
    {
      "epoch": 1.466813348001467,
      "grad_norm": 3.0416815280914307,
      "learning_rate": 4.266593325999267e-05,
      "loss": 2.0019,
      "num_input_tokens_seen": 13976918,
      "step": 4000
    },
    {
      "epoch": 1.6501650165016502,
      "grad_norm": 3.295426607131958,
      "learning_rate": 4.174917491749175e-05,
      "loss": 2.0024,
      "num_input_tokens_seen": 15721702,
      "step": 4500
    },
    {
      "epoch": 1.8335166850018334,
      "grad_norm": 4.8525309562683105,
      "learning_rate": 4.0832416574990836e-05,
      "loss": 1.9935,
      "num_input_tokens_seen": 17458590,
      "step": 5000
    },
    {
      "epoch": 2.0168683535020167,
      "grad_norm": 4.256695747375488,
      "learning_rate": 3.991565823248992e-05,
      "loss": 1.9769,
      "num_input_tokens_seen": 19188010,
      "step": 5500
    },
    {
      "epoch": 2.2002200220022003,
      "grad_norm": 4.129441738128662,
      "learning_rate": 3.8998899889989e-05,
      "loss": 1.9108,
      "num_input_tokens_seen": 20932210,
      "step": 6000
    },
    {
      "epoch": 2.3835716905023836,
      "grad_norm": 2.544461250305176,
      "learning_rate": 3.808214154748808e-05,
      "loss": 1.9047,
      "num_input_tokens_seen": 22658578,
      "step": 6500
    },
    {
      "epoch": 2.566923359002567,
      "grad_norm": 4.752838611602783,
      "learning_rate": 3.716538320498717e-05,
      "loss": 1.9119,
      "num_input_tokens_seen": 24411482,
      "step": 7000
    },
    {
      "epoch": 2.7502750275027505,
      "grad_norm": 4.965038776397705,
      "learning_rate": 3.624862486248625e-05,
      "loss": 1.8986,
      "num_input_tokens_seen": 26157770,
      "step": 7500
    },
    {
      "epoch": 2.933626696002934,
      "grad_norm": 4.416258335113525,
      "learning_rate": 3.5331866519985334e-05,
      "loss": 1.9086,
      "num_input_tokens_seen": 27912394,
      "step": 8000
    },
    {
      "epoch": 3.116978364503117,
      "grad_norm": 3.501598596572876,
      "learning_rate": 3.4415108177484414e-05,
      "loss": 1.868,
      "num_input_tokens_seen": 29671328,
      "step": 8500
    },
    {
      "epoch": 3.3003300330033003,
      "grad_norm": 3.8959696292877197,
      "learning_rate": 3.34983498349835e-05,
      "loss": 1.8465,
      "num_input_tokens_seen": 31405544,
      "step": 9000
    },
    {
      "epoch": 3.4836817015034836,
      "grad_norm": 3.5625758171081543,
      "learning_rate": 3.258159149248258e-05,
      "loss": 1.8463,
      "num_input_tokens_seen": 33146784,
      "step": 9500
    },
    {
      "epoch": 3.667033370003667,
      "grad_norm": 3.303110122680664,
      "learning_rate": 3.166483314998166e-05,
      "loss": 1.8394,
      "num_input_tokens_seen": 34888072,
      "step": 10000
    },
    {
      "epoch": 3.8503850385038505,
      "grad_norm": 3.5172908306121826,
      "learning_rate": 3.074807480748075e-05,
      "loss": 1.8379,
      "num_input_tokens_seen": 36645960,
      "step": 10500
    },
    {
      "epoch": 4.033736707004033,
      "grad_norm": 4.386786460876465,
      "learning_rate": 2.983131646497983e-05,
      "loss": 1.8245,
      "num_input_tokens_seen": 38388631,
      "step": 11000
    },
    {
      "epoch": 4.2170883755042174,
      "grad_norm": 3.2586567401885986,
      "learning_rate": 2.891455812247892e-05,
      "loss": 1.8029,
      "num_input_tokens_seen": 40139079,
      "step": 11500
    },
    {
      "epoch": 4.400440044004401,
      "grad_norm": 3.6384007930755615,
      "learning_rate": 2.7997799779978003e-05,
      "loss": 1.7909,
      "num_input_tokens_seen": 41872751,
      "step": 12000
    },
    {
      "epoch": 4.583791712504584,
      "grad_norm": 4.475183486938477,
      "learning_rate": 2.7081041437477084e-05,
      "loss": 1.791,
      "num_input_tokens_seen": 43618911,
      "step": 12500
    },
    {
      "epoch": 4.767143381004767,
      "grad_norm": 4.72713041305542,
      "learning_rate": 2.6164283094976168e-05,
      "loss": 1.7745,
      "num_input_tokens_seen": 45373143,
      "step": 13000
    },
    {
      "epoch": 4.9504950495049505,
      "grad_norm": 3.3076839447021484,
      "learning_rate": 2.5247524752475248e-05,
      "loss": 1.7968,
      "num_input_tokens_seen": 47112151,
      "step": 13500
    },
    {
      "epoch": 5.133846718005134,
      "grad_norm": 4.046383857727051,
      "learning_rate": 2.4330766409974332e-05,
      "loss": 1.7611,
      "num_input_tokens_seen": 48852751,
      "step": 14000
    },
    {
      "epoch": 5.317198386505317,
      "grad_norm": 3.291144609451294,
      "learning_rate": 2.3414008067473413e-05,
      "loss": 1.7363,
      "num_input_tokens_seen": 50602567,
      "step": 14500
    },
    {
      "epoch": 5.5005500550055,
      "grad_norm": 4.23388671875,
      "learning_rate": 2.24972497249725e-05,
      "loss": 1.7814,
      "num_input_tokens_seen": 52369863,
      "step": 15000
    },
    {
      "epoch": 5.683901723505684,
      "grad_norm": 3.1835505962371826,
      "learning_rate": 2.158049138247158e-05,
      "loss": 1.751,
      "num_input_tokens_seen": 54115983,
      "step": 15500
    },
    {
      "epoch": 5.867253392005868,
      "grad_norm": 3.593493938446045,
      "learning_rate": 2.0663733039970665e-05,
      "loss": 1.7481,
      "num_input_tokens_seen": 55853919,
      "step": 16000
    },
    {
      "epoch": 6.050605060506051,
      "grad_norm": 4.3933258056640625,
      "learning_rate": 1.9746974697469746e-05,
      "loss": 1.7506,
      "num_input_tokens_seen": 57581239,
      "step": 16500
    },
    {
      "epoch": 6.233956729006234,
      "grad_norm": 3.6081910133361816,
      "learning_rate": 1.883021635496883e-05,
      "loss": 1.7294,
      "num_input_tokens_seen": 59313735,
      "step": 17000
    },
    {
      "epoch": 6.417308397506417,
      "grad_norm": 3.7784392833709717,
      "learning_rate": 1.7913458012467914e-05,
      "loss": 1.719,
      "num_input_tokens_seen": 61061911,
      "step": 17500
    },
    {
      "epoch": 6.600660066006601,
      "grad_norm": 3.5482571125030518,
      "learning_rate": 1.6996699669966998e-05,
      "loss": 1.7184,
      "num_input_tokens_seen": 62802279,
      "step": 18000
    },
    {
      "epoch": 6.784011734506784,
      "grad_norm": 3.797348737716675,
      "learning_rate": 1.6079941327466082e-05,
      "loss": 1.7101,
      "num_input_tokens_seen": 64536303,
      "step": 18500
    },
    {
      "epoch": 6.967363403006967,
      "grad_norm": 3.9275312423706055,
      "learning_rate": 1.5163182984965163e-05,
      "loss": 1.7153,
      "num_input_tokens_seen": 66282967,
      "step": 19000
    },
    {
      "epoch": 7.15071507150715,
      "grad_norm": 3.65077805519104,
      "learning_rate": 1.4246424642464248e-05,
      "loss": 1.7181,
      "num_input_tokens_seen": 68030296,
      "step": 19500
    },
    {
      "epoch": 7.334066740007334,
      "grad_norm": 4.696651458740234,
      "learning_rate": 1.3329666299963331e-05,
      "loss": 1.6992,
      "num_input_tokens_seen": 69767824,
      "step": 20000
    },
    {
      "epoch": 7.517418408507518,
      "grad_norm": 5.405508518218994,
      "learning_rate": 1.2412907957462413e-05,
      "loss": 1.6903,
      "num_input_tokens_seen": 71509128,
      "step": 20500
    },
    {
      "epoch": 7.700770077007701,
      "grad_norm": 3.7343809604644775,
      "learning_rate": 1.1496149614961496e-05,
      "loss": 1.7019,
      "num_input_tokens_seen": 73255224,
      "step": 21000
    },
    {
      "epoch": 7.884121745507884,
      "grad_norm": 4.133444786071777,
      "learning_rate": 1.057939127246058e-05,
      "loss": 1.6959,
      "num_input_tokens_seen": 75002496,
      "step": 21500
    },
    {
      "epoch": 8.067473414008067,
      "grad_norm": 4.398416996002197,
      "learning_rate": 9.662632929959662e-06,
      "loss": 1.7018,
      "num_input_tokens_seen": 76756073,
      "step": 22000
    },
    {
      "epoch": 8.250825082508252,
      "grad_norm": 4.565046310424805,
      "learning_rate": 8.745874587458746e-06,
      "loss": 1.6837,
      "num_input_tokens_seen": 78483465,
      "step": 22500
    },
    {
      "epoch": 8.434176751008435,
      "grad_norm": 3.950497627258301,
      "learning_rate": 7.829116244957828e-06,
      "loss": 1.6913,
      "num_input_tokens_seen": 80220865,
      "step": 23000
    },
    {
      "epoch": 8.617528419508618,
      "grad_norm": 3.9700405597686768,
      "learning_rate": 6.912357902456913e-06,
      "loss": 1.6814,
      "num_input_tokens_seen": 81964649,
      "step": 23500
    },
    {
      "epoch": 8.800880088008801,
      "grad_norm": 3.21114444732666,
      "learning_rate": 5.995599559955996e-06,
      "loss": 1.689,
      "num_input_tokens_seen": 83718889,
      "step": 24000
    },
    {
      "epoch": 8.984231756508985,
      "grad_norm": 3.5966849327087402,
      "learning_rate": 5.078841217455079e-06,
      "loss": 1.6734,
      "num_input_tokens_seen": 85471529,
      "step": 24500
    },
    {
      "epoch": 9.167583425009168,
      "grad_norm": 3.4596688747406006,
      "learning_rate": 4.162082874954162e-06,
      "loss": 1.6792,
      "num_input_tokens_seen": 87214771,
      "step": 25000
    },
    {
      "epoch": 9.350935093509351,
      "grad_norm": 3.9838054180145264,
      "learning_rate": 3.2453245324532458e-06,
      "loss": 1.6583,
      "num_input_tokens_seen": 88949475,
      "step": 25500
    },
    {
      "epoch": 9.534286762009534,
      "grad_norm": 3.389430522918701,
      "learning_rate": 2.3285661899523286e-06,
      "loss": 1.6836,
      "num_input_tokens_seen": 90694267,
      "step": 26000
    },
    {
      "epoch": 9.717638430509718,
      "grad_norm": 4.560466289520264,
      "learning_rate": 1.411807847451412e-06,
      "loss": 1.6804,
      "num_input_tokens_seen": 92441267,
      "step": 26500
    },
    {
      "epoch": 9.900990099009901,
      "grad_norm": 4.484193325042725,
      "learning_rate": 4.950495049504951e-07,
      "loss": 1.6876,
      "num_input_tokens_seen": 94186835,
      "step": 27000
    },
    {
      "epoch": 10.0,
      "num_input_tokens_seen": 95128823,
      "step": 27270,
      "total_flos": 3.4538173670639616e+16,
      "train_loss": 1.8293144167322006,
      "train_runtime": 2454.5506,
      "train_samples_per_second": 88.859,
      "train_steps_per_second": 11.11,
      "train_tokens_per_second": 38762.215
    }
  ],
  "logging_steps": 500,
  "max_steps": 27270,
  "num_input_tokens_seen": 95128823,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.4538173670639616e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}