{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 27270, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.18335166850018336, "grad_norm": 3.5391387939453125, "learning_rate": 4.908324165749908e-05, "loss": 2.4276, "num_input_tokens_seen": 1750768, "step": 500 }, { "epoch": 0.3667033370003667, "grad_norm": 3.4214749336242676, "learning_rate": 4.816648331499817e-05, "loss": 2.2532, "num_input_tokens_seen": 3485632, "step": 1000 }, { "epoch": 0.5500550055005501, "grad_norm": 3.533691167831421, "learning_rate": 4.724972497249725e-05, "loss": 2.1894, "num_input_tokens_seen": 5230688, "step": 1500 }, { "epoch": 0.7334066740007334, "grad_norm": 3.7089884281158447, "learning_rate": 4.633296662999633e-05, "loss": 2.1511, "num_input_tokens_seen": 6971344, "step": 2000 }, { "epoch": 0.9167583425009168, "grad_norm": 4.088582515716553, "learning_rate": 4.541620828749542e-05, "loss": 2.1089, "num_input_tokens_seen": 8738136, "step": 2500 }, { "epoch": 1.1001100110011002, "grad_norm": 4.8249077796936035, "learning_rate": 4.449944994499451e-05, "loss": 2.0594, "num_input_tokens_seen": 10466350, "step": 3000 }, { "epoch": 1.2834616795012834, "grad_norm": 3.9551169872283936, "learning_rate": 4.358269160249359e-05, "loss": 2.0194, "num_input_tokens_seen": 12222070, "step": 3500 }, { "epoch": 1.466813348001467, "grad_norm": 3.0416815280914307, "learning_rate": 4.266593325999267e-05, "loss": 2.0019, "num_input_tokens_seen": 13976918, "step": 4000 }, { "epoch": 1.6501650165016502, "grad_norm": 3.295426607131958, "learning_rate": 4.174917491749175e-05, "loss": 2.0024, "num_input_tokens_seen": 15721702, "step": 4500 }, { "epoch": 1.8335166850018334, "grad_norm": 4.8525309562683105, "learning_rate": 4.0832416574990836e-05, "loss": 1.9935, "num_input_tokens_seen": 17458590, "step": 5000 }, { "epoch": 2.0168683535020167, "grad_norm": 4.256695747375488, "learning_rate": 3.991565823248992e-05, "loss": 1.9769, "num_input_tokens_seen": 19188010, "step": 5500 }, { "epoch": 2.2002200220022003, "grad_norm": 4.129441738128662, "learning_rate": 3.8998899889989e-05, "loss": 1.9108, "num_input_tokens_seen": 20932210, "step": 6000 }, { "epoch": 2.3835716905023836, "grad_norm": 2.544461250305176, "learning_rate": 3.808214154748808e-05, "loss": 1.9047, "num_input_tokens_seen": 22658578, "step": 6500 }, { "epoch": 2.566923359002567, "grad_norm": 4.752838611602783, "learning_rate": 3.716538320498717e-05, "loss": 1.9119, "num_input_tokens_seen": 24411482, "step": 7000 }, { "epoch": 2.7502750275027505, "grad_norm": 4.965038776397705, "learning_rate": 3.624862486248625e-05, "loss": 1.8986, "num_input_tokens_seen": 26157770, "step": 7500 }, { "epoch": 2.933626696002934, "grad_norm": 4.416258335113525, "learning_rate": 3.5331866519985334e-05, "loss": 1.9086, "num_input_tokens_seen": 27912394, "step": 8000 }, { "epoch": 3.116978364503117, "grad_norm": 3.501598596572876, "learning_rate": 3.4415108177484414e-05, "loss": 1.868, "num_input_tokens_seen": 29671328, "step": 8500 }, { "epoch": 3.3003300330033003, "grad_norm": 3.8959696292877197, "learning_rate": 3.34983498349835e-05, "loss": 1.8465, "num_input_tokens_seen": 31405544, "step": 9000 }, { "epoch": 3.4836817015034836, "grad_norm": 3.5625758171081543, "learning_rate": 3.258159149248258e-05, "loss": 1.8463, "num_input_tokens_seen": 33146784, "step": 9500 }, { "epoch": 3.667033370003667, "grad_norm": 3.303110122680664, "learning_rate": 3.166483314998166e-05, "loss": 1.8394, "num_input_tokens_seen": 34888072, "step": 10000 }, { "epoch": 3.8503850385038505, "grad_norm": 3.5172908306121826, "learning_rate": 3.074807480748075e-05, "loss": 1.8379, "num_input_tokens_seen": 36645960, "step": 10500 }, { "epoch": 4.033736707004033, "grad_norm": 4.386786460876465, "learning_rate": 2.983131646497983e-05, "loss": 1.8245, "num_input_tokens_seen": 38388631, "step": 11000 }, { "epoch": 4.2170883755042174, "grad_norm": 3.2586567401885986, "learning_rate": 2.891455812247892e-05, "loss": 1.8029, "num_input_tokens_seen": 40139079, "step": 11500 }, { "epoch": 4.400440044004401, "grad_norm": 3.6384007930755615, "learning_rate": 2.7997799779978003e-05, "loss": 1.7909, "num_input_tokens_seen": 41872751, "step": 12000 }, { "epoch": 4.583791712504584, "grad_norm": 4.475183486938477, "learning_rate": 2.7081041437477084e-05, "loss": 1.791, "num_input_tokens_seen": 43618911, "step": 12500 }, { "epoch": 4.767143381004767, "grad_norm": 4.72713041305542, "learning_rate": 2.6164283094976168e-05, "loss": 1.7745, "num_input_tokens_seen": 45373143, "step": 13000 }, { "epoch": 4.9504950495049505, "grad_norm": 3.3076839447021484, "learning_rate": 2.5247524752475248e-05, "loss": 1.7968, "num_input_tokens_seen": 47112151, "step": 13500 }, { "epoch": 5.133846718005134, "grad_norm": 4.046383857727051, "learning_rate": 2.4330766409974332e-05, "loss": 1.7611, "num_input_tokens_seen": 48852751, "step": 14000 }, { "epoch": 5.317198386505317, "grad_norm": 3.291144609451294, "learning_rate": 2.3414008067473413e-05, "loss": 1.7363, "num_input_tokens_seen": 50602567, "step": 14500 }, { "epoch": 5.5005500550055, "grad_norm": 4.23388671875, "learning_rate": 2.24972497249725e-05, "loss": 1.7814, "num_input_tokens_seen": 52369863, "step": 15000 }, { "epoch": 5.683901723505684, "grad_norm": 3.1835505962371826, "learning_rate": 2.158049138247158e-05, "loss": 1.751, "num_input_tokens_seen": 54115983, "step": 15500 }, { "epoch": 5.867253392005868, "grad_norm": 3.593493938446045, "learning_rate": 2.0663733039970665e-05, "loss": 1.7481, "num_input_tokens_seen": 55853919, "step": 16000 }, { "epoch": 6.050605060506051, "grad_norm": 4.3933258056640625, "learning_rate": 1.9746974697469746e-05, "loss": 1.7506, "num_input_tokens_seen": 57581239, "step": 16500 }, { "epoch": 6.233956729006234, "grad_norm": 3.6081910133361816, "learning_rate": 1.883021635496883e-05, "loss": 1.7294, "num_input_tokens_seen": 59313735, "step": 17000 }, { "epoch": 6.417308397506417, "grad_norm": 3.7784392833709717, "learning_rate": 1.7913458012467914e-05, "loss": 1.719, "num_input_tokens_seen": 61061911, "step": 17500 }, { "epoch": 6.600660066006601, "grad_norm": 3.5482571125030518, "learning_rate": 1.6996699669966998e-05, "loss": 1.7184, "num_input_tokens_seen": 62802279, "step": 18000 }, { "epoch": 6.784011734506784, "grad_norm": 3.797348737716675, "learning_rate": 1.6079941327466082e-05, "loss": 1.7101, "num_input_tokens_seen": 64536303, "step": 18500 }, { "epoch": 6.967363403006967, "grad_norm": 3.9275312423706055, "learning_rate": 1.5163182984965163e-05, "loss": 1.7153, "num_input_tokens_seen": 66282967, "step": 19000 }, { "epoch": 7.15071507150715, "grad_norm": 3.65077805519104, "learning_rate": 1.4246424642464248e-05, "loss": 1.7181, "num_input_tokens_seen": 68030296, "step": 19500 }, { "epoch": 7.334066740007334, "grad_norm": 4.696651458740234, "learning_rate": 1.3329666299963331e-05, "loss": 1.6992, "num_input_tokens_seen": 69767824, "step": 20000 }, { "epoch": 7.517418408507518, "grad_norm": 5.405508518218994, "learning_rate": 1.2412907957462413e-05, "loss": 1.6903, "num_input_tokens_seen": 71509128, "step": 20500 }, { "epoch": 7.700770077007701, "grad_norm": 3.7343809604644775, "learning_rate": 1.1496149614961496e-05, "loss": 1.7019, "num_input_tokens_seen": 73255224, "step": 21000 }, { "epoch": 7.884121745507884, "grad_norm": 4.133444786071777, "learning_rate": 1.057939127246058e-05, "loss": 1.6959, "num_input_tokens_seen": 75002496, "step": 21500 }, { "epoch": 8.067473414008067, "grad_norm": 4.398416996002197, "learning_rate": 9.662632929959662e-06, "loss": 1.7018, "num_input_tokens_seen": 76756073, "step": 22000 }, { "epoch": 8.250825082508252, "grad_norm": 4.565046310424805, "learning_rate": 8.745874587458746e-06, "loss": 1.6837, "num_input_tokens_seen": 78483465, "step": 22500 }, { "epoch": 8.434176751008435, "grad_norm": 3.950497627258301, "learning_rate": 7.829116244957828e-06, "loss": 1.6913, "num_input_tokens_seen": 80220865, "step": 23000 }, { "epoch": 8.617528419508618, "grad_norm": 3.9700405597686768, "learning_rate": 6.912357902456913e-06, "loss": 1.6814, "num_input_tokens_seen": 81964649, "step": 23500 }, { "epoch": 8.800880088008801, "grad_norm": 3.21114444732666, "learning_rate": 5.995599559955996e-06, "loss": 1.689, "num_input_tokens_seen": 83718889, "step": 24000 }, { "epoch": 8.984231756508985, "grad_norm": 3.5966849327087402, "learning_rate": 5.078841217455079e-06, "loss": 1.6734, "num_input_tokens_seen": 85471529, "step": 24500 }, { "epoch": 9.167583425009168, "grad_norm": 3.4596688747406006, "learning_rate": 4.162082874954162e-06, "loss": 1.6792, "num_input_tokens_seen": 87214771, "step": 25000 }, { "epoch": 9.350935093509351, "grad_norm": 3.9838054180145264, "learning_rate": 3.2453245324532458e-06, "loss": 1.6583, "num_input_tokens_seen": 88949475, "step": 25500 }, { "epoch": 9.534286762009534, "grad_norm": 3.389430522918701, "learning_rate": 2.3285661899523286e-06, "loss": 1.6836, "num_input_tokens_seen": 90694267, "step": 26000 }, { "epoch": 9.717638430509718, "grad_norm": 4.560466289520264, "learning_rate": 1.411807847451412e-06, "loss": 1.6804, "num_input_tokens_seen": 92441267, "step": 26500 }, { "epoch": 9.900990099009901, "grad_norm": 4.484193325042725, "learning_rate": 4.950495049504951e-07, "loss": 1.6876, "num_input_tokens_seen": 94186835, "step": 27000 }, { "epoch": 10.0, "num_input_tokens_seen": 95128823, "step": 27270, "total_flos": 3.4538173670639616e+16, "train_loss": 1.8293144167322006, "train_runtime": 2454.5506, "train_samples_per_second": 88.859, "train_steps_per_second": 11.11, "train_tokens_per_second": 38762.215 } ], "logging_steps": 500, "max_steps": 27270, "num_input_tokens_seen": 95128823, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.4538173670639616e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }