{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 591, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.050761421319796954, "grad_norm": 2.884176452669992, "learning_rate": 5e-06, "loss": 0.7058, "step": 10 }, { "epoch": 0.10152284263959391, "grad_norm": 1.1031748057601416, "learning_rate": 5e-06, "loss": 0.6396, "step": 20 }, { "epoch": 0.15228426395939088, "grad_norm": 0.8763770464242356, "learning_rate": 5e-06, "loss": 0.6263, "step": 30 }, { "epoch": 0.20304568527918782, "grad_norm": 0.8637694735287882, "learning_rate": 5e-06, "loss": 0.6163, "step": 40 }, { "epoch": 0.25380710659898476, "grad_norm": 0.6443562464039193, "learning_rate": 5e-06, "loss": 0.5983, "step": 50 }, { "epoch": 0.30456852791878175, "grad_norm": 0.7181436435893084, "learning_rate": 5e-06, "loss": 0.5841, "step": 60 }, { "epoch": 0.3553299492385787, "grad_norm": 0.5736245789251513, "learning_rate": 5e-06, "loss": 0.5851, "step": 70 }, { "epoch": 0.40609137055837563, "grad_norm": 0.5370831215793392, "learning_rate": 5e-06, "loss": 0.5743, "step": 80 }, { "epoch": 0.45685279187817257, "grad_norm": 0.7868112154827747, "learning_rate": 5e-06, "loss": 0.5631, "step": 90 }, { "epoch": 0.5076142131979695, "grad_norm": 0.6384967672565564, "learning_rate": 5e-06, "loss": 0.5621, "step": 100 }, { "epoch": 0.5583756345177665, "grad_norm": 0.6105632833081082, "learning_rate": 5e-06, "loss": 0.5626, "step": 110 }, { "epoch": 0.6091370558375635, "grad_norm": 0.4980885433595067, "learning_rate": 5e-06, "loss": 0.5576, "step": 120 }, { "epoch": 0.6598984771573604, "grad_norm": 0.5567494710307067, "learning_rate": 5e-06, "loss": 0.5634, "step": 130 }, { "epoch": 0.7106598984771574, "grad_norm": 0.5747955173184647, "learning_rate": 5e-06, "loss": 0.5556, "step": 140 }, { "epoch": 0.7614213197969543, "grad_norm": 0.7422760458895346, "learning_rate": 5e-06, "loss": 0.5485, "step": 150 }, { "epoch": 0.8121827411167513, "grad_norm": 0.5519407925537647, "learning_rate": 5e-06, "loss": 0.5555, "step": 160 }, { "epoch": 0.8629441624365483, "grad_norm": 0.3938384364399605, "learning_rate": 5e-06, "loss": 0.5534, "step": 170 }, { "epoch": 0.9137055837563451, "grad_norm": 0.4274758585586587, "learning_rate": 5e-06, "loss": 0.5504, "step": 180 }, { "epoch": 0.9644670050761421, "grad_norm": 0.4623946226615239, "learning_rate": 5e-06, "loss": 0.5428, "step": 190 }, { "epoch": 1.0, "eval_loss": 0.545667827129364, "eval_runtime": 69.9223, "eval_samples_per_second": 75.899, "eval_steps_per_second": 0.601, "step": 197 }, { "epoch": 1.015228426395939, "grad_norm": 0.4892463181375632, "learning_rate": 5e-06, "loss": 0.538, "step": 200 }, { "epoch": 1.0659898477157361, "grad_norm": 0.5258103265854674, "learning_rate": 5e-06, "loss": 0.5005, "step": 210 }, { "epoch": 1.116751269035533, "grad_norm": 0.43953346933881265, "learning_rate": 5e-06, "loss": 0.5116, "step": 220 }, { "epoch": 1.16751269035533, "grad_norm": 0.555350254923011, "learning_rate": 5e-06, "loss": 0.5039, "step": 230 }, { "epoch": 1.218274111675127, "grad_norm": 0.4240258545882722, "learning_rate": 5e-06, "loss": 0.4979, "step": 240 }, { "epoch": 1.2690355329949239, "grad_norm": 0.6055626365057643, "learning_rate": 5e-06, "loss": 0.4977, "step": 250 }, { "epoch": 1.3197969543147208, "grad_norm": 0.5183814019968731, "learning_rate": 5e-06, "loss": 0.5015, "step": 260 }, { "epoch": 1.3705583756345177, "grad_norm": 0.44198994392173363, "learning_rate": 5e-06, 
"loss": 0.5052, "step": 270 }, { "epoch": 1.4213197969543148, "grad_norm": 0.4597615345223239, "learning_rate": 5e-06, "loss": 0.4974, "step": 280 }, { "epoch": 1.4720812182741116, "grad_norm": 0.47036560762175594, "learning_rate": 5e-06, "loss": 0.5044, "step": 290 }, { "epoch": 1.5228426395939088, "grad_norm": 0.4356873877920564, "learning_rate": 5e-06, "loss": 0.5134, "step": 300 }, { "epoch": 1.5736040609137056, "grad_norm": 0.4547653150275499, "learning_rate": 5e-06, "loss": 0.5022, "step": 310 }, { "epoch": 1.6243654822335025, "grad_norm": 0.5243750306692014, "learning_rate": 5e-06, "loss": 0.5001, "step": 320 }, { "epoch": 1.6751269035532994, "grad_norm": 0.5072947429791255, "learning_rate": 5e-06, "loss": 0.504, "step": 330 }, { "epoch": 1.7258883248730963, "grad_norm": 0.4591167012346128, "learning_rate": 5e-06, "loss": 0.5008, "step": 340 }, { "epoch": 1.7766497461928934, "grad_norm": 0.559953933092174, "learning_rate": 5e-06, "loss": 0.5058, "step": 350 }, { "epoch": 1.8274111675126905, "grad_norm": 0.4615691289798925, "learning_rate": 5e-06, "loss": 0.5013, "step": 360 }, { "epoch": 1.8781725888324874, "grad_norm": 0.4673102096867509, "learning_rate": 5e-06, "loss": 0.505, "step": 370 }, { "epoch": 1.9289340101522843, "grad_norm": 0.41382969409293313, "learning_rate": 5e-06, "loss": 0.5029, "step": 380 }, { "epoch": 1.9796954314720812, "grad_norm": 0.4786556881881671, "learning_rate": 5e-06, "loss": 0.4991, "step": 390 }, { "epoch": 2.0, "eval_loss": 0.5367357730865479, "eval_runtime": 69.7466, "eval_samples_per_second": 76.09, "eval_steps_per_second": 0.602, "step": 394 }, { "epoch": 2.030456852791878, "grad_norm": 0.7058322019162009, "learning_rate": 5e-06, "loss": 0.4785, "step": 400 }, { "epoch": 2.081218274111675, "grad_norm": 0.45765334889032916, "learning_rate": 5e-06, "loss": 0.4563, "step": 410 }, { "epoch": 2.1319796954314723, "grad_norm": 0.4288566534513188, "learning_rate": 5e-06, "loss": 0.4564, "step": 420 }, { "epoch": 2.182741116751269, "grad_norm": 0.4543531173101596, "learning_rate": 5e-06, "loss": 0.4559, "step": 430 }, { "epoch": 2.233502538071066, "grad_norm": 0.5677467898296061, "learning_rate": 5e-06, "loss": 0.454, "step": 440 }, { "epoch": 2.284263959390863, "grad_norm": 0.5213901371480834, "learning_rate": 5e-06, "loss": 0.4584, "step": 450 }, { "epoch": 2.33502538071066, "grad_norm": 0.5106205098926287, "learning_rate": 5e-06, "loss": 0.4603, "step": 460 }, { "epoch": 2.3857868020304567, "grad_norm": 0.48058806334750254, "learning_rate": 5e-06, "loss": 0.4586, "step": 470 }, { "epoch": 2.436548223350254, "grad_norm": 0.4462504100857437, "learning_rate": 5e-06, "loss": 0.4547, "step": 480 }, { "epoch": 2.487309644670051, "grad_norm": 0.47373720391622604, "learning_rate": 5e-06, "loss": 0.4553, "step": 490 }, { "epoch": 2.5380710659898478, "grad_norm": 0.44268127361884035, "learning_rate": 5e-06, "loss": 0.4552, "step": 500 }, { "epoch": 2.5888324873096447, "grad_norm": 0.5168789659092264, "learning_rate": 5e-06, "loss": 0.4495, "step": 510 }, { "epoch": 2.6395939086294415, "grad_norm": 0.4759213683260235, "learning_rate": 5e-06, "loss": 0.4601, "step": 520 }, { "epoch": 2.6903553299492384, "grad_norm": 0.5405150525609788, "learning_rate": 5e-06, "loss": 0.4639, "step": 530 }, { "epoch": 2.7411167512690353, "grad_norm": 0.48554448131156325, "learning_rate": 5e-06, "loss": 0.4561, "step": 540 }, { "epoch": 2.7918781725888326, "grad_norm": 0.5715450026884347, "learning_rate": 5e-06, "loss": 0.458, "step": 550 }, { "epoch": 2.8426395939086295, "grad_norm": 
0.45709891570844524, "learning_rate": 5e-06, "loss": 0.4582, "step": 560 }, { "epoch": 2.8934010152284264, "grad_norm": 0.4329230281361173, "learning_rate": 5e-06, "loss": 0.4535, "step": 570 }, { "epoch": 2.9441624365482233, "grad_norm": 0.4707895543051601, "learning_rate": 5e-06, "loss": 0.4609, "step": 580 }, { "epoch": 2.99492385786802, "grad_norm": 0.4717664390758382, "learning_rate": 5e-06, "loss": 0.4636, "step": 590 }, { "epoch": 3.0, "eval_loss": 0.5386124849319458, "eval_runtime": 67.8364, "eval_samples_per_second": 78.232, "eval_steps_per_second": 0.619, "step": 591 }, { "epoch": 3.0, "step": 591, "total_flos": 989528252743680.0, "train_loss": 0.5134775242224563, "train_runtime": 10543.3353, "train_samples_per_second": 28.688, "train_steps_per_second": 0.056 } ], "logging_steps": 10, "max_steps": 591, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 989528252743680.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }
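The state above can also be consumed programmatically. Below is a minimal sketch that loads the log history and prints the per-epoch evaluation loss and the final training summary; the filename `trainer_state.json` is an assumption about where this file is saved, not something stated by the log itself.

```python
import json

# Assumed location of the state shown above; adjust the path as needed.
with open("trainer_state.json") as f:
    state = json.load(f)

# Evaluation entries carry "eval_loss"; per-step training entries carry "loss".
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]
for e in eval_logs:
    print(f"epoch {e['epoch']:.1f}  step {e['step']:>4}  eval_loss {e['eval_loss']:.4f}")

# The last entry is the end-of-training summary (train_loss, runtime, throughput).
summary = state["log_history"][-1]
print("final train_loss:", summary.get("train_loss"))
print("train_runtime (s):", summary.get("train_runtime"))
```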