{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9954337899543377, "eval_steps": 500, "global_step": 984, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.030441400304414, "grad_norm": 4.619646397901246, "learning_rate": 5e-06, "loss": 0.801, "step": 10 }, { "epoch": 0.060882800608828, "grad_norm": 2.4758507718701606, "learning_rate": 5e-06, "loss": 0.7282, "step": 20 }, { "epoch": 0.091324200913242, "grad_norm": 3.479812913897804, "learning_rate": 5e-06, "loss": 0.7045, "step": 30 }, { "epoch": 0.121765601217656, "grad_norm": 2.9360047427282385, "learning_rate": 5e-06, "loss": 0.6875, "step": 40 }, { "epoch": 0.15220700152207, "grad_norm": 0.973776971269499, "learning_rate": 5e-06, "loss": 0.6818, "step": 50 }, { "epoch": 0.182648401826484, "grad_norm": 0.800750811910126, "learning_rate": 5e-06, "loss": 0.6695, "step": 60 }, { "epoch": 0.213089802130898, "grad_norm": 0.7737759634387463, "learning_rate": 5e-06, "loss": 0.6519, "step": 70 }, { "epoch": 0.243531202435312, "grad_norm": 1.3731039805269152, "learning_rate": 5e-06, "loss": 0.6423, "step": 80 }, { "epoch": 0.273972602739726, "grad_norm": 0.6785230486405546, "learning_rate": 5e-06, "loss": 0.6501, "step": 90 }, { "epoch": 0.30441400304414, "grad_norm": 1.054981943761996, "learning_rate": 5e-06, "loss": 0.6403, "step": 100 }, { "epoch": 0.334855403348554, "grad_norm": 0.7312141288913386, "learning_rate": 5e-06, "loss": 0.6327, "step": 110 }, { "epoch": 0.365296803652968, "grad_norm": 0.611778635862804, "learning_rate": 5e-06, "loss": 0.6311, "step": 120 }, { "epoch": 0.395738203957382, "grad_norm": 0.6581343673138518, "learning_rate": 5e-06, "loss": 0.6383, "step": 130 }, { "epoch": 0.426179604261796, "grad_norm": 0.8961158759762995, "learning_rate": 5e-06, "loss": 0.6289, "step": 140 }, { "epoch": 0.45662100456621, "grad_norm": 0.5571046772698393, "learning_rate": 5e-06, "loss": 0.6315, "step": 150 }, { "epoch": 0.487062404870624, "grad_norm": 0.5225558082134438, "learning_rate": 5e-06, "loss": 0.6292, "step": 160 }, { "epoch": 0.517503805175038, "grad_norm": 0.5700430081742105, "learning_rate": 5e-06, "loss": 0.6274, "step": 170 }, { "epoch": 0.547945205479452, "grad_norm": 1.3638017063276748, "learning_rate": 5e-06, "loss": 0.6147, "step": 180 }, { "epoch": 0.578386605783866, "grad_norm": 1.1474194001541684, "learning_rate": 5e-06, "loss": 0.6214, "step": 190 }, { "epoch": 0.60882800608828, "grad_norm": 0.5983544869985015, "learning_rate": 5e-06, "loss": 0.6211, "step": 200 }, { "epoch": 0.639269406392694, "grad_norm": 0.5030983225008678, "learning_rate": 5e-06, "loss": 0.6269, "step": 210 }, { "epoch": 0.669710806697108, "grad_norm": 0.5355894732987774, "learning_rate": 5e-06, "loss": 0.6186, "step": 220 }, { "epoch": 0.700152207001522, "grad_norm": 0.5535858496033256, "learning_rate": 5e-06, "loss": 0.6174, "step": 230 }, { "epoch": 0.730593607305936, "grad_norm": 0.4757423728889985, "learning_rate": 5e-06, "loss": 0.6157, "step": 240 }, { "epoch": 0.76103500761035, "grad_norm": 0.5846472040453055, "learning_rate": 5e-06, "loss": 0.6119, "step": 250 }, { "epoch": 0.791476407914764, "grad_norm": 0.5064988157906138, "learning_rate": 5e-06, "loss": 0.6174, "step": 260 }, { "epoch": 0.821917808219178, "grad_norm": 0.5603859566155508, "learning_rate": 5e-06, "loss": 0.6115, "step": 270 }, { "epoch": 0.852359208523592, "grad_norm": 0.5340207450065109, "learning_rate": 5e-06, "loss": 0.618, "step": 280 }, { "epoch": 0.882800608828006, "grad_norm": 0.5176245531200292, "learning_rate": 5e-06, "loss": 0.6141, "step": 290 }, { "epoch": 0.91324200913242, "grad_norm": 0.5686518744643043, "learning_rate": 5e-06, "loss": 0.61, "step": 300 }, { "epoch": 0.943683409436834, "grad_norm": 0.5557582588776627, "learning_rate": 5e-06, "loss": 0.6086, "step": 310 }, { "epoch": 0.974124809741248, "grad_norm": 0.48632226958782565, "learning_rate": 5e-06, "loss": 0.6134, "step": 320 }, { "epoch": 0.9984779299847792, "eval_loss": 0.6086177825927734, "eval_runtime": 177.842, "eval_samples_per_second": 49.758, "eval_steps_per_second": 0.394, "step": 328 }, { "epoch": 1.004566210045662, "grad_norm": 0.5349618575586927, "learning_rate": 5e-06, "loss": 0.6066, "step": 330 }, { "epoch": 1.035007610350076, "grad_norm": 0.7085072078930811, "learning_rate": 5e-06, "loss": 0.5583, "step": 340 }, { "epoch": 1.06544901065449, "grad_norm": 0.5086842387297956, "learning_rate": 5e-06, "loss": 0.5577, "step": 350 }, { "epoch": 1.095890410958904, "grad_norm": 0.5618625623080041, "learning_rate": 5e-06, "loss": 0.5692, "step": 360 }, { "epoch": 1.126331811263318, "grad_norm": 0.8882124898558976, "learning_rate": 5e-06, "loss": 0.5622, "step": 370 }, { "epoch": 1.156773211567732, "grad_norm": 0.8010806701357303, "learning_rate": 5e-06, "loss": 0.5626, "step": 380 }, { "epoch": 1.187214611872146, "grad_norm": 0.6044300304393334, "learning_rate": 5e-06, "loss": 0.564, "step": 390 }, { "epoch": 1.21765601217656, "grad_norm": 0.6478561685693183, "learning_rate": 5e-06, "loss": 0.5599, "step": 400 }, { "epoch": 1.248097412480974, "grad_norm": 0.48517644823822886, "learning_rate": 5e-06, "loss": 0.5629, "step": 410 }, { "epoch": 1.278538812785388, "grad_norm": 0.6579745015004641, "learning_rate": 5e-06, "loss": 0.5688, "step": 420 }, { "epoch": 1.308980213089802, "grad_norm": 0.5933025521185841, "learning_rate": 5e-06, "loss": 0.5625, "step": 430 }, { "epoch": 1.339421613394216, "grad_norm": 0.5524692856580558, "learning_rate": 5e-06, "loss": 0.5645, "step": 440 }, { "epoch": 1.36986301369863, "grad_norm": 0.5949700286062661, "learning_rate": 5e-06, "loss": 0.5667, "step": 450 }, { "epoch": 1.400304414003044, "grad_norm": 0.7922305492429499, "learning_rate": 5e-06, "loss": 0.5671, "step": 460 }, { "epoch": 1.430745814307458, "grad_norm": 0.5035984988245502, "learning_rate": 5e-06, "loss": 0.5699, "step": 470 }, { "epoch": 1.461187214611872, "grad_norm": 0.4675577590538898, "learning_rate": 5e-06, "loss": 0.5622, "step": 480 }, { "epoch": 1.491628614916286, "grad_norm": 0.6149051107408182, "learning_rate": 5e-06, "loss": 0.568, "step": 490 }, { "epoch": 1.5220700152207, "grad_norm": 0.624211800852349, "learning_rate": 5e-06, "loss": 0.5641, "step": 500 }, { "epoch": 1.5525114155251143, "grad_norm": 0.9181588884042637, "learning_rate": 5e-06, "loss": 0.5666, "step": 510 }, { "epoch": 1.582952815829528, "grad_norm": 0.4937076242442424, "learning_rate": 5e-06, "loss": 0.5666, "step": 520 }, { "epoch": 1.6133942161339423, "grad_norm": 0.5082102469467535, "learning_rate": 5e-06, "loss": 0.5574, "step": 530 }, { "epoch": 1.643835616438356, "grad_norm": 0.4937565845275598, "learning_rate": 5e-06, "loss": 0.5689, "step": 540 }, { "epoch": 1.6742770167427703, "grad_norm": 0.4848002546906178, "learning_rate": 5e-06, "loss": 0.5688, "step": 550 }, { "epoch": 1.704718417047184, "grad_norm": 0.5230615787294617, "learning_rate": 5e-06, "loss": 0.5677, "step": 560 }, { "epoch": 1.7351598173515983, "grad_norm": 0.4976273028384401, "learning_rate": 5e-06, "loss": 0.5586, "step": 570 }, { "epoch": 1.765601217656012, "grad_norm": 0.49403463868422004, "learning_rate": 5e-06, "loss": 0.5604, "step": 580 }, { "epoch": 1.7960426179604263, "grad_norm": 0.6320569692146091, "learning_rate": 5e-06, "loss": 0.5578, "step": 590 }, { "epoch": 1.82648401826484, "grad_norm": 0.5695898310066485, "learning_rate": 5e-06, "loss": 0.5593, "step": 600 }, { "epoch": 1.8569254185692543, "grad_norm": 0.49537917557389, "learning_rate": 5e-06, "loss": 0.5627, "step": 610 }, { "epoch": 1.887366818873668, "grad_norm": 0.6247132495885726, "learning_rate": 5e-06, "loss": 0.5621, "step": 620 }, { "epoch": 1.9178082191780823, "grad_norm": 0.5030338206961641, "learning_rate": 5e-06, "loss": 0.5598, "step": 630 }, { "epoch": 1.948249619482496, "grad_norm": 0.4962258563588259, "learning_rate": 5e-06, "loss": 0.5562, "step": 640 }, { "epoch": 1.9786910197869103, "grad_norm": 0.49909928012252913, "learning_rate": 5e-06, "loss": 0.5568, "step": 650 }, { "epoch": 2.0, "eval_loss": 0.5997375249862671, "eval_runtime": 178.4917, "eval_samples_per_second": 49.577, "eval_steps_per_second": 0.392, "step": 657 }, { "epoch": 2.009132420091324, "grad_norm": 0.8861960444337913, "learning_rate": 5e-06, "loss": 0.5441, "step": 660 }, { "epoch": 2.0395738203957383, "grad_norm": 0.5425502389160675, "learning_rate": 5e-06, "loss": 0.514, "step": 670 }, { "epoch": 2.070015220700152, "grad_norm": 0.5562526095704428, "learning_rate": 5e-06, "loss": 0.5032, "step": 680 }, { "epoch": 2.1004566210045663, "grad_norm": 0.49212370632524655, "learning_rate": 5e-06, "loss": 0.5154, "step": 690 }, { "epoch": 2.13089802130898, "grad_norm": 0.534245423835535, "learning_rate": 5e-06, "loss": 0.5116, "step": 700 }, { "epoch": 2.1613394216133943, "grad_norm": 0.5454909388550724, "learning_rate": 5e-06, "loss": 0.5136, "step": 710 }, { "epoch": 2.191780821917808, "grad_norm": 0.8231087538933396, "learning_rate": 5e-06, "loss": 0.5052, "step": 720 }, { "epoch": 2.2222222222222223, "grad_norm": 0.5745911989069736, "learning_rate": 5e-06, "loss": 0.5157, "step": 730 }, { "epoch": 2.252663622526636, "grad_norm": 0.5370023428369475, "learning_rate": 5e-06, "loss": 0.5142, "step": 740 }, { "epoch": 2.2831050228310503, "grad_norm": 0.5417242182114983, "learning_rate": 5e-06, "loss": 0.507, "step": 750 }, { "epoch": 2.313546423135464, "grad_norm": 0.4948430389470553, "learning_rate": 5e-06, "loss": 0.5071, "step": 760 }, { "epoch": 2.3439878234398783, "grad_norm": 0.5823872991278739, "learning_rate": 5e-06, "loss": 0.5152, "step": 770 }, { "epoch": 2.374429223744292, "grad_norm": 0.5749628913643053, "learning_rate": 5e-06, "loss": 0.5158, "step": 780 }, { "epoch": 2.4048706240487063, "grad_norm": 0.523744812578657, "learning_rate": 5e-06, "loss": 0.5121, "step": 790 }, { "epoch": 2.43531202435312, "grad_norm": 0.6430805814437145, "learning_rate": 5e-06, "loss": 0.5187, "step": 800 }, { "epoch": 2.4657534246575343, "grad_norm": 0.6135659154747556, "learning_rate": 5e-06, "loss": 0.5105, "step": 810 }, { "epoch": 2.496194824961948, "grad_norm": 0.5819355389015244, "learning_rate": 5e-06, "loss": 0.5127, "step": 820 }, { "epoch": 2.5266362252663623, "grad_norm": 0.5222652845841953, "learning_rate": 5e-06, "loss": 0.5172, "step": 830 }, { "epoch": 2.557077625570776, "grad_norm": 0.5085292250908287, "learning_rate": 5e-06, "loss": 0.5154, "step": 840 }, { "epoch": 2.5875190258751903, "grad_norm": 0.5051693370140233, "learning_rate": 5e-06, "loss": 0.5122, "step": 850 }, { "epoch": 2.617960426179604, "grad_norm": 0.5219920053064364, "learning_rate": 5e-06, "loss": 0.5154, "step": 860 }, { "epoch": 2.6484018264840183, "grad_norm": 0.5918822792144229, "learning_rate": 5e-06, "loss": 0.5141, "step": 870 }, { "epoch": 2.678843226788432, "grad_norm": 0.522542434998343, "learning_rate": 5e-06, "loss": 0.5117, "step": 880 }, { "epoch": 2.7092846270928463, "grad_norm": 0.5321626854739884, "learning_rate": 5e-06, "loss": 0.5073, "step": 890 }, { "epoch": 2.73972602739726, "grad_norm": 0.5099777411434487, "learning_rate": 5e-06, "loss": 0.5202, "step": 900 }, { "epoch": 2.7701674277016743, "grad_norm": 0.5225721184412385, "learning_rate": 5e-06, "loss": 0.5204, "step": 910 }, { "epoch": 2.800608828006088, "grad_norm": 0.49419482852544927, "learning_rate": 5e-06, "loss": 0.5117, "step": 920 }, { "epoch": 2.8310502283105023, "grad_norm": 0.5580841932281034, "learning_rate": 5e-06, "loss": 0.5125, "step": 930 }, { "epoch": 2.861491628614916, "grad_norm": 0.5459012978192056, "learning_rate": 5e-06, "loss": 0.5159, "step": 940 }, { "epoch": 2.8919330289193304, "grad_norm": 0.4931927553237747, "learning_rate": 5e-06, "loss": 0.516, "step": 950 }, { "epoch": 2.922374429223744, "grad_norm": 0.5125805588681561, "learning_rate": 5e-06, "loss": 0.5204, "step": 960 }, { "epoch": 2.9528158295281584, "grad_norm": 0.5359088405972897, "learning_rate": 5e-06, "loss": 0.5126, "step": 970 }, { "epoch": 2.983257229832572, "grad_norm": 0.5366173084145919, "learning_rate": 5e-06, "loss": 0.507, "step": 980 }, { "epoch": 2.9954337899543377, "eval_loss": 0.6046163439750671, "eval_runtime": 178.3759, "eval_samples_per_second": 49.609, "eval_steps_per_second": 0.392, "step": 984 }, { "epoch": 2.9954337899543377, "step": 984, "total_flos": 1647817890201600.0, "train_loss": 0.5723786247455007, "train_runtime": 29605.407, "train_samples_per_second": 17.036, "train_steps_per_second": 0.033 } ], "logging_steps": 10, "max_steps": 984, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1647817890201600.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }