{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.998438719750195, "eval_steps": 500, "global_step": 960, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0312256049960968, "grad_norm": 2.3905091254417177, "learning_rate": 5e-06, "loss": 0.8071, "step": 10 }, { "epoch": 0.0624512099921936, "grad_norm": 1.3958881588356253, "learning_rate": 5e-06, "loss": 0.7197, "step": 20 }, { "epoch": 0.0936768149882904, "grad_norm": 1.5094847169199592, "learning_rate": 5e-06, "loss": 0.6985, "step": 30 }, { "epoch": 0.1249024199843872, "grad_norm": 1.4080817727177422, "learning_rate": 5e-06, "loss": 0.6855, "step": 40 }, { "epoch": 0.156128024980484, "grad_norm": 0.8437914162126238, "learning_rate": 5e-06, "loss": 0.6727, "step": 50 }, { "epoch": 0.1873536299765808, "grad_norm": 0.8642540743285999, "learning_rate": 5e-06, "loss": 0.6576, "step": 60 }, { "epoch": 0.2185792349726776, "grad_norm": 0.6633530231329323, "learning_rate": 5e-06, "loss": 0.6514, "step": 70 }, { "epoch": 0.2498048399687744, "grad_norm": 0.8262205915204606, "learning_rate": 5e-06, "loss": 0.6457, "step": 80 }, { "epoch": 0.2810304449648712, "grad_norm": 0.5613248571253571, "learning_rate": 5e-06, "loss": 0.6412, "step": 90 }, { "epoch": 0.312256049960968, "grad_norm": 0.9150986102724331, "learning_rate": 5e-06, "loss": 0.6384, "step": 100 }, { "epoch": 0.3434816549570648, "grad_norm": 0.7465781739155327, "learning_rate": 5e-06, "loss": 0.6335, "step": 110 }, { "epoch": 0.3747072599531616, "grad_norm": 0.8250370829383081, "learning_rate": 5e-06, "loss": 0.6302, "step": 120 }, { "epoch": 0.4059328649492584, "grad_norm": 0.6341139387710243, "learning_rate": 5e-06, "loss": 0.634, "step": 130 }, { "epoch": 0.4371584699453552, "grad_norm": 0.4964005156113376, "learning_rate": 5e-06, "loss": 0.6211, "step": 140 }, { "epoch": 0.468384074941452, "grad_norm": 0.9475290669111363, "learning_rate": 5e-06, "loss": 0.6271, "step": 150 }, { "epoch": 0.4996096799375488, "grad_norm": 0.7811648794629471, "learning_rate": 5e-06, "loss": 0.6238, "step": 160 }, { "epoch": 0.5308352849336456, "grad_norm": 0.516293302775752, "learning_rate": 5e-06, "loss": 0.621, "step": 170 }, { "epoch": 0.5620608899297423, "grad_norm": 0.471912073011228, "learning_rate": 5e-06, "loss": 0.6215, "step": 180 }, { "epoch": 0.5932864949258392, "grad_norm": 0.5719925024660438, "learning_rate": 5e-06, "loss": 0.6193, "step": 190 }, { "epoch": 0.624512099921936, "grad_norm": 0.5059415320269443, "learning_rate": 5e-06, "loss": 0.6175, "step": 200 }, { "epoch": 0.6557377049180327, "grad_norm": 0.5195408058121892, "learning_rate": 5e-06, "loss": 0.622, "step": 210 }, { "epoch": 0.6869633099141296, "grad_norm": 0.6014889400609209, "learning_rate": 5e-06, "loss": 0.6158, "step": 220 }, { "epoch": 0.7181889149102264, "grad_norm": 0.5096070261428851, "learning_rate": 5e-06, "loss": 0.6177, "step": 230 }, { "epoch": 0.7494145199063232, "grad_norm": 0.6028623695390841, "learning_rate": 5e-06, "loss": 0.6148, "step": 240 }, { "epoch": 0.78064012490242, "grad_norm": 0.5451038071079088, "learning_rate": 5e-06, "loss": 0.6164, "step": 250 }, { "epoch": 0.8118657298985168, "grad_norm": 0.4708236433706893, "learning_rate": 5e-06, "loss": 0.6094, "step": 260 }, { "epoch": 0.8430913348946136, "grad_norm": 0.46109612782168113, "learning_rate": 5e-06, "loss": 0.6103, "step": 270 }, { "epoch": 0.8743169398907104, "grad_norm": 0.502648205452055, "learning_rate": 5e-06, "loss": 0.6095, "step": 280 }, { "epoch": 0.9055425448868072, "grad_norm": 0.4489395079927774, "learning_rate": 5e-06, "loss": 0.6065, "step": 290 }, { "epoch": 0.936768149882904, "grad_norm": 0.6477411963812875, "learning_rate": 5e-06, "loss": 0.6132, "step": 300 }, { "epoch": 0.9679937548790007, "grad_norm": 0.5302907770786253, "learning_rate": 5e-06, "loss": 0.6111, "step": 310 }, { "epoch": 0.9992193598750976, "grad_norm": 0.45517099030938496, "learning_rate": 5e-06, "loss": 0.5986, "step": 320 }, { "epoch": 0.9992193598750976, "eval_loss": 0.6128131151199341, "eval_runtime": 341.0462, "eval_samples_per_second": 25.304, "eval_steps_per_second": 0.396, "step": 320 }, { "epoch": 1.0308352849336455, "grad_norm": 0.8438882865201324, "learning_rate": 5e-06, "loss": 0.6164, "step": 330 }, { "epoch": 1.0620608899297423, "grad_norm": 0.4821646175445636, "learning_rate": 5e-06, "loss": 0.5662, "step": 340 }, { "epoch": 1.0932864949258392, "grad_norm": 0.5058980149763423, "learning_rate": 5e-06, "loss": 0.5606, "step": 350 }, { "epoch": 1.124512099921936, "grad_norm": 0.4781831185352073, "learning_rate": 5e-06, "loss": 0.5648, "step": 360 }, { "epoch": 1.1557377049180328, "grad_norm": 0.5048559047058323, "learning_rate": 5e-06, "loss": 0.5582, "step": 370 }, { "epoch": 1.1869633099141297, "grad_norm": 0.44414824193518654, "learning_rate": 5e-06, "loss": 0.5584, "step": 380 }, { "epoch": 1.2181889149102263, "grad_norm": 0.4928423351798681, "learning_rate": 5e-06, "loss": 0.5515, "step": 390 }, { "epoch": 1.2494145199063231, "grad_norm": 0.5064189451582637, "learning_rate": 5e-06, "loss": 0.5637, "step": 400 }, { "epoch": 1.28064012490242, "grad_norm": 0.44193713470343654, "learning_rate": 5e-06, "loss": 0.5618, "step": 410 }, { "epoch": 1.3118657298985168, "grad_norm": 0.4650381211562015, "learning_rate": 5e-06, "loss": 0.5554, "step": 420 }, { "epoch": 1.3430913348946136, "grad_norm": 0.5544428065241478, "learning_rate": 5e-06, "loss": 0.5547, "step": 430 }, { "epoch": 1.3743169398907105, "grad_norm": 0.48005595474790913, "learning_rate": 5e-06, "loss": 0.5523, "step": 440 }, { "epoch": 1.4055425448868073, "grad_norm": 0.4974548951913249, "learning_rate": 5e-06, "loss": 0.5666, "step": 450 }, { "epoch": 1.436768149882904, "grad_norm": 0.4923658625750441, "learning_rate": 5e-06, "loss": 0.5558, "step": 460 }, { "epoch": 1.4679937548790007, "grad_norm": 0.5272663506589431, "learning_rate": 5e-06, "loss": 0.5584, "step": 470 }, { "epoch": 1.4992193598750976, "grad_norm": 0.5304464959914178, "learning_rate": 5e-06, "loss": 0.5643, "step": 480 }, { "epoch": 1.5304449648711944, "grad_norm": 0.5773543616559265, "learning_rate": 5e-06, "loss": 0.5598, "step": 490 }, { "epoch": 1.561670569867291, "grad_norm": 0.4558348320273449, "learning_rate": 5e-06, "loss": 0.5591, "step": 500 }, { "epoch": 1.5928961748633879, "grad_norm": 0.5072303901122793, "learning_rate": 5e-06, "loss": 0.5626, "step": 510 }, { "epoch": 1.6241217798594847, "grad_norm": 0.5369887998410667, "learning_rate": 5e-06, "loss": 0.5556, "step": 520 }, { "epoch": 1.6553473848555815, "grad_norm": 0.5556757682627291, "learning_rate": 5e-06, "loss": 0.5572, "step": 530 }, { "epoch": 1.6865729898516784, "grad_norm": 0.5337242705677901, "learning_rate": 5e-06, "loss": 0.557, "step": 540 }, { "epoch": 1.7177985948477752, "grad_norm": 0.46280527938706506, "learning_rate": 5e-06, "loss": 0.5617, "step": 550 }, { "epoch": 1.749024199843872, "grad_norm": 0.45608832514525505, "learning_rate": 5e-06, "loss": 0.5581, "step": 560 }, { "epoch": 1.7802498048399689, "grad_norm": 0.48374355780746187, "learning_rate": 5e-06, "loss": 0.5564, "step": 570 }, { "epoch": 1.8114754098360657, "grad_norm": 0.5029705354009028, "learning_rate": 5e-06, "loss": 0.559, "step": 580 }, { "epoch": 1.8427010148321625, "grad_norm": 0.46966476792976214, "learning_rate": 5e-06, "loss": 0.5616, "step": 590 }, { "epoch": 1.8739266198282591, "grad_norm": 0.446283124549817, "learning_rate": 5e-06, "loss": 0.553, "step": 600 }, { "epoch": 1.905152224824356, "grad_norm": 0.4745527474098281, "learning_rate": 5e-06, "loss": 0.5589, "step": 610 }, { "epoch": 1.9363778298204528, "grad_norm": 0.501609279464785, "learning_rate": 5e-06, "loss": 0.5628, "step": 620 }, { "epoch": 1.9676034348165494, "grad_norm": 0.49320626859834116, "learning_rate": 5e-06, "loss": 0.5522, "step": 630 }, { "epoch": 1.9988290398126463, "grad_norm": 0.4324557011242181, "learning_rate": 5e-06, "loss": 0.5596, "step": 640 }, { "epoch": 1.9988290398126463, "eval_loss": 0.6045193076133728, "eval_runtime": 340.5129, "eval_samples_per_second": 25.344, "eval_steps_per_second": 0.396, "step": 640 }, { "epoch": 2.030444964871194, "grad_norm": 0.6006527642113036, "learning_rate": 5e-06, "loss": 0.5662, "step": 650 }, { "epoch": 2.061670569867291, "grad_norm": 0.5376147888211947, "learning_rate": 5e-06, "loss": 0.5018, "step": 660 }, { "epoch": 2.092896174863388, "grad_norm": 0.5448017881956769, "learning_rate": 5e-06, "loss": 0.5057, "step": 670 }, { "epoch": 2.1241217798594847, "grad_norm": 0.6095347029172922, "learning_rate": 5e-06, "loss": 0.5036, "step": 680 }, { "epoch": 2.1553473848555815, "grad_norm": 0.5281790301882382, "learning_rate": 5e-06, "loss": 0.5066, "step": 690 }, { "epoch": 2.1865729898516784, "grad_norm": 0.543025537124188, "learning_rate": 5e-06, "loss": 0.5086, "step": 700 }, { "epoch": 2.217798594847775, "grad_norm": 0.6024294613229594, "learning_rate": 5e-06, "loss": 0.508, "step": 710 }, { "epoch": 2.249024199843872, "grad_norm": 0.5261160691218546, "learning_rate": 5e-06, "loss": 0.5073, "step": 720 }, { "epoch": 2.280249804839969, "grad_norm": 0.4878879224650377, "learning_rate": 5e-06, "loss": 0.5126, "step": 730 }, { "epoch": 2.3114754098360657, "grad_norm": 0.5298908191049263, "learning_rate": 5e-06, "loss": 0.5098, "step": 740 }, { "epoch": 2.3427010148321625, "grad_norm": 0.4963375261761113, "learning_rate": 5e-06, "loss": 0.5084, "step": 750 }, { "epoch": 2.3739266198282594, "grad_norm": 0.476625155447844, "learning_rate": 5e-06, "loss": 0.5097, "step": 760 }, { "epoch": 2.4051522248243558, "grad_norm": 0.5208071390082176, "learning_rate": 5e-06, "loss": 0.5028, "step": 770 }, { "epoch": 2.4363778298204526, "grad_norm": 0.4800697229604007, "learning_rate": 5e-06, "loss": 0.5102, "step": 780 }, { "epoch": 2.4676034348165494, "grad_norm": 0.5837948115948769, "learning_rate": 5e-06, "loss": 0.5068, "step": 790 }, { "epoch": 2.4988290398126463, "grad_norm": 0.5280421758640109, "learning_rate": 5e-06, "loss": 0.5141, "step": 800 }, { "epoch": 2.530054644808743, "grad_norm": 0.4594714496886714, "learning_rate": 5e-06, "loss": 0.5081, "step": 810 }, { "epoch": 2.56128024980484, "grad_norm": 0.51076427145537, "learning_rate": 5e-06, "loss": 0.5123, "step": 820 }, { "epoch": 2.5925058548009368, "grad_norm": 0.5309550406289082, "learning_rate": 5e-06, "loss": 0.5067, "step": 830 }, { "epoch": 2.6237314597970336, "grad_norm": 0.6033885635557941, "learning_rate": 5e-06, "loss": 0.519, "step": 840 }, { "epoch": 2.6549570647931304, "grad_norm": 0.5271243446375676, "learning_rate": 5e-06, "loss": 0.5048, "step": 850 }, { "epoch": 2.6861826697892273, "grad_norm": 0.5270298610894952, "learning_rate": 5e-06, "loss": 0.5165, "step": 860 }, { "epoch": 2.717408274785324, "grad_norm": 0.5222507557357616, "learning_rate": 5e-06, "loss": 0.5154, "step": 870 }, { "epoch": 2.748633879781421, "grad_norm": 0.49639435328740067, "learning_rate": 5e-06, "loss": 0.5091, "step": 880 }, { "epoch": 2.7798594847775178, "grad_norm": 0.4828475074772525, "learning_rate": 5e-06, "loss": 0.5112, "step": 890 }, { "epoch": 2.8110850897736146, "grad_norm": 0.5145958529566682, "learning_rate": 5e-06, "loss": 0.5109, "step": 900 }, { "epoch": 2.8423106947697114, "grad_norm": 0.528104324477883, "learning_rate": 5e-06, "loss": 0.5166, "step": 910 }, { "epoch": 2.873536299765808, "grad_norm": 0.47920251362694366, "learning_rate": 5e-06, "loss": 0.519, "step": 920 }, { "epoch": 2.9047619047619047, "grad_norm": 0.538906478147928, "learning_rate": 5e-06, "loss": 0.5181, "step": 930 }, { "epoch": 2.9359875097580015, "grad_norm": 0.5243949264804789, "learning_rate": 5e-06, "loss": 0.5134, "step": 940 }, { "epoch": 2.9672131147540983, "grad_norm": 0.47727618067883554, "learning_rate": 5e-06, "loss": 0.512, "step": 950 }, { "epoch": 2.998438719750195, "grad_norm": 0.5110204465597075, "learning_rate": 5e-06, "loss": 0.5083, "step": 960 }, { "epoch": 2.998438719750195, "eval_loss": 0.608026921749115, "eval_runtime": 339.6647, "eval_samples_per_second": 25.407, "eval_steps_per_second": 0.397, "step": 960 }, { "epoch": 2.998438719750195, "step": 960, "total_flos": 1607826375966720.0, "train_loss": 0.5702028140425682, "train_runtime": 56737.9664, "train_samples_per_second": 8.669, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 960, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1607826375966720.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }