{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.998438719750195,
  "eval_steps": 500,
  "global_step": 960,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0312256049960968,
      "grad_norm": 2.3905091254417177,
      "learning_rate": 5e-06,
      "loss": 0.8071,
      "step": 10
    },
    {
      "epoch": 0.0624512099921936,
      "grad_norm": 1.3958881588356253,
      "learning_rate": 5e-06,
      "loss": 0.7197,
      "step": 20
    },
    {
      "epoch": 0.0936768149882904,
      "grad_norm": 1.5094847169199592,
      "learning_rate": 5e-06,
      "loss": 0.6985,
      "step": 30
    },
    {
      "epoch": 0.1249024199843872,
      "grad_norm": 1.4080817727177422,
      "learning_rate": 5e-06,
      "loss": 0.6855,
      "step": 40
    },
    {
      "epoch": 0.156128024980484,
      "grad_norm": 0.8437914162126238,
      "learning_rate": 5e-06,
      "loss": 0.6727,
      "step": 50
    },
    {
      "epoch": 0.1873536299765808,
      "grad_norm": 0.8642540743285999,
      "learning_rate": 5e-06,
      "loss": 0.6576,
      "step": 60
    },
    {
      "epoch": 0.2185792349726776,
      "grad_norm": 0.6633530231329323,
      "learning_rate": 5e-06,
      "loss": 0.6514,
      "step": 70
    },
    {
      "epoch": 0.2498048399687744,
      "grad_norm": 0.8262205915204606,
      "learning_rate": 5e-06,
      "loss": 0.6457,
      "step": 80
    },
    {
      "epoch": 0.2810304449648712,
      "grad_norm": 0.5613248571253571,
      "learning_rate": 5e-06,
      "loss": 0.6412,
      "step": 90
    },
    {
      "epoch": 0.312256049960968,
      "grad_norm": 0.9150986102724331,
      "learning_rate": 5e-06,
      "loss": 0.6384,
      "step": 100
    },
    {
      "epoch": 0.3434816549570648,
      "grad_norm": 0.7465781739155327,
      "learning_rate": 5e-06,
      "loss": 0.6335,
      "step": 110
    },
    {
      "epoch": 0.3747072599531616,
      "grad_norm": 0.8250370829383081,
      "learning_rate": 5e-06,
      "loss": 0.6302,
      "step": 120
    },
    {
      "epoch": 0.4059328649492584,
      "grad_norm": 0.6341139387710243,
      "learning_rate": 5e-06,
      "loss": 0.634,
      "step": 130
    },
    {
      "epoch": 0.4371584699453552,
      "grad_norm": 0.4964005156113376,
      "learning_rate": 5e-06,
      "loss": 0.6211,
      "step": 140
    },
    {
      "epoch": 0.468384074941452,
      "grad_norm": 0.9475290669111363,
      "learning_rate": 5e-06,
      "loss": 0.6271,
      "step": 150
    },
    {
      "epoch": 0.4996096799375488,
      "grad_norm": 0.7811648794629471,
      "learning_rate": 5e-06,
      "loss": 0.6238,
      "step": 160
    },
    {
      "epoch": 0.5308352849336456,
      "grad_norm": 0.516293302775752,
      "learning_rate": 5e-06,
      "loss": 0.621,
      "step": 170
    },
    {
      "epoch": 0.5620608899297423,
      "grad_norm": 0.471912073011228,
      "learning_rate": 5e-06,
      "loss": 0.6215,
      "step": 180
    },
    {
      "epoch": 0.5932864949258392,
      "grad_norm": 0.5719925024660438,
      "learning_rate": 5e-06,
      "loss": 0.6193,
      "step": 190
    },
    {
      "epoch": 0.624512099921936,
      "grad_norm": 0.5059415320269443,
      "learning_rate": 5e-06,
      "loss": 0.6175,
      "step": 200
    },
    {
      "epoch": 0.6557377049180327,
      "grad_norm": 0.5195408058121892,
      "learning_rate": 5e-06,
      "loss": 0.622,
      "step": 210
    },
    {
      "epoch": 0.6869633099141296,
      "grad_norm": 0.6014889400609209,
      "learning_rate": 5e-06,
      "loss": 0.6158,
      "step": 220
    },
    {
      "epoch": 0.7181889149102264,
      "grad_norm": 0.5096070261428851,
      "learning_rate": 5e-06,
      "loss": 0.6177,
      "step": 230
    },
    {
      "epoch": 0.7494145199063232,
      "grad_norm": 0.6028623695390841,
      "learning_rate": 5e-06,
      "loss": 0.6148,
      "step": 240
    },
    {
      "epoch": 0.78064012490242,
      "grad_norm": 0.5451038071079088,
      "learning_rate": 5e-06,
      "loss": 0.6164,
      "step": 250
    },
    {
      "epoch": 0.8118657298985168,
      "grad_norm": 0.4708236433706893,
      "learning_rate": 5e-06,
      "loss": 0.6094,
      "step": 260
    },
    {
      "epoch": 0.8430913348946136,
      "grad_norm": 0.46109612782168113,
      "learning_rate": 5e-06,
      "loss": 0.6103,
      "step": 270
    },
    {
      "epoch": 0.8743169398907104,
      "grad_norm": 0.502648205452055,
      "learning_rate": 5e-06,
      "loss": 0.6095,
      "step": 280
    },
    {
      "epoch": 0.9055425448868072,
      "grad_norm": 0.4489395079927774,
      "learning_rate": 5e-06,
      "loss": 0.6065,
      "step": 290
    },
    {
      "epoch": 0.936768149882904,
      "grad_norm": 0.6477411963812875,
      "learning_rate": 5e-06,
      "loss": 0.6132,
      "step": 300
    },
    {
      "epoch": 0.9679937548790007,
      "grad_norm": 0.5302907770786253,
      "learning_rate": 5e-06,
      "loss": 0.6111,
      "step": 310
    },
    {
      "epoch": 0.9992193598750976,
      "grad_norm": 0.45517099030938496,
      "learning_rate": 5e-06,
      "loss": 0.5986,
      "step": 320
    },
    {
      "epoch": 0.9992193598750976,
      "eval_loss": 0.6128131151199341,
      "eval_runtime": 341.0462,
      "eval_samples_per_second": 25.304,
      "eval_steps_per_second": 0.396,
      "step": 320
    },
    {
      "epoch": 1.0308352849336455,
      "grad_norm": 0.8438882865201324,
      "learning_rate": 5e-06,
      "loss": 0.6164,
      "step": 330
    },
    {
      "epoch": 1.0620608899297423,
      "grad_norm": 0.4821646175445636,
      "learning_rate": 5e-06,
      "loss": 0.5662,
      "step": 340
    },
    {
      "epoch": 1.0932864949258392,
      "grad_norm": 0.5058980149763423,
      "learning_rate": 5e-06,
      "loss": 0.5606,
      "step": 350
    },
    {
      "epoch": 1.124512099921936,
      "grad_norm": 0.4781831185352073,
      "learning_rate": 5e-06,
      "loss": 0.5648,
      "step": 360
    },
    {
      "epoch": 1.1557377049180328,
      "grad_norm": 0.5048559047058323,
      "learning_rate": 5e-06,
      "loss": 0.5582,
      "step": 370
    },
    {
      "epoch": 1.1869633099141297,
      "grad_norm": 0.44414824193518654,
      "learning_rate": 5e-06,
      "loss": 0.5584,
      "step": 380
    },
    {
      "epoch": 1.2181889149102263,
      "grad_norm": 0.4928423351798681,
      "learning_rate": 5e-06,
      "loss": 0.5515,
      "step": 390
    },
    {
      "epoch": 1.2494145199063231,
      "grad_norm": 0.5064189451582637,
      "learning_rate": 5e-06,
      "loss": 0.5637,
      "step": 400
    },
    {
      "epoch": 1.28064012490242,
      "grad_norm": 0.44193713470343654,
      "learning_rate": 5e-06,
      "loss": 0.5618,
      "step": 410
    },
    {
      "epoch": 1.3118657298985168,
      "grad_norm": 0.4650381211562015,
      "learning_rate": 5e-06,
      "loss": 0.5554,
      "step": 420
    },
    {
      "epoch": 1.3430913348946136,
      "grad_norm": 0.5544428065241478,
      "learning_rate": 5e-06,
      "loss": 0.5547,
      "step": 430
    },
    {
      "epoch": 1.3743169398907105,
      "grad_norm": 0.48005595474790913,
      "learning_rate": 5e-06,
      "loss": 0.5523,
      "step": 440
    },
    {
      "epoch": 1.4055425448868073,
      "grad_norm": 0.4974548951913249,
      "learning_rate": 5e-06,
      "loss": 0.5666,
      "step": 450
    },
    {
      "epoch": 1.436768149882904,
      "grad_norm": 0.4923658625750441,
      "learning_rate": 5e-06,
      "loss": 0.5558,
      "step": 460
    },
    {
      "epoch": 1.4679937548790007,
      "grad_norm": 0.5272663506589431,
      "learning_rate": 5e-06,
      "loss": 0.5584,
      "step": 470
    },
    {
      "epoch": 1.4992193598750976,
      "grad_norm": 0.5304464959914178,
      "learning_rate": 5e-06,
      "loss": 0.5643,
      "step": 480
    },
    {
      "epoch": 1.5304449648711944,
      "grad_norm": 0.5773543616559265,
      "learning_rate": 5e-06,
      "loss": 0.5598,
      "step": 490
    },
    {
      "epoch": 1.561670569867291,
      "grad_norm": 0.4558348320273449,
      "learning_rate": 5e-06,
      "loss": 0.5591,
      "step": 500
    },
    {
      "epoch": 1.5928961748633879,
      "grad_norm": 0.5072303901122793,
      "learning_rate": 5e-06,
      "loss": 0.5626,
      "step": 510
    },
    {
      "epoch": 1.6241217798594847,
      "grad_norm": 0.5369887998410667,
      "learning_rate": 5e-06,
      "loss": 0.5556,
      "step": 520
    },
    {
      "epoch": 1.6553473848555815,
      "grad_norm": 0.5556757682627291,
      "learning_rate": 5e-06,
      "loss": 0.5572,
      "step": 530
    },
    {
      "epoch": 1.6865729898516784,
      "grad_norm": 0.5337242705677901,
      "learning_rate": 5e-06,
      "loss": 0.557,
      "step": 540
    },
    {
      "epoch": 1.7177985948477752,
      "grad_norm": 0.46280527938706506,
      "learning_rate": 5e-06,
      "loss": 0.5617,
      "step": 550
    },
    {
      "epoch": 1.749024199843872,
      "grad_norm": 0.45608832514525505,
      "learning_rate": 5e-06,
      "loss": 0.5581,
      "step": 560
    },
    {
      "epoch": 1.7802498048399689,
      "grad_norm": 0.48374355780746187,
      "learning_rate": 5e-06,
      "loss": 0.5564,
      "step": 570
    },
    {
      "epoch": 1.8114754098360657,
      "grad_norm": 0.5029705354009028,
      "learning_rate": 5e-06,
      "loss": 0.559,
      "step": 580
    },
    {
      "epoch": 1.8427010148321625,
      "grad_norm": 0.46966476792976214,
      "learning_rate": 5e-06,
      "loss": 0.5616,
      "step": 590
    },
    {
      "epoch": 1.8739266198282591,
      "grad_norm": 0.446283124549817,
      "learning_rate": 5e-06,
      "loss": 0.553,
      "step": 600
    },
    {
      "epoch": 1.905152224824356,
      "grad_norm": 0.4745527474098281,
      "learning_rate": 5e-06,
      "loss": 0.5589,
      "step": 610
    },
    {
      "epoch": 1.9363778298204528,
      "grad_norm": 0.501609279464785,
      "learning_rate": 5e-06,
      "loss": 0.5628,
      "step": 620
    },
    {
      "epoch": 1.9676034348165494,
      "grad_norm": 0.49320626859834116,
      "learning_rate": 5e-06,
      "loss": 0.5522,
      "step": 630
    },
    {
      "epoch": 1.9988290398126463,
      "grad_norm": 0.4324557011242181,
      "learning_rate": 5e-06,
      "loss": 0.5596,
      "step": 640
    },
    {
      "epoch": 1.9988290398126463,
      "eval_loss": 0.6045193076133728,
      "eval_runtime": 340.5129,
      "eval_samples_per_second": 25.344,
      "eval_steps_per_second": 0.396,
      "step": 640
    },
    {
      "epoch": 2.030444964871194,
      "grad_norm": 0.6006527642113036,
      "learning_rate": 5e-06,
      "loss": 0.5662,
      "step": 650
    },
    {
      "epoch": 2.061670569867291,
      "grad_norm": 0.5376147888211947,
      "learning_rate": 5e-06,
      "loss": 0.5018,
      "step": 660
    },
    {
      "epoch": 2.092896174863388,
      "grad_norm": 0.5448017881956769,
      "learning_rate": 5e-06,
      "loss": 0.5057,
      "step": 670
    },
    {
      "epoch": 2.1241217798594847,
      "grad_norm": 0.6095347029172922,
      "learning_rate": 5e-06,
      "loss": 0.5036,
      "step": 680
    },
    {
      "epoch": 2.1553473848555815,
      "grad_norm": 0.5281790301882382,
      "learning_rate": 5e-06,
      "loss": 0.5066,
      "step": 690
    },
    {
      "epoch": 2.1865729898516784,
      "grad_norm": 0.543025537124188,
      "learning_rate": 5e-06,
      "loss": 0.5086,
      "step": 700
    },
    {
      "epoch": 2.217798594847775,
      "grad_norm": 0.6024294613229594,
      "learning_rate": 5e-06,
      "loss": 0.508,
      "step": 710
    },
    {
      "epoch": 2.249024199843872,
      "grad_norm": 0.5261160691218546,
      "learning_rate": 5e-06,
      "loss": 0.5073,
      "step": 720
    },
    {
      "epoch": 2.280249804839969,
      "grad_norm": 0.4878879224650377,
      "learning_rate": 5e-06,
      "loss": 0.5126,
      "step": 730
    },
    {
      "epoch": 2.3114754098360657,
      "grad_norm": 0.5298908191049263,
      "learning_rate": 5e-06,
      "loss": 0.5098,
      "step": 740
    },
    {
      "epoch": 2.3427010148321625,
      "grad_norm": 0.4963375261761113,
      "learning_rate": 5e-06,
      "loss": 0.5084,
      "step": 750
    },
    {
      "epoch": 2.3739266198282594,
      "grad_norm": 0.476625155447844,
      "learning_rate": 5e-06,
      "loss": 0.5097,
      "step": 760
    },
    {
      "epoch": 2.4051522248243558,
      "grad_norm": 0.5208071390082176,
      "learning_rate": 5e-06,
      "loss": 0.5028,
      "step": 770
    },
    {
      "epoch": 2.4363778298204526,
      "grad_norm": 0.4800697229604007,
      "learning_rate": 5e-06,
      "loss": 0.5102,
      "step": 780
    },
    {
      "epoch": 2.4676034348165494,
      "grad_norm": 0.5837948115948769,
      "learning_rate": 5e-06,
      "loss": 0.5068,
      "step": 790
    },
    {
      "epoch": 2.4988290398126463,
      "grad_norm": 0.5280421758640109,
      "learning_rate": 5e-06,
      "loss": 0.5141,
      "step": 800
    },
    {
      "epoch": 2.530054644808743,
      "grad_norm": 0.4594714496886714,
      "learning_rate": 5e-06,
      "loss": 0.5081,
      "step": 810
    },
    {
      "epoch": 2.56128024980484,
      "grad_norm": 0.51076427145537,
      "learning_rate": 5e-06,
      "loss": 0.5123,
      "step": 820
    },
    {
      "epoch": 2.5925058548009368,
      "grad_norm": 0.5309550406289082,
      "learning_rate": 5e-06,
      "loss": 0.5067,
      "step": 830
    },
    {
      "epoch": 2.6237314597970336,
      "grad_norm": 0.6033885635557941,
      "learning_rate": 5e-06,
      "loss": 0.519,
      "step": 840
    },
    {
      "epoch": 2.6549570647931304,
      "grad_norm": 0.5271243446375676,
      "learning_rate": 5e-06,
      "loss": 0.5048,
      "step": 850
    },
    {
      "epoch": 2.6861826697892273,
      "grad_norm": 0.5270298610894952,
      "learning_rate": 5e-06,
      "loss": 0.5165,
      "step": 860
    },
    {
      "epoch": 2.717408274785324,
      "grad_norm": 0.5222507557357616,
      "learning_rate": 5e-06,
      "loss": 0.5154,
      "step": 870
    },
    {
      "epoch": 2.748633879781421,
      "grad_norm": 0.49639435328740067,
      "learning_rate": 5e-06,
      "loss": 0.5091,
      "step": 880
    },
    {
      "epoch": 2.7798594847775178,
      "grad_norm": 0.4828475074772525,
      "learning_rate": 5e-06,
      "loss": 0.5112,
      "step": 890
    },
    {
      "epoch": 2.8110850897736146,
      "grad_norm": 0.5145958529566682,
      "learning_rate": 5e-06,
      "loss": 0.5109,
      "step": 900
    },
    {
      "epoch": 2.8423106947697114,
      "grad_norm": 0.528104324477883,
      "learning_rate": 5e-06,
      "loss": 0.5166,
      "step": 910
    },
    {
      "epoch": 2.873536299765808,
      "grad_norm": 0.47920251362694366,
      "learning_rate": 5e-06,
      "loss": 0.519,
      "step": 920
    },
    {
      "epoch": 2.9047619047619047,
      "grad_norm": 0.538906478147928,
      "learning_rate": 5e-06,
      "loss": 0.5181,
      "step": 930
    },
    {
      "epoch": 2.9359875097580015,
      "grad_norm": 0.5243949264804789,
      "learning_rate": 5e-06,
      "loss": 0.5134,
      "step": 940
    },
    {
      "epoch": 2.9672131147540983,
      "grad_norm": 0.47727618067883554,
      "learning_rate": 5e-06,
      "loss": 0.512,
      "step": 950
    },
    {
      "epoch": 2.998438719750195,
      "grad_norm": 0.5110204465597075,
      "learning_rate": 5e-06,
      "loss": 0.5083,
      "step": 960
    },
    {
      "epoch": 2.998438719750195,
      "eval_loss": 0.608026921749115,
      "eval_runtime": 339.6647,
      "eval_samples_per_second": 25.407,
      "eval_steps_per_second": 0.397,
      "step": 960
    },
    {
      "epoch": 2.998438719750195,
      "step": 960,
      "total_flos": 1607826375966720.0,
      "train_loss": 0.5702028140425682,
      "train_runtime": 56737.9664,
      "train_samples_per_second": 8.669,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 10,
  "max_steps": 960,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1607826375966720.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}