{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9954337899543377,
  "eval_steps": 500,
  "global_step": 984,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.030441400304414,
      "grad_norm": 4.619646397901246,
      "learning_rate": 5e-06,
      "loss": 0.801,
      "step": 10
    },
    {
      "epoch": 0.060882800608828,
      "grad_norm": 2.4758507718701606,
      "learning_rate": 5e-06,
      "loss": 0.7282,
      "step": 20
    },
    {
      "epoch": 0.091324200913242,
      "grad_norm": 3.479812913897804,
      "learning_rate": 5e-06,
      "loss": 0.7045,
      "step": 30
    },
    {
      "epoch": 0.121765601217656,
      "grad_norm": 2.9360047427282385,
      "learning_rate": 5e-06,
      "loss": 0.6875,
      "step": 40
    },
    {
      "epoch": 0.15220700152207,
      "grad_norm": 0.973776971269499,
      "learning_rate": 5e-06,
      "loss": 0.6818,
      "step": 50
    },
    {
      "epoch": 0.182648401826484,
      "grad_norm": 0.800750811910126,
      "learning_rate": 5e-06,
      "loss": 0.6695,
      "step": 60
    },
    {
      "epoch": 0.213089802130898,
      "grad_norm": 0.7737759634387463,
      "learning_rate": 5e-06,
      "loss": 0.6519,
      "step": 70
    },
    {
      "epoch": 0.243531202435312,
      "grad_norm": 1.3731039805269152,
      "learning_rate": 5e-06,
      "loss": 0.6423,
      "step": 80
    },
    {
      "epoch": 0.273972602739726,
      "grad_norm": 0.6785230486405546,
      "learning_rate": 5e-06,
      "loss": 0.6501,
      "step": 90
    },
    {
      "epoch": 0.30441400304414,
      "grad_norm": 1.054981943761996,
      "learning_rate": 5e-06,
      "loss": 0.6403,
      "step": 100
    },
    {
      "epoch": 0.334855403348554,
      "grad_norm": 0.7312141288913386,
      "learning_rate": 5e-06,
      "loss": 0.6327,
      "step": 110
    },
    {
      "epoch": 0.365296803652968,
      "grad_norm": 0.611778635862804,
      "learning_rate": 5e-06,
      "loss": 0.6311,
      "step": 120
    },
    {
      "epoch": 0.395738203957382,
      "grad_norm": 0.6581343673138518,
      "learning_rate": 5e-06,
      "loss": 0.6383,
      "step": 130
    },
    {
      "epoch": 0.426179604261796,
      "grad_norm": 0.8961158759762995,
      "learning_rate": 5e-06,
      "loss": 0.6289,
      "step": 140
    },
    {
      "epoch": 0.45662100456621,
      "grad_norm": 0.5571046772698393,
      "learning_rate": 5e-06,
      "loss": 0.6315,
      "step": 150
    },
    {
      "epoch": 0.487062404870624,
      "grad_norm": 0.5225558082134438,
      "learning_rate": 5e-06,
      "loss": 0.6292,
      "step": 160
    },
    {
      "epoch": 0.517503805175038,
      "grad_norm": 0.5700430081742105,
      "learning_rate": 5e-06,
      "loss": 0.6274,
      "step": 170
    },
    {
      "epoch": 0.547945205479452,
      "grad_norm": 1.3638017063276748,
      "learning_rate": 5e-06,
      "loss": 0.6147,
      "step": 180
    },
    {
      "epoch": 0.578386605783866,
      "grad_norm": 1.1474194001541684,
      "learning_rate": 5e-06,
      "loss": 0.6214,
      "step": 190
    },
    {
      "epoch": 0.60882800608828,
      "grad_norm": 0.5983544869985015,
      "learning_rate": 5e-06,
      "loss": 0.6211,
      "step": 200
    },
    {
      "epoch": 0.639269406392694,
      "grad_norm": 0.5030983225008678,
      "learning_rate": 5e-06,
      "loss": 0.6269,
      "step": 210
    },
    {
      "epoch": 0.669710806697108,
      "grad_norm": 0.5355894732987774,
      "learning_rate": 5e-06,
      "loss": 0.6186,
      "step": 220
    },
    {
      "epoch": 0.700152207001522,
      "grad_norm": 0.5535858496033256,
      "learning_rate": 5e-06,
      "loss": 0.6174,
      "step": 230
    },
    {
      "epoch": 0.730593607305936,
      "grad_norm": 0.4757423728889985,
      "learning_rate": 5e-06,
      "loss": 0.6157,
      "step": 240
    },
    {
      "epoch": 0.76103500761035,
      "grad_norm": 0.5846472040453055,
      "learning_rate": 5e-06,
      "loss": 0.6119,
      "step": 250
    },
    {
      "epoch": 0.791476407914764,
      "grad_norm": 0.5064988157906138,
      "learning_rate": 5e-06,
      "loss": 0.6174,
      "step": 260
    },
    {
      "epoch": 0.821917808219178,
      "grad_norm": 0.5603859566155508,
      "learning_rate": 5e-06,
      "loss": 0.6115,
      "step": 270
    },
    {
      "epoch": 0.852359208523592,
      "grad_norm": 0.5340207450065109,
      "learning_rate": 5e-06,
      "loss": 0.618,
      "step": 280
    },
    {
      "epoch": 0.882800608828006,
      "grad_norm": 0.5176245531200292,
      "learning_rate": 5e-06,
      "loss": 0.6141,
      "step": 290
    },
    {
      "epoch": 0.91324200913242,
      "grad_norm": 0.5686518744643043,
      "learning_rate": 5e-06,
      "loss": 0.61,
      "step": 300
    },
    {
      "epoch": 0.943683409436834,
      "grad_norm": 0.5557582588776627,
      "learning_rate": 5e-06,
      "loss": 0.6086,
      "step": 310
    },
    {
      "epoch": 0.974124809741248,
      "grad_norm": 0.48632226958782565,
      "learning_rate": 5e-06,
      "loss": 0.6134,
      "step": 320
    },
    {
      "epoch": 0.9984779299847792,
      "eval_loss": 0.6086177825927734,
      "eval_runtime": 177.842,
      "eval_samples_per_second": 49.758,
      "eval_steps_per_second": 0.394,
      "step": 328
    },
    {
      "epoch": 1.004566210045662,
      "grad_norm": 0.5349618575586927,
      "learning_rate": 5e-06,
      "loss": 0.6066,
      "step": 330
    },
    {
      "epoch": 1.035007610350076,
      "grad_norm": 0.7085072078930811,
      "learning_rate": 5e-06,
      "loss": 0.5583,
      "step": 340
    },
    {
      "epoch": 1.06544901065449,
      "grad_norm": 0.5086842387297956,
      "learning_rate": 5e-06,
      "loss": 0.5577,
      "step": 350
    },
    {
      "epoch": 1.095890410958904,
      "grad_norm": 0.5618625623080041,
      "learning_rate": 5e-06,
      "loss": 0.5692,
      "step": 360
    },
    {
      "epoch": 1.126331811263318,
      "grad_norm": 0.8882124898558976,
      "learning_rate": 5e-06,
      "loss": 0.5622,
      "step": 370
    },
    {
      "epoch": 1.156773211567732,
      "grad_norm": 0.8010806701357303,
      "learning_rate": 5e-06,
      "loss": 0.5626,
      "step": 380
    },
    {
      "epoch": 1.187214611872146,
      "grad_norm": 0.6044300304393334,
      "learning_rate": 5e-06,
      "loss": 0.564,
      "step": 390
    },
    {
      "epoch": 1.21765601217656,
      "grad_norm": 0.6478561685693183,
      "learning_rate": 5e-06,
      "loss": 0.5599,
      "step": 400
    },
    {
      "epoch": 1.248097412480974,
      "grad_norm": 0.48517644823822886,
      "learning_rate": 5e-06,
      "loss": 0.5629,
      "step": 410
    },
    {
      "epoch": 1.278538812785388,
      "grad_norm": 0.6579745015004641,
      "learning_rate": 5e-06,
      "loss": 0.5688,
      "step": 420
    },
    {
      "epoch": 1.308980213089802,
      "grad_norm": 0.5933025521185841,
      "learning_rate": 5e-06,
      "loss": 0.5625,
      "step": 430
    },
    {
      "epoch": 1.339421613394216,
      "grad_norm": 0.5524692856580558,
      "learning_rate": 5e-06,
      "loss": 0.5645,
      "step": 440
    },
    {
      "epoch": 1.36986301369863,
      "grad_norm": 0.5949700286062661,
      "learning_rate": 5e-06,
      "loss": 0.5667,
      "step": 450
    },
    {
      "epoch": 1.400304414003044,
      "grad_norm": 0.7922305492429499,
      "learning_rate": 5e-06,
      "loss": 0.5671,
      "step": 460
    },
    {
      "epoch": 1.430745814307458,
      "grad_norm": 0.5035984988245502,
      "learning_rate": 5e-06,
      "loss": 0.5699,
      "step": 470
    },
    {
      "epoch": 1.461187214611872,
      "grad_norm": 0.4675577590538898,
      "learning_rate": 5e-06,
      "loss": 0.5622,
      "step": 480
    },
    {
      "epoch": 1.491628614916286,
      "grad_norm": 0.6149051107408182,
      "learning_rate": 5e-06,
      "loss": 0.568,
      "step": 490
    },
    {
      "epoch": 1.5220700152207,
      "grad_norm": 0.624211800852349,
      "learning_rate": 5e-06,
      "loss": 0.5641,
      "step": 500
    },
    {
      "epoch": 1.5525114155251143,
      "grad_norm": 0.9181588884042637,
      "learning_rate": 5e-06,
      "loss": 0.5666,
      "step": 510
    },
    {
      "epoch": 1.582952815829528,
      "grad_norm": 0.4937076242442424,
      "learning_rate": 5e-06,
      "loss": 0.5666,
      "step": 520
    },
    {
      "epoch": 1.6133942161339423,
      "grad_norm": 0.5082102469467535,
      "learning_rate": 5e-06,
      "loss": 0.5574,
      "step": 530
    },
    {
      "epoch": 1.643835616438356,
      "grad_norm": 0.4937565845275598,
      "learning_rate": 5e-06,
      "loss": 0.5689,
      "step": 540
    },
    {
      "epoch": 1.6742770167427703,
      "grad_norm": 0.4848002546906178,
      "learning_rate": 5e-06,
      "loss": 0.5688,
      "step": 550
    },
    {
      "epoch": 1.704718417047184,
      "grad_norm": 0.5230615787294617,
      "learning_rate": 5e-06,
      "loss": 0.5677,
      "step": 560
    },
    {
      "epoch": 1.7351598173515983,
      "grad_norm": 0.4976273028384401,
      "learning_rate": 5e-06,
      "loss": 0.5586,
      "step": 570
    },
    {
      "epoch": 1.765601217656012,
      "grad_norm": 0.49403463868422004,
      "learning_rate": 5e-06,
      "loss": 0.5604,
      "step": 580
    },
    {
      "epoch": 1.7960426179604263,
      "grad_norm": 0.6320569692146091,
      "learning_rate": 5e-06,
      "loss": 0.5578,
      "step": 590
    },
    {
      "epoch": 1.82648401826484,
      "grad_norm": 0.5695898310066485,
      "learning_rate": 5e-06,
      "loss": 0.5593,
      "step": 600
    },
    {
      "epoch": 1.8569254185692543,
      "grad_norm": 0.49537917557389,
      "learning_rate": 5e-06,
      "loss": 0.5627,
      "step": 610
    },
    {
      "epoch": 1.887366818873668,
      "grad_norm": 0.6247132495885726,
      "learning_rate": 5e-06,
      "loss": 0.5621,
      "step": 620
    },
    {
      "epoch": 1.9178082191780823,
      "grad_norm": 0.5030338206961641,
      "learning_rate": 5e-06,
      "loss": 0.5598,
      "step": 630
    },
    {
      "epoch": 1.948249619482496,
      "grad_norm": 0.4962258563588259,
      "learning_rate": 5e-06,
      "loss": 0.5562,
      "step": 640
    },
    {
      "epoch": 1.9786910197869103,
      "grad_norm": 0.49909928012252913,
      "learning_rate": 5e-06,
      "loss": 0.5568,
      "step": 650
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.5997375249862671,
      "eval_runtime": 178.4917,
      "eval_samples_per_second": 49.577,
      "eval_steps_per_second": 0.392,
      "step": 657
    },
    {
      "epoch": 2.009132420091324,
      "grad_norm": 0.8861960444337913,
      "learning_rate": 5e-06,
      "loss": 0.5441,
      "step": 660
    },
    {
      "epoch": 2.0395738203957383,
      "grad_norm": 0.5425502389160675,
      "learning_rate": 5e-06,
      "loss": 0.514,
      "step": 670
    },
    {
      "epoch": 2.070015220700152,
      "grad_norm": 0.5562526095704428,
      "learning_rate": 5e-06,
      "loss": 0.5032,
      "step": 680
    },
    {
      "epoch": 2.1004566210045663,
      "grad_norm": 0.49212370632524655,
      "learning_rate": 5e-06,
      "loss": 0.5154,
      "step": 690
    },
    {
      "epoch": 2.13089802130898,
      "grad_norm": 0.534245423835535,
      "learning_rate": 5e-06,
      "loss": 0.5116,
      "step": 700
    },
    {
      "epoch": 2.1613394216133943,
      "grad_norm": 0.5454909388550724,
      "learning_rate": 5e-06,
      "loss": 0.5136,
      "step": 710
    },
    {
      "epoch": 2.191780821917808,
      "grad_norm": 0.8231087538933396,
      "learning_rate": 5e-06,
      "loss": 0.5052,
      "step": 720
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 0.5745911989069736,
      "learning_rate": 5e-06,
      "loss": 0.5157,
      "step": 730
    },
    {
      "epoch": 2.252663622526636,
      "grad_norm": 0.5370023428369475,
      "learning_rate": 5e-06,
      "loss": 0.5142,
      "step": 740
    },
    {
      "epoch": 2.2831050228310503,
      "grad_norm": 0.5417242182114983,
      "learning_rate": 5e-06,
      "loss": 0.507,
      "step": 750
    },
    {
      "epoch": 2.313546423135464,
      "grad_norm": 0.4948430389470553,
      "learning_rate": 5e-06,
      "loss": 0.5071,
      "step": 760
    },
    {
      "epoch": 2.3439878234398783,
      "grad_norm": 0.5823872991278739,
      "learning_rate": 5e-06,
      "loss": 0.5152,
      "step": 770
    },
    {
      "epoch": 2.374429223744292,
      "grad_norm": 0.5749628913643053,
      "learning_rate": 5e-06,
      "loss": 0.5158,
      "step": 780
    },
    {
      "epoch": 2.4048706240487063,
      "grad_norm": 0.523744812578657,
      "learning_rate": 5e-06,
      "loss": 0.5121,
      "step": 790
    },
    {
      "epoch": 2.43531202435312,
      "grad_norm": 0.6430805814437145,
      "learning_rate": 5e-06,
      "loss": 0.5187,
      "step": 800
    },
    {
      "epoch": 2.4657534246575343,
      "grad_norm": 0.6135659154747556,
      "learning_rate": 5e-06,
      "loss": 0.5105,
      "step": 810
    },
    {
      "epoch": 2.496194824961948,
      "grad_norm": 0.5819355389015244,
      "learning_rate": 5e-06,
      "loss": 0.5127,
      "step": 820
    },
    {
      "epoch": 2.5266362252663623,
      "grad_norm": 0.5222652845841953,
      "learning_rate": 5e-06,
      "loss": 0.5172,
      "step": 830
    },
    {
      "epoch": 2.557077625570776,
      "grad_norm": 0.5085292250908287,
      "learning_rate": 5e-06,
      "loss": 0.5154,
      "step": 840
    },
    {
      "epoch": 2.5875190258751903,
      "grad_norm": 0.5051693370140233,
      "learning_rate": 5e-06,
      "loss": 0.5122,
      "step": 850
    },
    {
      "epoch": 2.617960426179604,
      "grad_norm": 0.5219920053064364,
      "learning_rate": 5e-06,
      "loss": 0.5154,
      "step": 860
    },
    {
      "epoch": 2.6484018264840183,
      "grad_norm": 0.5918822792144229,
      "learning_rate": 5e-06,
      "loss": 0.5141,
      "step": 870
    },
    {
      "epoch": 2.678843226788432,
      "grad_norm": 0.522542434998343,
      "learning_rate": 5e-06,
      "loss": 0.5117,
      "step": 880
    },
    {
      "epoch": 2.7092846270928463,
      "grad_norm": 0.5321626854739884,
      "learning_rate": 5e-06,
      "loss": 0.5073,
      "step": 890
    },
    {
      "epoch": 2.73972602739726,
      "grad_norm": 0.5099777411434487,
      "learning_rate": 5e-06,
      "loss": 0.5202,
      "step": 900
    },
    {
      "epoch": 2.7701674277016743,
      "grad_norm": 0.5225721184412385,
      "learning_rate": 5e-06,
      "loss": 0.5204,
      "step": 910
    },
    {
      "epoch": 2.800608828006088,
      "grad_norm": 0.49419482852544927,
      "learning_rate": 5e-06,
      "loss": 0.5117,
      "step": 920
    },
    {
      "epoch": 2.8310502283105023,
      "grad_norm": 0.5580841932281034,
      "learning_rate": 5e-06,
      "loss": 0.5125,
      "step": 930
    },
    {
      "epoch": 2.861491628614916,
      "grad_norm": 0.5459012978192056,
      "learning_rate": 5e-06,
      "loss": 0.5159,
      "step": 940
    },
    {
      "epoch": 2.8919330289193304,
      "grad_norm": 0.4931927553237747,
      "learning_rate": 5e-06,
      "loss": 0.516,
      "step": 950
    },
    {
      "epoch": 2.922374429223744,
      "grad_norm": 0.5125805588681561,
      "learning_rate": 5e-06,
      "loss": 0.5204,
      "step": 960
    },
    {
      "epoch": 2.9528158295281584,
      "grad_norm": 0.5359088405972897,
      "learning_rate": 5e-06,
      "loss": 0.5126,
      "step": 970
    },
    {
      "epoch": 2.983257229832572,
      "grad_norm": 0.5366173084145919,
      "learning_rate": 5e-06,
      "loss": 0.507,
      "step": 980
    },
    {
      "epoch": 2.9954337899543377,
      "eval_loss": 0.6046163439750671,
      "eval_runtime": 178.3759,
      "eval_samples_per_second": 49.609,
      "eval_steps_per_second": 0.392,
      "step": 984
    },
    {
      "epoch": 2.9954337899543377,
      "step": 984,
      "total_flos": 1647817890201600.0,
      "train_loss": 0.5723786247455007,
      "train_runtime": 29605.407,
      "train_samples_per_second": 17.036,
      "train_steps_per_second": 0.033
    }
  ],
  "logging_steps": 10,
  "max_steps": 984,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1647817890201600.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}