{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 2370,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004219409282700422,
      "grad_norm": 3.1438205242156982,
      "learning_rate": 8.438818565400843e-07,
      "loss": 2.5933,
      "step": 1
    },
    {
      "epoch": 0.02109704641350211,
      "grad_norm": 3.45337176322937,
      "learning_rate": 4.219409282700422e-06,
      "loss": 2.5683,
      "step": 5
    },
    {
      "epoch": 0.04219409282700422,
      "grad_norm": 3.8354620933532715,
      "learning_rate": 8.438818565400844e-06,
      "loss": 2.567,
      "step": 10
    },
    {
      "epoch": 0.06329113924050633,
      "grad_norm": 3.2700271606445312,
      "learning_rate": 1.2658227848101267e-05,
      "loss": 2.4327,
      "step": 15
    },
    {
      "epoch": 0.08438818565400844,
      "grad_norm": 3.2498395442962646,
      "learning_rate": 1.6877637130801688e-05,
      "loss": 2.2197,
      "step": 20
    },
    {
      "epoch": 0.10548523206751055,
      "grad_norm": 2.3556034564971924,
      "learning_rate": 2.1097046413502112e-05,
      "loss": 1.8787,
      "step": 25
    },
    {
      "epoch": 0.12658227848101267,
      "grad_norm": 1.5979266166687012,
      "learning_rate": 2.5316455696202533e-05,
      "loss": 1.5467,
      "step": 30
    },
    {
      "epoch": 0.14767932489451477,
      "grad_norm": 1.260302186012268,
      "learning_rate": 2.9535864978902954e-05,
      "loss": 1.4303,
      "step": 35
    },
    {
      "epoch": 0.16877637130801687,
      "grad_norm": 0.5591890811920166,
      "learning_rate": 3.3755274261603375e-05,
      "loss": 1.3705,
      "step": 40
    },
    {
      "epoch": 0.189873417721519,
      "grad_norm": 0.5186863541603088,
      "learning_rate": 3.79746835443038e-05,
      "loss": 1.328,
      "step": 45
    },
    {
      "epoch": 0.2109704641350211,
      "grad_norm": 0.5181670188903809,
      "learning_rate": 4.2194092827004224e-05,
      "loss": 1.2614,
      "step": 50
    },
    {
      "epoch": 0.2320675105485232,
      "grad_norm": 0.5109447240829468,
      "learning_rate": 4.641350210970464e-05,
      "loss": 1.2328,
      "step": 55
    },
    {
      "epoch": 0.25316455696202533,
      "grad_norm": 0.4200008809566498,
      "learning_rate": 5.0632911392405066e-05,
      "loss": 1.1882,
      "step": 60
    },
    {
      "epoch": 0.2742616033755274,
      "grad_norm": 0.37015053629875183,
      "learning_rate": 5.4852320675105484e-05,
      "loss": 1.1461,
      "step": 65
    },
    {
      "epoch": 0.29535864978902954,
      "grad_norm": 0.39964228868484497,
      "learning_rate": 5.907172995780591e-05,
      "loss": 1.1336,
      "step": 70
    },
    {
      "epoch": 0.31645569620253167,
      "grad_norm": 0.3632591962814331,
      "learning_rate": 6.329113924050633e-05,
      "loss": 1.1081,
      "step": 75
    },
    {
      "epoch": 0.33755274261603374,
      "grad_norm": 0.363908588886261,
      "learning_rate": 6.751054852320675e-05,
      "loss": 1.1042,
      "step": 80
    },
    {
      "epoch": 0.35864978902953587,
      "grad_norm": 0.373738557100296,
      "learning_rate": 7.172995780590718e-05,
      "loss": 1.0802,
      "step": 85
    },
    {
      "epoch": 0.379746835443038,
      "grad_norm": 0.3337308168411255,
      "learning_rate": 7.59493670886076e-05,
      "loss": 1.0781,
      "step": 90
    },
    {
      "epoch": 0.4008438818565401,
      "grad_norm": 0.36707767844200134,
      "learning_rate": 8.016877637130802e-05,
      "loss": 1.07,
      "step": 95
    },
    {
      "epoch": 0.4219409282700422,
      "grad_norm": 0.40698128938674927,
      "learning_rate": 8.438818565400845e-05,
      "loss": 1.0672,
      "step": 100
    },
    {
      "epoch": 0.4430379746835443,
      "grad_norm": 0.4015476107597351,
      "learning_rate": 8.860759493670887e-05,
      "loss": 1.0508,
      "step": 105
    },
    {
      "epoch": 0.4641350210970464,
      "grad_norm": 0.3830510675907135,
      "learning_rate": 9.282700421940928e-05,
      "loss": 1.0396,
      "step": 110
    },
    {
      "epoch": 0.48523206751054854,
      "grad_norm": 0.540158748626709,
      "learning_rate": 9.704641350210972e-05,
      "loss": 1.0356,
      "step": 115
    },
    {
      "epoch": 0.5063291139240507,
      "grad_norm": 0.5870048999786377,
      "learning_rate": 0.00010126582278481013,
      "loss": 1.0363,
      "step": 120
    },
    {
      "epoch": 0.5274261603375527,
      "grad_norm": 0.6282536387443542,
      "learning_rate": 0.00010548523206751055,
      "loss": 1.0205,
      "step": 125
    },
    {
      "epoch": 0.5485232067510548,
      "grad_norm": 0.5962668061256409,
      "learning_rate": 0.00010970464135021097,
      "loss": 1.0327,
      "step": 130
    },
    {
      "epoch": 0.569620253164557,
      "grad_norm": 0.44145339727401733,
      "learning_rate": 0.0001139240506329114,
      "loss": 1.0036,
      "step": 135
    },
    {
      "epoch": 0.5907172995780591,
      "grad_norm": 0.3850124776363373,
      "learning_rate": 0.00011814345991561182,
      "loss": 0.9939,
      "step": 140
    },
    {
      "epoch": 0.6118143459915611,
      "grad_norm": 0.45476189255714417,
      "learning_rate": 0.00012236286919831225,
      "loss": 1.01,
      "step": 145
    },
    {
      "epoch": 0.6329113924050633,
      "grad_norm": 0.4156922399997711,
      "learning_rate": 0.00012658227848101267,
      "loss": 1.0046,
      "step": 150
    },
    {
      "epoch": 0.6540084388185654,
      "grad_norm": 0.5297821760177612,
      "learning_rate": 0.00013080168776371308,
      "loss": 0.9885,
      "step": 155
    },
    {
      "epoch": 0.6751054852320675,
      "grad_norm": 0.4995609521865845,
      "learning_rate": 0.0001350210970464135,
      "loss": 0.9885,
      "step": 160
    },
    {
      "epoch": 0.6962025316455697,
      "grad_norm": 0.43320751190185547,
      "learning_rate": 0.00013924050632911395,
      "loss": 0.9818,
      "step": 165
    },
    {
      "epoch": 0.7172995780590717,
      "grad_norm": 0.3719841539859772,
      "learning_rate": 0.00014345991561181436,
      "loss": 0.9739,
      "step": 170
    },
    {
      "epoch": 0.7383966244725738,
      "grad_norm": 0.40309059619903564,
      "learning_rate": 0.00014767932489451478,
      "loss": 0.9646,
      "step": 175
    },
    {
      "epoch": 0.759493670886076,
      "grad_norm": 0.41251224279403687,
      "learning_rate": 0.0001518987341772152,
      "loss": 0.9606,
      "step": 180
    },
    {
      "epoch": 0.7805907172995781,
      "grad_norm": 0.3959939181804657,
      "learning_rate": 0.00015611814345991562,
      "loss": 0.9585,
      "step": 185
    },
    {
      "epoch": 0.8016877637130801,
      "grad_norm": 0.5289701819419861,
      "learning_rate": 0.00016033755274261603,
      "loss": 0.9709,
      "step": 190
    },
    {
      "epoch": 0.8227848101265823,
      "grad_norm": 0.4239669740200043,
      "learning_rate": 0.00016455696202531648,
      "loss": 0.9577,
      "step": 195
    },
    {
      "epoch": 0.8438818565400844,
      "grad_norm": 0.5463127493858337,
      "learning_rate": 0.0001687763713080169,
      "loss": 0.9703,
      "step": 200
    },
    {
      "epoch": 0.8649789029535865,
      "grad_norm": 0.4942500591278076,
      "learning_rate": 0.00017299578059071731,
      "loss": 0.956,
      "step": 205
    },
    {
      "epoch": 0.8860759493670886,
      "grad_norm": 0.361708402633667,
      "learning_rate": 0.00017721518987341773,
      "loss": 0.9598,
      "step": 210
    },
    {
      "epoch": 0.9071729957805907,
      "grad_norm": 0.5146432518959045,
      "learning_rate": 0.00018143459915611815,
      "loss": 0.9606,
      "step": 215
    },
    {
      "epoch": 0.9282700421940928,
      "grad_norm": 0.4746183156967163,
      "learning_rate": 0.00018565400843881857,
      "loss": 0.9255,
      "step": 220
    },
    {
      "epoch": 0.9493670886075949,
      "grad_norm": 0.35139352083206177,
      "learning_rate": 0.00018987341772151899,
      "loss": 0.9452,
      "step": 225
    },
    {
      "epoch": 0.9704641350210971,
      "grad_norm": 0.3744509816169739,
      "learning_rate": 0.00019409282700421943,
      "loss": 0.9388,
      "step": 230
    },
    {
      "epoch": 0.9915611814345991,
      "grad_norm": 0.409365177154541,
      "learning_rate": 0.00019831223628691985,
      "loss": 0.9183,
      "step": 235
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.6516629457473755,
      "eval_runtime": 0.5551,
      "eval_samples_per_second": 3.603,
      "eval_steps_per_second": 1.802,
      "step": 237
    },
    {
      "epoch": 1.0126582278481013,
      "grad_norm": 0.3671579658985138,
      "learning_rate": 0.00019999902382014363,
      "loss": 0.9181,
      "step": 240
    },
    {
      "epoch": 1.0337552742616034,
      "grad_norm": 0.37430304288864136,
      "learning_rate": 0.0001999930583455953,
      "loss": 0.9099,
      "step": 245
    },
    {
      "epoch": 1.0548523206751055,
      "grad_norm": 0.3971017599105835,
      "learning_rate": 0.00019998167004176888,
      "loss": 0.9187,
      "step": 250
    },
    {
      "epoch": 1.0759493670886076,
      "grad_norm": 0.35043320059776306,
      "learning_rate": 0.00019996485952627552,
      "loss": 0.9063,
      "step": 255
    },
    {
      "epoch": 1.0970464135021096,
      "grad_norm": 0.34455016255378723,
      "learning_rate": 0.00019994262771078406,
      "loss": 0.9072,
      "step": 260
    },
    {
      "epoch": 1.1181434599156117,
      "grad_norm": 0.358531653881073,
      "learning_rate": 0.0001999149758009716,
      "loss": 0.916,
      "step": 265
    },
    {
      "epoch": 1.139240506329114,
      "grad_norm": 0.3874802589416504,
      "learning_rate": 0.00019988190529645808,
      "loss": 0.8913,
      "step": 270
    },
    {
      "epoch": 1.160337552742616,
      "grad_norm": 0.3963850736618042,
      "learning_rate": 0.00019984341799072504,
      "loss": 0.9033,
      "step": 275
    },
    {
      "epoch": 1.1814345991561181,
      "grad_norm": 0.485408753156662,
      "learning_rate": 0.0001997995159710182,
      "loss": 0.8965,
      "step": 280
    },
    {
      "epoch": 1.2025316455696202,
      "grad_norm": 0.383575975894928,
      "learning_rate": 0.00019975020161823445,
      "loss": 0.8919,
      "step": 285
    },
    {
      "epoch": 1.2236286919831223,
      "grad_norm": 0.3458747863769531,
      "learning_rate": 0.00019969547760679258,
      "loss": 0.8827,
      "step": 290
    },
    {
      "epoch": 1.2447257383966246,
      "grad_norm": 0.36945974826812744,
      "learning_rate": 0.00019963534690448835,
      "loss": 0.8957,
      "step": 295
    },
    {
      "epoch": 1.2658227848101267,
      "grad_norm": 0.39198634028434753,
      "learning_rate": 0.0001995698127723334,
      "loss": 0.879,
      "step": 300
    },
    {
      "epoch": 1.2869198312236287,
      "grad_norm": 0.36909279227256775,
      "learning_rate": 0.0001994988787643786,
      "loss": 0.9014,
      "step": 305
    },
    {
      "epoch": 1.3080168776371308,
      "grad_norm": 0.3728243112564087,
      "learning_rate": 0.00019942254872752112,
      "loss": 0.891,
      "step": 310
    },
    {
      "epoch": 1.3291139240506329,
      "grad_norm": 0.34412381052970886,
      "learning_rate": 0.00019934082680129586,
      "loss": 0.8744,
      "step": 315
    },
    {
      "epoch": 1.350210970464135,
      "grad_norm": 0.3310409486293793,
      "learning_rate": 0.00019925371741765107,
      "loss": 0.8788,
      "step": 320
    },
    {
      "epoch": 1.371308016877637,
      "grad_norm": 0.3466126620769501,
      "learning_rate": 0.00019916122530070783,
      "loss": 0.8953,
      "step": 325
    },
    {
      "epoch": 1.3924050632911391,
      "grad_norm": 0.3331877291202545,
      "learning_rate": 0.00019906335546650392,
      "loss": 0.8703,
      "step": 330
    },
    {
      "epoch": 1.4135021097046414,
      "grad_norm": 0.37082529067993164,
      "learning_rate": 0.0001989601132227218,
      "loss": 0.8951,
      "step": 335
    },
    {
      "epoch": 1.4345991561181435,
      "grad_norm": 0.3338633179664612,
      "learning_rate": 0.00019885150416840082,
      "loss": 0.8826,
      "step": 340
    },
    {
      "epoch": 1.4556962025316456,
      "grad_norm": 0.32301968336105347,
      "learning_rate": 0.00019873753419363336,
      "loss": 0.8821,
      "step": 345
    },
    {
      "epoch": 1.4767932489451476,
      "grad_norm": 0.3255464732646942,
      "learning_rate": 0.00019861820947924565,
      "loss": 0.87,
      "step": 350
    },
    {
      "epoch": 1.49789029535865,
      "grad_norm": 0.33072352409362793,
      "learning_rate": 0.0001984935364964625,
      "loss": 0.8755,
      "step": 355
    },
    {
      "epoch": 1.518987341772152,
      "grad_norm": 0.39160993695259094,
      "learning_rate": 0.0001983635220065562,
      "loss": 0.861,
      "step": 360
    },
    {
      "epoch": 1.540084388185654,
      "grad_norm": 0.35338205099105835,
      "learning_rate": 0.00019822817306048006,
      "loss": 0.864,
      "step": 365
    },
    {
      "epoch": 1.5611814345991561,
      "grad_norm": 0.30854344367980957,
      "learning_rate": 0.00019808749699848593,
      "loss": 0.8521,
      "step": 370
    },
    {
      "epoch": 1.5822784810126582,
      "grad_norm": 0.3593827188014984,
      "learning_rate": 0.00019794150144972602,
      "loss": 0.8738,
      "step": 375
    },
    {
      "epoch": 1.6033755274261603,
      "grad_norm": 0.30474522709846497,
      "learning_rate": 0.0001977901943318393,
      "loss": 0.8612,
      "step": 380
    },
    {
      "epoch": 1.6244725738396624,
      "grad_norm": 0.3270438611507416,
      "learning_rate": 0.0001976335838505221,
      "loss": 0.8838,
      "step": 385
    },
    {
      "epoch": 1.6455696202531644,
      "grad_norm": 0.3059784471988678,
      "learning_rate": 0.00019747167849908304,
      "loss": 0.8687,
      "step": 390
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.3195934295654297,
      "learning_rate": 0.00019730448705798239,
      "loss": 0.8639,
      "step": 395
    },
    {
      "epoch": 1.6877637130801688,
      "grad_norm": 0.31650060415267944,
      "learning_rate": 0.00019713201859435602,
      "loss": 0.8825,
      "step": 400
    },
    {
      "epoch": 1.7088607594936709,
      "grad_norm": 0.33506250381469727,
      "learning_rate": 0.0001969542824615235,
      "loss": 0.8663,
      "step": 405
    },
    {
      "epoch": 1.729957805907173,
      "grad_norm": 0.3536715805530548,
      "learning_rate": 0.00019677128829848103,
      "loss": 0.8498,
      "step": 410
    },
    {
      "epoch": 1.7510548523206753,
      "grad_norm": 0.35253262519836426,
      "learning_rate": 0.00019658304602937856,
      "loss": 0.8614,
      "step": 415
    },
    {
      "epoch": 1.7721518987341773,
      "grad_norm": 0.3264296054840088,
      "learning_rate": 0.0001963895658629816,
      "loss": 0.8456,
      "step": 420
    },
    {
      "epoch": 1.7932489451476794,
      "grad_norm": 0.32499557733535767,
      "learning_rate": 0.00019619085829211764,
      "loss": 0.8435,
      "step": 425
    },
    {
      "epoch": 1.8143459915611815,
      "grad_norm": 0.31721100211143494,
      "learning_rate": 0.00019598693409310708,
      "loss": 0.8716,
      "step": 430
    },
    {
      "epoch": 1.8354430379746836,
      "grad_norm": 0.31418412923812866,
      "learning_rate": 0.00019577780432517879,
      "loss": 0.859,
      "step": 435
    },
    {
      "epoch": 1.8565400843881856,
      "grad_norm": 0.3075924515724182,
      "learning_rate": 0.0001955634803298703,
      "loss": 0.8573,
      "step": 440
    },
    {
      "epoch": 1.8776371308016877,
      "grad_norm": 0.30187729001045227,
      "learning_rate": 0.00019534397373041285,
      "loss": 0.8381,
      "step": 445
    },
    {
      "epoch": 1.8987341772151898,
      "grad_norm": 0.3083683252334595,
      "learning_rate": 0.00019511929643110097,
      "loss": 0.8536,
      "step": 450
    },
    {
      "epoch": 1.9198312236286919,
      "grad_norm": 0.2972455620765686,
      "learning_rate": 0.0001948894606166468,
      "loss": 0.8487,
      "step": 455
    },
    {
      "epoch": 1.9409282700421941,
      "grad_norm": 0.3294317424297333,
      "learning_rate": 0.00019465447875151946,
      "loss": 0.8485,
      "step": 460
    },
    {
      "epoch": 1.9620253164556962,
      "grad_norm": 0.28597962856292725,
      "learning_rate": 0.00019441436357926892,
      "loss": 0.8608,
      "step": 465
    },
    {
      "epoch": 1.9831223628691983,
      "grad_norm": 0.30924198031425476,
      "learning_rate": 0.00019416912812183498,
      "loss": 0.8583,
      "step": 470
    },
    {
      "epoch": 2.0,
      "eval_loss": 1.629499912261963,
      "eval_runtime": 0.5557,
      "eval_samples_per_second": 3.599,
      "eval_steps_per_second": 1.8,
      "step": 474
    },
    {
      "epoch": 2.0042194092827006,
      "grad_norm": 0.3041239380836487,
      "learning_rate": 0.000193918785678841,
      "loss": 0.8475,
      "step": 475
    },
    {
      "epoch": 2.0253164556962027,
      "grad_norm": 0.31540679931640625,
      "learning_rate": 0.0001936633498268728,
      "loss": 0.8119,
      "step": 480
    },
    {
      "epoch": 2.0464135021097047,
      "grad_norm": 0.3115026652812958,
      "learning_rate": 0.0001934028344187421,
      "loss": 0.8259,
      "step": 485
    },
    {
      "epoch": 2.067510548523207,
      "grad_norm": 0.39703112840652466,
      "learning_rate": 0.00019313725358273548,
      "loss": 0.8041,
      "step": 490
    },
    {
      "epoch": 2.088607594936709,
      "grad_norm": 0.3294001817703247,
      "learning_rate": 0.00019286662172184808,
      "loss": 0.8003,
      "step": 495
    },
    {
      "epoch": 2.109704641350211,
      "grad_norm": 0.3266647160053253,
      "learning_rate": 0.00019259095351300252,
      "loss": 0.8109,
      "step": 500
    },
    {
      "epoch": 2.130801687763713,
      "grad_norm": 0.31092244386672974,
      "learning_rate": 0.0001923102639062529,
      "loss": 0.8212,
      "step": 505
    },
    {
      "epoch": 2.151898734177215,
      "grad_norm": 0.3094409704208374,
      "learning_rate": 0.00019202456812397406,
      "loss": 0.8187,
      "step": 510
    },
    {
      "epoch": 2.172995780590717,
      "grad_norm": 0.32525700330734253,
      "learning_rate": 0.00019173388166003613,
      "loss": 0.8058,
      "step": 515
    },
    {
      "epoch": 2.1940928270042193,
      "grad_norm": 0.296203076839447,
      "learning_rate": 0.00019143822027896406,
      "loss": 0.8037,
      "step": 520
    },
    {
      "epoch": 2.2151898734177213,
      "grad_norm": 0.3076232373714447,
      "learning_rate": 0.0001911376000150828,
      "loss": 0.8208,
      "step": 525
    },
    {
      "epoch": 2.2362869198312234,
      "grad_norm": 0.2956830859184265,
      "learning_rate": 0.0001908320371716478,
      "loss": 0.788,
      "step": 530
    },
    {
      "epoch": 2.257383966244726,
      "grad_norm": 0.3284301161766052,
      "learning_rate": 0.00019052154831996073,
      "loss": 0.7986,
      "step": 535
    },
    {
      "epoch": 2.278481012658228,
      "grad_norm": 0.31620386242866516,
      "learning_rate": 0.00019020615029847072,
      "loss": 0.8049,
      "step": 540
    },
    {
      "epoch": 2.29957805907173,
      "grad_norm": 0.3297165632247925,
      "learning_rate": 0.00018988586021186147,
      "loss": 0.8309,
      "step": 545
    },
    {
      "epoch": 2.320675105485232,
      "grad_norm": 0.2959255874156952,
      "learning_rate": 0.0001895606954301233,
      "loss": 0.7965,
      "step": 550
    },
    {
      "epoch": 2.3417721518987342,
      "grad_norm": 0.3089437782764435,
      "learning_rate": 0.00018923067358761136,
      "loss": 0.7946,
      "step": 555
    },
    {
      "epoch": 2.3628691983122363,
      "grad_norm": 0.30109426379203796,
      "learning_rate": 0.00018889581258208903,
      "loss": 0.8123,
      "step": 560
    },
    {
      "epoch": 2.3839662447257384,
      "grad_norm": 0.32586753368377686,
      "learning_rate": 0.0001885561305737577,
      "loss": 0.8162,
      "step": 565
    },
    {
      "epoch": 2.4050632911392404,
      "grad_norm": 0.3282499611377716,
      "learning_rate": 0.00018821164598427145,
      "loss": 0.8196,
      "step": 570
    },
    {
      "epoch": 2.4261603375527425,
      "grad_norm": 0.3379076421260834,
      "learning_rate": 0.00018786237749573837,
      "loss": 0.816,
      "step": 575
    },
    {
      "epoch": 2.4472573839662446,
      "grad_norm": 0.30591997504234314,
      "learning_rate": 0.00018750834404970718,
      "loss": 0.8015,
      "step": 580
    },
    {
      "epoch": 2.4683544303797467,
      "grad_norm": 0.3031338155269623,
      "learning_rate": 0.00018714956484613995,
      "loss": 0.817,
      "step": 585
    },
    {
      "epoch": 2.489451476793249,
      "grad_norm": 0.3209945261478424,
      "learning_rate": 0.0001867860593423711,
      "loss": 0.8134,
      "step": 590
    },
    {
      "epoch": 2.510548523206751,
      "grad_norm": 0.31199783086776733,
      "learning_rate": 0.000186417847252052,
      "loss": 0.8053,
      "step": 595
    },
    {
      "epoch": 2.5316455696202533,
      "grad_norm": 0.29871612787246704,
      "learning_rate": 0.00018604494854408178,
      "loss": 0.804,
      "step": 600
    },
    {
      "epoch": 2.5527426160337554,
      "grad_norm": 0.31065070629119873,
      "learning_rate": 0.0001856673834415246,
      "loss": 0.8033,
      "step": 605
    },
    {
      "epoch": 2.5738396624472575,
      "grad_norm": 0.2946309447288513,
      "learning_rate": 0.00018528517242051283,
      "loss": 0.8006,
      "step": 610
    },
    {
      "epoch": 2.5949367088607596,
      "grad_norm": 0.2882119119167328,
      "learning_rate": 0.00018489833620913642,
      "loss": 0.8059,
      "step": 615
    },
    {
      "epoch": 2.6160337552742616,
      "grad_norm": 0.31968575716018677,
      "learning_rate": 0.00018450689578631898,
      "loss": 0.8045,
      "step": 620
    },
    {
      "epoch": 2.6371308016877637,
      "grad_norm": 0.3142683804035187,
      "learning_rate": 0.00018411087238068003,
      "loss": 0.8045,
      "step": 625
    },
    {
      "epoch": 2.6582278481012658,
      "grad_norm": 0.3006449043750763,
      "learning_rate": 0.0001837102874693836,
      "loss": 0.8056,
      "step": 630
    },
    {
      "epoch": 2.679324894514768,
      "grad_norm": 0.3158734440803528,
      "learning_rate": 0.0001833051627769736,
      "loss": 0.8275,
      "step": 635
    },
    {
      "epoch": 2.70042194092827,
      "grad_norm": 0.30407387018203735,
      "learning_rate": 0.00018289552027419558,
      "loss": 0.8133,
      "step": 640
    },
    {
      "epoch": 2.721518987341772,
      "grad_norm": 0.3022385835647583,
      "learning_rate": 0.0001824813821768053,
      "loss": 0.8026,
      "step": 645
    },
    {
      "epoch": 2.742616033755274,
      "grad_norm": 0.29126378893852234,
      "learning_rate": 0.00018206277094436377,
      "loss": 0.8075,
      "step": 650
    },
    {
      "epoch": 2.7637130801687766,
      "grad_norm": 0.29224568605422974,
      "learning_rate": 0.00018163970927901937,
      "loss": 0.811,
      "step": 655
    },
    {
      "epoch": 2.7848101265822782,
      "grad_norm": 0.3017180860042572,
      "learning_rate": 0.00018121222012427665,
      "loss": 0.7945,
      "step": 660
    },
    {
      "epoch": 2.8059071729957807,
      "grad_norm": 0.2974776029586792,
      "learning_rate": 0.00018078032666375194,
      "loss": 0.8078,
      "step": 665
    },
    {
      "epoch": 2.827004219409283,
      "grad_norm": 0.2910807430744171,
      "learning_rate": 0.0001803440523199162,
      "loss": 0.7887,
      "step": 670
    },
    {
      "epoch": 2.848101265822785,
      "grad_norm": 0.3062914311885834,
      "learning_rate": 0.0001799034207528247,
      "loss": 0.7928,
      "step": 675
    },
    {
      "epoch": 2.869198312236287,
      "grad_norm": 0.29467758536338806,
      "learning_rate": 0.0001794584558588338,
      "loss": 0.8047,
      "step": 680
    },
    {
      "epoch": 2.890295358649789,
      "grad_norm": 0.32635724544525146,
      "learning_rate": 0.00017900918176930522,
      "loss": 0.8144,
      "step": 685
    },
    {
      "epoch": 2.911392405063291,
      "grad_norm": 0.31900787353515625,
      "learning_rate": 0.00017855562284929718,
      "loss": 0.8089,
      "step": 690
    },
    {
      "epoch": 2.932489451476793,
      "grad_norm": 0.3085595667362213,
      "learning_rate": 0.00017809780369624302,
      "loss": 0.8048,
      "step": 695
    },
    {
      "epoch": 2.9535864978902953,
      "grad_norm": 0.2946968078613281,
      "learning_rate": 0.00017763574913861734,
      "loss": 0.8157,
      "step": 700
    },
    {
      "epoch": 2.9746835443037973,
      "grad_norm": 0.29723235964775085,
      "learning_rate": 0.00017716948423458938,
      "loss": 0.796,
      "step": 705
    },
    {
      "epoch": 2.9957805907173,
      "grad_norm": 0.2712932229042053,
      "learning_rate": 0.00017669903427066424,
      "loss": 0.8179,
      "step": 710
    },
    {
      "epoch": 3.0,
      "eval_loss": 1.6558986902236938,
      "eval_runtime": 0.5507,
      "eval_samples_per_second": 3.632,
      "eval_steps_per_second": 1.816,
      "step": 711
    },
    {
      "epoch": 3.0168776371308015,
      "grad_norm": 0.3356448709964752,
      "learning_rate": 0.0001762244247603113,
      "loss": 0.7628,
      "step": 715
    },
    {
      "epoch": 3.037974683544304,
      "grad_norm": 0.3006523847579956,
      "learning_rate": 0.00017574568144258077,
      "loss": 0.7558,
      "step": 720
    },
    {
      "epoch": 3.059071729957806,
      "grad_norm": 0.30827251076698303,
      "learning_rate": 0.00017526283028070777,
      "loss": 0.7567,
      "step": 725
    },
    {
      "epoch": 3.080168776371308,
      "grad_norm": 0.3096933662891388,
      "learning_rate": 0.00017477589746070417,
      "loss": 0.7581,
      "step": 730
    },
    {
      "epoch": 3.1012658227848102,
      "grad_norm": 0.32005831599235535,
      "learning_rate": 0.00017428490938993862,
      "loss": 0.7549,
      "step": 735
    },
    {
      "epoch": 3.1223628691983123,
      "grad_norm": 0.30930569767951965,
      "learning_rate": 0.00017378989269570437,
      "loss": 0.7702,
      "step": 740
    },
    {
      "epoch": 3.1434599156118144,
      "grad_norm": 0.32762596011161804,
      "learning_rate": 0.0001732908742237752,
      "loss": 0.7471,
      "step": 745
    },
    {
      "epoch": 3.1645569620253164,
      "grad_norm": 0.32086798548698425,
      "learning_rate": 0.00017278788103694943,
      "loss": 0.7618,
      "step": 750
    },
    {
      "epoch": 3.1856540084388185,
      "grad_norm": 0.3558262586593628,
      "learning_rate": 0.00017228094041358248,
      "loss": 0.7764,
      "step": 755
    },
    {
      "epoch": 3.2067510548523206,
      "grad_norm": 0.3397001326084137,
      "learning_rate": 0.0001717700798461074,
      "loss": 0.753,
      "step": 760
    },
    {
      "epoch": 3.2278481012658227,
      "grad_norm": 0.30650395154953003,
      "learning_rate": 0.00017125532703954365,
      "loss": 0.7595,
      "step": 765
    },
    {
      "epoch": 3.2489451476793247,
      "grad_norm": 0.317777156829834,
      "learning_rate": 0.0001707367099099951,
      "loss": 0.7546,
      "step": 770
    },
    {
      "epoch": 3.270042194092827,
      "grad_norm": 0.3183245062828064,
      "learning_rate": 0.00017021425658313565,
      "loss": 0.7633,
      "step": 775
    },
    {
      "epoch": 3.291139240506329,
      "grad_norm": 0.3169344365596771,
      "learning_rate": 0.00016968799539268407,
      "loss": 0.7759,
      "step": 780
    },
    {
      "epoch": 3.3122362869198314,
      "grad_norm": 0.30704858899116516,
      "learning_rate": 0.00016915795487886746,
      "loss": 0.7565,
      "step": 785
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 0.3002530038356781,
      "learning_rate": 0.0001686241637868734,
      "loss": 0.7509,
      "step": 790
    },
    {
      "epoch": 3.3544303797468356,
      "grad_norm": 0.3143273591995239,
      "learning_rate": 0.00016808665106529094,
      "loss": 0.7482,
      "step": 795
    },
    {
      "epoch": 3.3755274261603376,
      "grad_norm": 0.30195352435112,
      "learning_rate": 0.00016754544586454094,
      "loss": 0.762,
      "step": 800
    },
    {
      "epoch": 3.3966244725738397,
      "grad_norm": 0.32630935311317444,
      "learning_rate": 0.00016700057753529484,
      "loss": 0.7637,
      "step": 805
    },
    {
      "epoch": 3.4177215189873418,
      "grad_norm": 0.31649506092071533,
      "learning_rate": 0.0001664520756268832,
      "loss": 0.7577,
      "step": 810
    },
    {
      "epoch": 3.438818565400844,
      "grad_norm": 0.3301686644554138,
      "learning_rate": 0.0001658999698856929,
      "loss": 0.7534,
      "step": 815
    },
    {
      "epoch": 3.459915611814346,
      "grad_norm": 0.3230050802230835,
      "learning_rate": 0.00016534429025355426,
      "loss": 0.7567,
      "step": 820
    },
    {
      "epoch": 3.481012658227848,
      "grad_norm": 0.30652645230293274,
      "learning_rate": 0.00016478506686611697,
      "loss": 0.757,
      "step": 825
    },
    {
      "epoch": 3.50210970464135,
      "grad_norm": 0.32210221886634827,
      "learning_rate": 0.0001642223300512158,
      "loss": 0.7734,
      "step": 830
    },
    {
      "epoch": 3.523206751054852,
      "grad_norm": 0.3032419681549072,
      "learning_rate": 0.00016365611032722604,
      "loss": 0.7519,
      "step": 835
    },
    {
      "epoch": 3.5443037974683547,
      "grad_norm": 0.2990473508834839,
      "learning_rate": 0.00016308643840140828,
      "loss": 0.7579,
      "step": 840
    },
    {
      "epoch": 3.5654008438818563,
      "grad_norm": 0.32090187072753906,
      "learning_rate": 0.000162513345168243,
      "loss": 0.7569,
      "step": 845
    },
    {
      "epoch": 3.586497890295359,
      "grad_norm": 0.3112528920173645,
      "learning_rate": 0.00016193686170775537,
      "loss": 0.7752,
      "step": 850
    },
    {
      "epoch": 3.607594936708861,
      "grad_norm": 0.311675488948822,
      "learning_rate": 0.00016135701928382952,
      "loss": 0.7523,
      "step": 855
    },
    {
      "epoch": 3.628691983122363,
      "grad_norm": 0.316641628742218,
      "learning_rate": 0.000160773849342513,
      "loss": 0.7651,
      "step": 860
    },
    {
      "epoch": 3.649789029535865,
      "grad_norm": 0.32175716757774353,
      "learning_rate": 0.00016018738351031156,
      "loss": 0.7646,
      "step": 865
    },
    {
      "epoch": 3.670886075949367,
      "grad_norm": 0.30499377846717834,
      "learning_rate": 0.00015959765359247388,
      "loss": 0.7654,
      "step": 870
    },
    {
      "epoch": 3.691983122362869,
      "grad_norm": 0.3078381419181824,
      "learning_rate": 0.0001590046915712667,
      "loss": 0.7682,
      "step": 875
    },
    {
      "epoch": 3.7130801687763713,
      "grad_norm": 0.3422172963619232,
      "learning_rate": 0.00015840852960424036,
      "loss": 0.7504,
      "step": 880
    },
    {
      "epoch": 3.7341772151898733,
      "grad_norm": 0.30137816071510315,
      "learning_rate": 0.00015780920002248484,
      "loss": 0.75,
      "step": 885
    },
    {
      "epoch": 3.7552742616033754,
      "grad_norm": 0.31054186820983887,
      "learning_rate": 0.00015720673532887647,
      "loss": 0.7511,
      "step": 890
    },
    {
      "epoch": 3.7763713080168775,
      "grad_norm": 0.3199822008609772,
      "learning_rate": 0.00015660116819631506,
      "loss": 0.7659,
      "step": 895
    },
    {
      "epoch": 3.7974683544303796,
      "grad_norm": 0.30703869462013245,
      "learning_rate": 0.0001559925314659521,
      "loss": 0.7641,
      "step": 900
    },
    {
      "epoch": 3.818565400843882,
      "grad_norm": 0.3145774006843567,
      "learning_rate": 0.00015538085814540962,
      "loss": 0.7589,
      "step": 905
    },
    {
      "epoch": 3.8396624472573837,
      "grad_norm": 0.3188943862915039,
      "learning_rate": 0.00015476618140699034,
      "loss": 0.7615,
      "step": 910
    },
    {
      "epoch": 3.8607594936708862,
      "grad_norm": 0.32847416400909424,
      "learning_rate": 0.00015414853458587833,
      "loss": 0.7569,
      "step": 915
    },
    {
      "epoch": 3.8818565400843883,
      "grad_norm": 0.33269551396369934,
      "learning_rate": 0.00015352795117833145,
      "loss": 0.7539,
      "step": 920
    },
    {
      "epoch": 3.9029535864978904,
      "grad_norm": 0.30027341842651367,
      "learning_rate": 0.00015290446483986472,
      "loss": 0.76,
      "step": 925
    },
    {
      "epoch": 3.9240506329113924,
      "grad_norm": 0.3010607063770294,
      "learning_rate": 0.00015227810938342492,
      "loss": 0.7574,
      "step": 930
    },
    {
      "epoch": 3.9451476793248945,
      "grad_norm": 0.30083900690078735,
      "learning_rate": 0.0001516489187775572,
      "loss": 0.7556,
      "step": 935
    },
    {
      "epoch": 3.9662447257383966,
      "grad_norm": 0.30435124039649963,
      "learning_rate": 0.00015101692714456259,
      "loss": 0.7612,
      "step": 940
    },
    {
      "epoch": 3.9873417721518987,
      "grad_norm": 0.31260964274406433,
      "learning_rate": 0.00015038216875864756,
      "loss": 0.7533,
      "step": 945
    },
    {
      "epoch": 4.0,
      "eval_loss": 1.6894222497940063,
      "eval_runtime": 0.5552,
      "eval_samples_per_second": 3.602,
      "eval_steps_per_second": 1.801,
      "step": 948
    },
    {
      "epoch": 4.008438818565401,
      "grad_norm": 0.3140685558319092,
      "learning_rate": 0.00014974467804406533,
      "loss": 0.749,
      "step": 950
    },
    {
      "epoch": 4.029535864978903,
      "grad_norm": 0.3326011896133423,
      "learning_rate": 0.00014910448957324897,
      "loss": 0.7177,
      "step": 955
    },
    {
      "epoch": 4.050632911392405,
      "grad_norm": 0.32034996151924133,
      "learning_rate": 0.00014846163806493627,
      "loss": 0.7061,
      "step": 960
    },
    {
      "epoch": 4.071729957805907,
      "grad_norm": 0.31769704818725586,
      "learning_rate": 0.00014781615838228715,
      "loss": 0.6986,
      "step": 965
    },
    {
      "epoch": 4.0928270042194095,
      "grad_norm": 0.35571393370628357,
      "learning_rate": 0.00014716808553099286,
      "loss": 0.7042,
      "step": 970
    },
    {
      "epoch": 4.113924050632911,
      "grad_norm": 0.33056944608688354,
      "learning_rate": 0.00014651745465737737,
      "loss": 0.7195,
      "step": 975
    },
    {
      "epoch": 4.135021097046414,
      "grad_norm": 0.35726672410964966,
      "learning_rate": 0.00014586430104649163,
      "loss": 0.7245,
      "step": 980
    },
    {
      "epoch": 4.156118143459915,
      "grad_norm": 0.3273336887359619,
      "learning_rate": 0.0001452086601201997,
      "loss": 0.709,
      "step": 985
    },
    {
      "epoch": 4.177215189873418,
      "grad_norm": 0.33940553665161133,
      "learning_rate": 0.00014455056743525792,
      "loss": 0.7115,
      "step": 990
    },
    {
      "epoch": 4.198312236286919,
      "grad_norm": 0.34996211528778076,
      "learning_rate": 0.00014389005868138658,
      "loss": 0.7078,
      "step": 995
    },
    {
      "epoch": 4.219409282700422,
      "grad_norm": 0.33837664127349854,
      "learning_rate": 0.00014322716967933428,
      "loss": 0.7042,
      "step": 1000
    },
    {
      "epoch": 4.2405063291139244,
      "grad_norm": 0.3329886198043823,
      "learning_rate": 0.0001425619363789354,
      "loss": 0.7212,
      "step": 1005
    },
    {
      "epoch": 4.261603375527426,
      "grad_norm": 0.35570377111434937,
      "learning_rate": 0.00014189439485716053,
      "loss": 0.7088,
      "step": 1010
    },
    {
      "epoch": 4.282700421940929,
      "grad_norm": 0.3659791648387909,
      "learning_rate": 0.00014122458131615975,
      "loss": 0.7023,
      "step": 1015
    },
    {
      "epoch": 4.30379746835443,
      "grad_norm": 0.3362638056278229,
      "learning_rate": 0.00014055253208129938,
      "loss": 0.7138,
      "step": 1020
    },
    {
      "epoch": 4.324894514767933,
      "grad_norm": 0.3303203284740448,
      "learning_rate": 0.00013987828359919222,
      "loss": 0.7085,
      "step": 1025
    },
    {
      "epoch": 4.345991561181434,
      "grad_norm": 0.32455962896347046,
      "learning_rate": 0.00013920187243572057,
      "loss": 0.7142,
      "step": 1030
    },
    {
      "epoch": 4.367088607594937,
      "grad_norm": 0.33820950984954834,
      "learning_rate": 0.00013852333527405346,
      "loss": 0.7198,
      "step": 1035
    },
    {
      "epoch": 4.3881856540084385,
      "grad_norm": 0.3443733751773834,
      "learning_rate": 0.00013784270891265717,
      "loss": 0.7281,
      "step": 1040
    },
    {
      "epoch": 4.409282700421941,
      "grad_norm": 0.3376203179359436,
      "learning_rate": 0.00013716003026329965,
      "loss": 0.7157,
      "step": 1045
    },
    {
      "epoch": 4.430379746835443,
      "grad_norm": 0.3343973159790039,
      "learning_rate": 0.0001364753363490485,
      "loss": 0.7157,
      "step": 1050
    },
    {
      "epoch": 4.451476793248945,
      "grad_norm": 0.32973435521125793,
      "learning_rate": 0.00013578866430226342,
      "loss": 0.7183,
      "step": 1055
    },
    {
      "epoch": 4.472573839662447,
      "grad_norm": 0.3444620370864868,
      "learning_rate": 0.00013510005136258227,
      "loss": 0.7196,
      "step": 1060
    },
    {
      "epoch": 4.493670886075949,
      "grad_norm": 0.33004656434059143,
      "learning_rate": 0.00013440953487490144,
      "loss": 0.7139,
      "step": 1065
    },
    {
      "epoch": 4.514767932489452,
      "grad_norm": 0.3244040608406067,
      "learning_rate": 0.00013371715228735077,
      "loss": 0.7144,
      "step": 1070
    },
    {
      "epoch": 4.5358649789029535,
      "grad_norm": 0.3370364308357239,
      "learning_rate": 0.0001330229411492625,
      "loss": 0.7014,
      "step": 1075
    },
    {
      "epoch": 4.556962025316456,
      "grad_norm": 0.3164542317390442,
      "learning_rate": 0.00013232693910913485,
      "loss": 0.7124,
      "step": 1080
    },
    {
      "epoch": 4.578059071729958,
      "grad_norm": 0.3478745222091675,
      "learning_rate": 0.0001316291839125904,
      "loss": 0.7253,
      "step": 1085
    },
    {
      "epoch": 4.59915611814346,
      "grad_norm": 0.33551761507987976,
      "learning_rate": 0.00013092971340032905,
      "loss": 0.7237,
      "step": 1090
    },
    {
      "epoch": 4.620253164556962,
      "grad_norm": 0.3593490421772003,
      "learning_rate": 0.00013022856550607572,
      "loss": 0.7187,
      "step": 1095
    },
    {
      "epoch": 4.641350210970464,
      "grad_norm": 0.33983170986175537,
      "learning_rate": 0.0001295257782545233,
      "loss": 0.715,
      "step": 1100
    },
    {
      "epoch": 4.662447257383966,
      "grad_norm": 0.3238469064235687,
      "learning_rate": 0.00012882138975927026,
      "loss": 0.7024,
      "step": 1105
    },
    {
      "epoch": 4.6835443037974684,
      "grad_norm": 0.3401734232902527,
      "learning_rate": 0.00012811543822075397,
      "loss": 0.7175,
      "step": 1110
    },
    {
      "epoch": 4.70464135021097,
      "grad_norm": 0.35343295335769653,
      "learning_rate": 0.00012740796192417875,
      "loss": 0.7445,
      "step": 1115
    },
    {
      "epoch": 4.725738396624473,
      "grad_norm": 0.3328774869441986,
      "learning_rate": 0.00012669899923743968,
      "loss": 0.7007,
      "step": 1120
    },
    {
      "epoch": 4.746835443037975,
      "grad_norm": 0.341886967420578,
      "learning_rate": 0.00012598858860904193,
      "loss": 0.7275,
      "step": 1125
    },
    {
      "epoch": 4.767932489451477,
      "grad_norm": 0.33224210143089294,
      "learning_rate": 0.00012527676856601542,
      "loss": 0.7093,
      "step": 1130
    },
    {
      "epoch": 4.789029535864979,
      "grad_norm": 0.3608289062976837,
      "learning_rate": 0.0001245635777118256,
      "loss": 0.7237,
      "step": 1135
    },
    {
      "epoch": 4.810126582278481,
      "grad_norm": 0.3258775472640991,
      "learning_rate": 0.00012384905472427975,
      "loss": 0.7068,
      "step": 1140
    },
    {
      "epoch": 4.831223628691983,
      "grad_norm": 0.3356561064720154,
      "learning_rate": 0.0001231332383534296,
      "loss": 0.7208,
      "step": 1145
    },
    {
      "epoch": 4.852320675105485,
      "grad_norm": 0.32746249437332153,
      "learning_rate": 0.00012241616741946962,
      "loss": 0.7143,
      "step": 1150
    },
    {
      "epoch": 4.8734177215189876,
      "grad_norm": 0.33153674006462097,
      "learning_rate": 0.0001216978808106318,
      "loss": 0.726,
      "step": 1155
    },
    {
      "epoch": 4.894514767932489,
      "grad_norm": 0.33083775639533997,
      "learning_rate": 0.00012097841748107681,
      "loss": 0.7015,
      "step": 1160
    },
    {
      "epoch": 4.915611814345992,
      "grad_norm": 0.3352629542350769,
      "learning_rate": 0.00012025781644878118,
      "loss": 0.7234,
      "step": 1165
    },
    {
      "epoch": 4.936708860759493,
      "grad_norm": 0.3423599898815155,
      "learning_rate": 0.00011953611679342143,
      "loss": 0.733,
      "step": 1170
    },
    {
      "epoch": 4.957805907172996,
      "grad_norm": 0.3402250409126282,
      "learning_rate": 0.00011881335765425473,
      "loss": 0.7187,
      "step": 1175
    },
    {
      "epoch": 4.978902953586498,
      "grad_norm": 0.345759779214859,
      "learning_rate": 0.00011808957822799614,
      "loss": 0.7119,
      "step": 1180
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.3274582326412201,
      "learning_rate": 0.00011736481776669306,
      "loss": 0.716,
      "step": 1185
    },
    {
      "epoch": 5.0,
      "eval_loss": 1.7251324653625488,
      "eval_runtime": 0.554,
      "eval_samples_per_second": 3.61,
      "eval_steps_per_second": 1.805,
      "step": 1185
    },
    {
      "epoch": 5.0210970464135025,
      "grad_norm": 0.36755794286727905,
      "learning_rate": 0.0001166391155755964,
      "loss": 0.6589,
      "step": 1190
    },
    {
      "epoch": 5.042194092827004,
      "grad_norm": 0.3618139624595642,
      "learning_rate": 0.00011591251101102906,
      "loss": 0.6697,
      "step": 1195
    },
    {
      "epoch": 5.063291139240507,
      "grad_norm": 0.38643401861190796,
      "learning_rate": 0.00011518504347825145,
      "loss": 0.661,
      "step": 1200
    },
    {
      "epoch": 5.084388185654008,
      "grad_norm": 0.3515397012233734,
      "learning_rate": 0.00011445675242932457,
      "loss": 0.6455,
      "step": 1205
    },
    {
      "epoch": 5.105485232067511,
      "grad_norm": 0.37624698877334595,
      "learning_rate": 0.00011372767736097039,
      "loss": 0.6628,
      "step": 1210
    },
    {
      "epoch": 5.1265822784810124,
      "grad_norm": 0.3468095660209656,
      "learning_rate": 0.00011299785781242982,
      "loss": 0.6591,
      "step": 1215
    },
    {
      "epoch": 5.147679324894515,
      "grad_norm": 0.3849187195301056,
      "learning_rate": 0.00011226733336331855,
      "loss": 0.6726,
      "step": 1220
    },
    {
      "epoch": 5.168776371308017,
      "grad_norm": 0.36786338686943054,
      "learning_rate": 0.00011153614363148032,
      "loss": 0.6795,
      "step": 1225
    },
    {
      "epoch": 5.189873417721519,
      "grad_norm": 0.35997211933135986,
      "learning_rate": 0.00011080432827083873,
      "loss": 0.676,
      "step": 1230
    },
    {
      "epoch": 5.210970464135021,
      "grad_norm": 0.3702506721019745,
      "learning_rate": 0.00011007192696924638,
      "loss": 0.6734,
      "step": 1235
    },
    {
      "epoch": 5.232067510548523,
      "grad_norm": 0.35727155208587646,
      "learning_rate": 0.00010933897944633265,
      "loss": 0.6719,
      "step": 1240
    },
    {
      "epoch": 5.253164556962025,
      "grad_norm": 0.35158923268318176,
      "learning_rate": 0.0001086055254513497,
      "loss": 0.6572,
      "step": 1245
    },
    {
      "epoch": 5.274261603375527,
      "grad_norm": 0.3676392734050751,
      "learning_rate": 0.00010787160476101668,
      "loss": 0.663,
      "step": 1250
    },
    {
      "epoch": 5.29535864978903,
      "grad_norm": 0.35416457056999207,
      "learning_rate": 0.00010713725717736254,
      "loss": 0.6619,
      "step": 1255
    },
    {
      "epoch": 5.3164556962025316,
      "grad_norm": 0.36827412247657776,
      "learning_rate": 0.00010640252252556759,
      "loss": 0.6861,
      "step": 1260
    },
    {
      "epoch": 5.337552742616034,
      "grad_norm": 0.37270885705947876,
      "learning_rate": 0.00010566744065180368,
      "loss": 0.6842,
      "step": 1265
    },
    {
      "epoch": 5.358649789029536,
      "grad_norm": 0.6396368741989136,
      "learning_rate": 0.00010493205142107312,
      "loss": 0.6648,
      "step": 1270
    },
    {
      "epoch": 5.379746835443038,
      "grad_norm": 0.3901231288909912,
      "learning_rate": 0.00010419639471504682,
      "loss": 0.6682,
      "step": 1275
    },
    {
      "epoch": 5.40084388185654,
      "grad_norm": 0.3932683765888214,
      "learning_rate": 0.0001034605104299016,
      "loss": 0.6715,
      "step": 1280
    },
    {
      "epoch": 5.421940928270042,
      "grad_norm": 0.3795235753059387,
      "learning_rate": 0.00010272443847415615,
      "loss": 0.6826,
      "step": 1285
    },
    {
      "epoch": 5.443037974683544,
      "grad_norm": 0.3844228982925415,
      "learning_rate": 0.00010198821876650701,
      "loss": 0.6624,
      "step": 1290
    },
    {
      "epoch": 5.4641350210970465,
      "grad_norm": 0.37277084589004517,
      "learning_rate": 0.00010125189123366368,
      "loss": 0.6818,
      "step": 1295
    },
    {
      "epoch": 5.485232067510548,
      "grad_norm": 0.3795084059238434,
      "learning_rate": 0.0001005154958081831,
      "loss": 0.6688,
      "step": 1300
    },
    {
      "epoch": 5.506329113924051,
      "grad_norm": 0.37196341156959534,
      "learning_rate": 9.977907242630426e-05,
      "loss": 0.6627,
      "step": 1305
    },
    {
      "epoch": 5.527426160337553,
      "grad_norm": 0.3792167603969574,
      "learning_rate": 9.904266102578231e-05,
      "loss": 0.6768,
      "step": 1310
    },
    {
      "epoch": 5.548523206751055,
      "grad_norm": 0.3688276410102844,
      "learning_rate": 9.830630154372252e-05,
      "loss": 0.6663,
      "step": 1315
    },
    {
      "epoch": 5.569620253164557,
      "grad_norm": 0.3876282870769501,
      "learning_rate": 9.75700339144146e-05,
      "loss": 0.6757,
      "step": 1320
    },
    {
      "epoch": 5.590717299578059,
      "grad_norm": 0.35115256905555725,
      "learning_rate": 9.68338980671669e-05,
      "loss": 0.6846,
      "step": 1325
    },
    {
      "epoch": 5.6118143459915615,
      "grad_norm": 0.3650346100330353,
      "learning_rate": 9.609793392414086e-05,
      "loss": 0.6948,
      "step": 1330
    },
    {
      "epoch": 5.632911392405063,
      "grad_norm": 0.3864571750164032,
      "learning_rate": 9.536218139818614e-05,
      "loss": 0.6712,
      "step": 1335
    },
    {
      "epoch": 5.654008438818566,
      "grad_norm": 0.36888009309768677,
      "learning_rate": 9.462668039067602e-05,
      "loss": 0.6705,
      "step": 1340
    },
    {
      "epoch": 5.675105485232067,
      "grad_norm": 0.36247017979621887,
      "learning_rate": 9.389147078934329e-05,
      "loss": 0.6696,
      "step": 1345
    },
    {
      "epoch": 5.69620253164557,
      "grad_norm": 0.3620111048221588,
      "learning_rate": 9.31565924661172e-05,
      "loss": 0.6686,
      "step": 1350
    },
    {
      "epoch": 5.717299578059071,
      "grad_norm": 0.3552044630050659,
      "learning_rate": 9.242208527496121e-05,
      "loss": 0.6922,
      "step": 1355
    },
    {
      "epoch": 5.738396624472574,
      "grad_norm": 0.36270490288734436,
      "learning_rate": 9.168798904971143e-05,
      "loss": 0.6625,
      "step": 1360
    },
    {
      "epoch": 5.759493670886076,
      "grad_norm": 0.3620161712169647,
      "learning_rate": 9.095434360191642e-05,
      "loss": 0.6684,
      "step": 1365
    },
    {
      "epoch": 5.780590717299578,
      "grad_norm": 0.37736937403678894,
      "learning_rate": 9.02211887186783e-05,
      "loss": 0.6896,
      "step": 1370
    },
    {
      "epoch": 5.80168776371308,
      "grad_norm": 0.4165714979171753,
      "learning_rate": 8.948856416049475e-05,
      "loss": 0.6704,
      "step": 1375
    },
    {
      "epoch": 5.822784810126582,
      "grad_norm": 0.3674893081188202,
      "learning_rate": 8.875650965910279e-05,
      "loss": 0.6871,
      "step": 1380
    },
    {
      "epoch": 5.843881856540085,
      "grad_norm": 0.39419984817504883,
      "learning_rate": 8.802506491532421e-05,
      "loss": 0.6941,
      "step": 1385
    },
    {
      "epoch": 5.864978902953586,
      "grad_norm": 0.3581133782863617,
      "learning_rate": 8.72942695969123e-05,
      "loss": 0.6815,
      "step": 1390
    },
    {
      "epoch": 5.886075949367089,
      "grad_norm": 0.3663847744464874,
      "learning_rate": 8.656416333640066e-05,
      "loss": 0.6792,
      "step": 1395
    },
    {
      "epoch": 5.9071729957805905,
      "grad_norm": 0.39068740606307983,
      "learning_rate": 8.583478572895394e-05,
      "loss": 0.6689,
      "step": 1400
    },
    {
      "epoch": 5.928270042194093,
      "grad_norm": 0.3782387971878052,
      "learning_rate": 8.510617633022044e-05,
      "loss": 0.6825,
      "step": 1405
    },
    {
      "epoch": 5.949367088607595,
      "grad_norm": 0.3802437484264374,
      "learning_rate": 8.437837465418684e-05,
      "loss": 0.669,
      "step": 1410
    },
    {
      "epoch": 5.970464135021097,
      "grad_norm": 0.35812443494796753,
      "learning_rate": 8.365142017103542e-05,
      "loss": 0.6788,
      "step": 1415
    },
    {
      "epoch": 5.991561181434599,
      "grad_norm": 0.36878547072410583,
      "learning_rate": 8.292535230500342e-05,
      "loss": 0.6876,
      "step": 1420
    },
    {
      "epoch": 6.0,
      "eval_loss": 1.782979130744934,
      "eval_runtime": 0.5539,
      "eval_samples_per_second": 3.611,
      "eval_steps_per_second": 1.805,
      "step": 1422
    },
    {
      "epoch": 6.012658227848101,
      "grad_norm": 0.40747499465942383,
      "learning_rate": 8.2200210432245e-05,
      "loss": 0.6326,
      "step": 1425
    },
    {
      "epoch": 6.033755274261603,
      "grad_norm": 0.40314236283302307,
      "learning_rate": 8.147603387869582e-05,
      "loss": 0.6234,
      "step": 1430
    },
    {
      "epoch": 6.0548523206751055,
      "grad_norm": 0.3922586739063263,
      "learning_rate": 8.075286191794025e-05,
      "loss": 0.6238,
      "step": 1435
    },
    {
      "epoch": 6.075949367088608,
      "grad_norm": 0.41105443239212036,
      "learning_rate": 8.003073376908163e-05,
      "loss": 0.6312,
      "step": 1440
    },
    {
      "epoch": 6.09704641350211,
      "grad_norm": 0.3970966339111328,
      "learning_rate": 7.930968859461516e-05,
      "loss": 0.6233,
      "step": 1445
    },
    {
      "epoch": 6.118143459915612,
      "grad_norm": 0.42427581548690796,
      "learning_rate": 7.85897654983041e-05,
      "loss": 0.6348,
      "step": 1450
    },
    {
      "epoch": 6.139240506329114,
      "grad_norm": 0.38989487290382385,
      "learning_rate": 7.787100352305908e-05,
      "loss": 0.6237,
      "step": 1455
    },
    {
      "epoch": 6.160337552742616,
      "grad_norm": 0.4042844772338867,
      "learning_rate": 7.715344164882085e-05,
      "loss": 0.6232,
      "step": 1460
    },
    {
      "epoch": 6.181434599156118,
      "grad_norm": 0.40070950984954834,
      "learning_rate": 7.643711879044612e-05,
      "loss": 0.6173,
      "step": 1465
    },
    {
      "epoch": 6.2025316455696204,
      "grad_norm": 0.40951260924339294,
      "learning_rate": 7.572207379559721e-05,
      "loss": 0.6369,
      "step": 1470
    },
    {
      "epoch": 6.223628691983122,
      "grad_norm": 0.40946945548057556,
      "learning_rate": 7.50083454426354e-05,
      "loss": 0.6267,
      "step": 1475
    },
    {
      "epoch": 6.244725738396625,
      "grad_norm": 0.40567830204963684,
      "learning_rate": 7.429597243851764e-05,
      "loss": 0.616,
      "step": 1480
    },
    {
      "epoch": 6.265822784810126,
      "grad_norm": 0.4094925820827484,
      "learning_rate": 7.358499341669756e-05,
      "loss": 0.6231,
      "step": 1485
    },
    {
      "epoch": 6.286919831223629,
      "grad_norm": 0.396982878446579,
      "learning_rate": 7.287544693503028e-05,
      "loss": 0.6263,
      "step": 1490
    },
    {
      "epoch": 6.308016877637131,
      "grad_norm": 0.41034215688705444,
      "learning_rate": 7.216737147368127e-05,
      "loss": 0.6466,
      "step": 1495
    },
    {
      "epoch": 6.329113924050633,
      "grad_norm": 0.4219072163105011,
      "learning_rate": 7.146080543303965e-05,
      "loss": 0.6479,
      "step": 1500
    },
    {
      "epoch": 6.350210970464135,
      "grad_norm": 0.39759665727615356,
      "learning_rate": 7.075578713163541e-05,
      "loss": 0.6235,
      "step": 1505
    },
    {
      "epoch": 6.371308016877637,
      "grad_norm": 0.4137880504131317,
      "learning_rate": 7.00523548040616e-05,
      "loss": 0.6221,
      "step": 1510
    },
    {
      "epoch": 6.3924050632911396,
      "grad_norm": 0.4084639847278595,
      "learning_rate": 6.935054659890052e-05,
      "loss": 0.633,
      "step": 1515
    },
    {
      "epoch": 6.413502109704641,
      "grad_norm": 0.39727863669395447,
      "learning_rate": 6.865040057665506e-05,
      "loss": 0.6356,
      "step": 1520
    },
    {
      "epoch": 6.434599156118144,
      "grad_norm": 0.4197627007961273,
      "learning_rate": 6.795195470768444e-05,
      "loss": 0.6355,
      "step": 1525
    },
    {
      "epoch": 6.455696202531645,
      "grad_norm": 0.4036734402179718,
      "learning_rate": 6.725524687014514e-05,
      "loss": 0.6367,
      "step": 1530
    },
    {
      "epoch": 6.476793248945148,
      "grad_norm": 0.4073878526687622,
      "learning_rate": 6.656031484793657e-05,
      "loss": 0.6367,
      "step": 1535
    },
    {
      "epoch": 6.4978902953586495,
      "grad_norm": 0.4095742702484131,
      "learning_rate": 6.586719632865198e-05,
      "loss": 0.6292,
      "step": 1540
    },
    {
      "epoch": 6.518987341772152,
      "grad_norm": 0.408542662858963,
      "learning_rate": 6.517592890153476e-05,
      "loss": 0.6312,
      "step": 1545
    },
    {
      "epoch": 6.540084388185654,
      "grad_norm": 0.4064979553222656,
      "learning_rate": 6.448655005543969e-05,
      "loss": 0.6373,
      "step": 1550
    },
    {
      "epoch": 6.561181434599156,
      "grad_norm": 0.4208141565322876,
      "learning_rate": 6.379909717679985e-05,
      "loss": 0.6289,
      "step": 1555
    },
    {
      "epoch": 6.582278481012658,
      "grad_norm": 0.4085118770599365,
      "learning_rate": 6.311360754759923e-05,
      "loss": 0.6289,
      "step": 1560
    },
    {
      "epoch": 6.60337552742616,
      "grad_norm": 0.4019670784473419,
      "learning_rate": 6.243011834335075e-05,
      "loss": 0.639,
      "step": 1565
    },
    {
      "epoch": 6.624472573839663,
      "grad_norm": 0.4115982949733734,
      "learning_rate": 6.17486666310801e-05,
      "loss": 0.6437,
      "step": 1570
    },
    {
      "epoch": 6.6455696202531644,
      "grad_norm": 0.40410783886909485,
      "learning_rate": 6.106928936731571e-05,
      "loss": 0.6439,
      "step": 1575
    },
    {
      "epoch": 6.666666666666667,
      "grad_norm": 0.3954565227031708,
      "learning_rate": 6.039202339608432e-05,
      "loss": 0.6339,
      "step": 1580
    },
    {
      "epoch": 6.687763713080169,
      "grad_norm": 0.40417176485061646,
      "learning_rate": 5.971690544691294e-05,
      "loss": 0.6238,
      "step": 1585
    },
    {
      "epoch": 6.708860759493671,
      "grad_norm": 0.40106064081192017,
      "learning_rate": 5.90439721328369e-05,
      "loss": 0.6183,
      "step": 1590
    },
    {
      "epoch": 6.729957805907173,
      "grad_norm": 0.3997708261013031,
      "learning_rate": 5.837325994841434e-05,
      "loss": 0.6349,
      "step": 1595
    },
    {
      "epoch": 6.751054852320675,
      "grad_norm": 0.40423154830932617,
      "learning_rate": 5.770480526774693e-05,
      "loss": 0.6319,
      "step": 1600
    },
    {
      "epoch": 6.772151898734177,
      "grad_norm": 0.39728954434394836,
      "learning_rate": 5.7038644342507205e-05,
      "loss": 0.6454,
      "step": 1605
    },
    {
      "epoch": 6.793248945147679,
      "grad_norm": 0.4143037497997284,
      "learning_rate": 5.6374813299972805e-05,
      "loss": 0.6532,
      "step": 1610
    },
    {
      "epoch": 6.814345991561181,
      "grad_norm": 0.4104886054992676,
      "learning_rate": 5.571334814106681e-05,
      "loss": 0.6375,
      "step": 1615
    },
    {
      "epoch": 6.8354430379746836,
      "grad_norm": 0.41742509603500366,
      "learning_rate": 5.505428473840576e-05,
      "loss": 0.6443,
      "step": 1620
    },
    {
      "epoch": 6.856540084388186,
      "grad_norm": 0.4019664227962494,
      "learning_rate": 5.4397658834353895e-05,
      "loss": 0.6207,
      "step": 1625
    },
    {
      "epoch": 6.877637130801688,
      "grad_norm": 0.4325370490550995,
      "learning_rate": 5.3743506039084913e-05,
      "loss": 0.6357,
      "step": 1630
    },
    {
      "epoch": 6.89873417721519,
      "grad_norm": 0.4043619632720947,
      "learning_rate": 5.309186182865076e-05,
      "loss": 0.646,
      "step": 1635
    },
    {
      "epoch": 6.919831223628692,
      "grad_norm": 0.4148579239845276,
      "learning_rate": 5.244276154305758e-05,
      "loss": 0.6417,
      "step": 1640
    },
    {
      "epoch": 6.940928270042194,
      "grad_norm": 0.41201284527778625,
      "learning_rate": 5.179624038434938e-05,
      "loss": 0.6396,
      "step": 1645
    },
    {
      "epoch": 6.962025316455696,
      "grad_norm": 0.4112018644809723,
      "learning_rate": 5.115233341469877e-05,
      "loss": 0.6391,
      "step": 1650
    },
    {
      "epoch": 6.9831223628691985,
      "grad_norm": 0.42246147990226746,
      "learning_rate": 5.0511075554505426e-05,
      "loss": 0.6344,
      "step": 1655
    },
    {
      "epoch": 7.0,
      "eval_loss": 1.8556586503982544,
      "eval_runtime": 0.5548,
      "eval_samples_per_second": 3.605,
      "eval_steps_per_second": 1.802,
      "step": 1659
    },
    {
      "epoch": 7.0042194092827,
      "grad_norm": 0.38922393321990967,
      "learning_rate": 4.987250158050244e-05,
      "loss": 0.6267,
      "step": 1660
    },
    {
      "epoch": 7.025316455696203,
      "grad_norm": 0.4556201100349426,
      "learning_rate": 4.923664612387019e-05,
      "loss": 0.5894,
      "step": 1665
    },
    {
      "epoch": 7.046413502109704,
      "grad_norm": 0.4320254325866699,
      "learning_rate": 4.860354366835825e-05,
      "loss": 0.6007,
      "step": 1670
    },
    {
      "epoch": 7.067510548523207,
      "grad_norm": 0.41525062918663025,
      "learning_rate": 4.7973228548415385e-05,
      "loss": 0.5944,
      "step": 1675
    },
    {
      "epoch": 7.0886075949367084,
      "grad_norm": 0.46430733799934387,
      "learning_rate": 4.734573494732735e-05,
      "loss": 0.5945,
      "step": 1680
    },
    {
      "epoch": 7.109704641350211,
      "grad_norm": 0.421763151884079,
      "learning_rate": 4.6721096895363114e-05,
      "loss": 0.583,
      "step": 1685
    },
    {
      "epoch": 7.1308016877637135,
      "grad_norm": 0.44340547919273376,
      "learning_rate": 4.6099348267929334e-05,
      "loss": 0.6034,
      "step": 1690
    },
    {
      "epoch": 7.151898734177215,
      "grad_norm": 0.4334201216697693,
      "learning_rate": 4.548052278373327e-05,
      "loss": 0.592,
      "step": 1695
    },
    {
      "epoch": 7.172995780590718,
      "grad_norm": 0.4375658631324768,
      "learning_rate": 4.486465400295404e-05,
      "loss": 0.5942,
      "step": 1700
    },
    {
      "epoch": 7.194092827004219,
      "grad_norm": 0.4318469762802124,
      "learning_rate": 4.4251775325422795e-05,
      "loss": 0.6079,
      "step": 1705
    },
    {
      "epoch": 7.215189873417722,
      "grad_norm": 0.4487842619419098,
      "learning_rate": 4.364191998881104e-05,
      "loss": 0.5938,
      "step": 1710
    },
    {
      "epoch": 7.236286919831223,
      "grad_norm": 0.4327734112739563,
      "learning_rate": 4.303512106682849e-05,
      "loss": 0.5965,
      "step": 1715
    },
    {
      "epoch": 7.257383966244726,
      "grad_norm": 0.4447080194950104,
      "learning_rate": 4.243141146742905e-05,
      "loss": 0.5953,
      "step": 1720
    },
    {
      "epoch": 7.2784810126582276,
      "grad_norm": 0.4422175884246826,
      "learning_rate": 4.183082393102636e-05,
      "loss": 0.5849,
      "step": 1725
    },
    {
      "epoch": 7.29957805907173,
      "grad_norm": 0.4476224184036255,
      "learning_rate": 4.1233391028718116e-05,
      "loss": 0.5962,
      "step": 1730
    },
    {
      "epoch": 7.320675105485232,
      "grad_norm": 0.4534938931465149,
      "learning_rate": 4.063914516051984e-05,
      "loss": 0.5838,
      "step": 1735
    },
    {
      "epoch": 7.341772151898734,
      "grad_norm": 0.45842060446739197,
      "learning_rate": 4.004811855360748e-05,
      "loss": 0.6046,
      "step": 1740
    },
    {
      "epoch": 7.362869198312236,
      "grad_norm": 0.43340378999710083,
      "learning_rate": 3.9460343260569964e-05,
      "loss": 0.5972,
      "step": 1745
    },
    {
      "epoch": 7.383966244725738,
      "grad_norm": 0.4477992057800293,
      "learning_rate": 3.887585115767068e-05,
      "loss": 0.6067,
      "step": 1750
    },
    {
      "epoch": 7.405063291139241,
      "grad_norm": 0.44521939754486084,
      "learning_rate": 3.82946739431189e-05,
      "loss": 0.5959,
      "step": 1755
    },
    {
      "epoch": 7.4261603375527425,
      "grad_norm": 0.42936068773269653,
      "learning_rate": 3.771684313535062e-05,
      "loss": 0.5963,
      "step": 1760
    },
    {
      "epoch": 7.447257383966245,
      "grad_norm": 0.45330098271369934,
      "learning_rate": 3.7142390071319454e-05,
      "loss": 0.6001,
      "step": 1765
    },
    {
      "epoch": 7.468354430379747,
      "grad_norm": 0.451648473739624,
      "learning_rate": 3.65713459047969e-05,
      "loss": 0.6104,
      "step": 1770
    },
    {
      "epoch": 7.489451476793249,
      "grad_norm": 0.4406780004501343,
      "learning_rate": 3.60037416046829e-05,
      "loss": 0.5942,
      "step": 1775
    },
    {
      "epoch": 7.510548523206751,
      "grad_norm": 0.4443998634815216,
      "learning_rate": 3.543960795332653e-05,
      "loss": 0.5919,
      "step": 1780
    },
    {
      "epoch": 7.531645569620253,
      "grad_norm": 0.45104894042015076,
      "learning_rate": 3.487897554485628e-05,
      "loss": 0.5995,
      "step": 1785
    },
    {
      "epoch": 7.552742616033755,
      "grad_norm": 0.45314210653305054,
      "learning_rate": 3.43218747835211e-05,
      "loss": 0.587,
      "step": 1790
    },
    {
      "epoch": 7.5738396624472575,
      "grad_norm": 0.4450884163379669,
      "learning_rate": 3.376833588204148e-05,
      "loss": 0.5879,
      "step": 1795
    },
    {
      "epoch": 7.594936708860759,
|
"grad_norm": 0.44848042726516724, |
|
"learning_rate": 3.3218388859970875e-05, |
|
"loss": 0.598, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 7.616033755274262, |
|
"grad_norm": 0.45061829686164856, |
|
"learning_rate": 3.2672063542067734e-05, |
|
"loss": 0.6111, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 7.637130801687764, |
|
"grad_norm": 0.43524765968322754, |
|
"learning_rate": 3.2129389556678016e-05, |
|
"loss": 0.6004, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 7.658227848101266, |
|
"grad_norm": 0.46142658591270447, |
|
"learning_rate": 3.15903963341285e-05, |
|
"loss": 0.594, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 7.679324894514768, |
|
"grad_norm": 0.45434656739234924, |
|
"learning_rate": 3.1055113105130506e-05, |
|
"loss": 0.6002, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 7.70042194092827, |
|
"grad_norm": 0.45925071835517883, |
|
"learning_rate": 3.052356889919489e-05, |
|
"loss": 0.5914, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 7.7215189873417724, |
|
"grad_norm": 0.44464772939682007, |
|
"learning_rate": 2.9995792543057478e-05, |
|
"loss": 0.6064, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 7.742616033755274, |
|
"grad_norm": 0.44117605686187744, |
|
"learning_rate": 2.9471812659115917e-05, |
|
"loss": 0.5993, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 7.763713080168777, |
|
"grad_norm": 0.4454299509525299, |
|
"learning_rate": 2.895165766387733e-05, |
|
"loss": 0.5957, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 7.784810126582278, |
|
"grad_norm": 0.4484660029411316, |
|
"learning_rate": 2.843535576641725e-05, |
|
"loss": 0.5985, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 7.805907172995781, |
|
"grad_norm": 0.4591384828090668, |
|
"learning_rate": 2.7922934966849823e-05, |
|
"loss": 0.6044, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 7.827004219409282, |
|
"grad_norm": 0.4372834861278534, |
|
"learning_rate": 2.7414423054809302e-05, |
|
"loss": 0.5958, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 7.848101265822785, |
|
"grad_norm": 0.44407814741134644, |
|
"learning_rate": 2.690984760794284e-05, |
|
"loss": 0.5965, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 7.869198312236287, |
|
"grad_norm": 0.44278717041015625, |
|
"learning_rate": 2.6409235990415026e-05, |
|
"loss": 0.6062, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 7.890295358649789, |
|
"grad_norm": 0.4526854455471039, |
|
"learning_rate": 2.591261535142383e-05, |
|
"loss": 0.6035, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 7.911392405063291, |
|
"grad_norm": 0.4361235499382019, |
|
"learning_rate": 2.5420012623728208e-05, |
|
"loss": 0.6041, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 7.932489451476793, |
|
"grad_norm": 0.4319293200969696, |
|
"learning_rate": 2.4931454522187593e-05, |
|
"loss": 0.6005, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 7.953586497890296, |
|
"grad_norm": 0.44515419006347656, |
|
"learning_rate": 2.4446967542313015e-05, |
|
"loss": 0.614, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 7.974683544303797, |
|
"grad_norm": 0.4468868672847748, |
|
"learning_rate": 2.3966577958830128e-05, |
|
"loss": 0.5999, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 7.9957805907173, |
|
"grad_norm": 0.43502089381217957, |
|
"learning_rate": 2.3490311824254386e-05, |
|
"loss": 0.591, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 1.9239612817764282, |
|
"eval_runtime": 0.555, |
|
"eval_samples_per_second": 3.604, |
|
"eval_steps_per_second": 1.802, |
|
"step": 1896 |
|
}, |
|
{ |
|
"epoch": 8.016877637130802, |
|
"grad_norm": 0.43088486790657043, |
|
"learning_rate": 2.3018194967478145e-05, |
|
"loss": 0.5772, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 8.037974683544304, |
|
"grad_norm": 0.4887051582336426, |
|
"learning_rate": 2.2550252992369837e-05, |
|
"loss": 0.5858, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 8.059071729957806, |
|
"grad_norm": 0.46031367778778076, |
|
"learning_rate": 2.2086511276385556e-05, |
|
"loss": 0.5698, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 8.080168776371307, |
|
"grad_norm": 0.44916045665740967, |
|
"learning_rate": 2.1626994969192617e-05, |
|
"loss": 0.5832, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 8.10126582278481, |
|
"grad_norm": 0.45516934990882874, |
|
"learning_rate": 2.1171728991305795e-05, |
|
"loss": 0.5678, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 8.122362869198312, |
|
"grad_norm": 0.4672262668609619, |
|
"learning_rate": 2.072073803273572e-05, |
|
"loss": 0.5609, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 8.143459915611814, |
|
"grad_norm": 0.4732205271720886, |
|
"learning_rate": 2.0274046551649918e-05, |
|
"loss": 0.5748, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 8.164556962025316, |
|
"grad_norm": 0.4523015022277832, |
|
"learning_rate": 1.9831678773046424e-05, |
|
"loss": 0.572, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 8.185654008438819, |
|
"grad_norm": 0.46077847480773926, |
|
"learning_rate": 1.9393658687439985e-05, |
|
"loss": 0.5734, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 8.20675105485232, |
|
"grad_norm": 0.48243677616119385, |
|
"learning_rate": 1.8960010049561028e-05, |
|
"loss": 0.5749, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 8.227848101265822, |
|
"grad_norm": 0.45998242497444153, |
|
"learning_rate": 1.8530756377067394e-05, |
|
"loss": 0.5635, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 8.248945147679326, |
|
"grad_norm": 0.4926050305366516, |
|
"learning_rate": 1.8105920949268862e-05, |
|
"loss": 0.5656, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 8.270042194092827, |
|
"grad_norm": 0.44752731919288635, |
|
"learning_rate": 1.7685526805864727e-05, |
|
"loss": 0.5713, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 8.291139240506329, |
|
"grad_norm": 0.4773804843425751, |
|
"learning_rate": 1.7269596745694295e-05, |
|
"loss": 0.5753, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 8.31223628691983, |
|
"grad_norm": 0.4709602892398834, |
|
"learning_rate": 1.6858153325500435e-05, |
|
"loss": 0.5604, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 8.333333333333334, |
|
"grad_norm": 0.46927887201309204, |
|
"learning_rate": 1.6451218858706374e-05, |
|
"loss": 0.578, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 8.354430379746836, |
|
"grad_norm": 0.467042475938797, |
|
"learning_rate": 1.60488154142054e-05, |
|
"loss": 0.5876, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 8.375527426160337, |
|
"grad_norm": 0.4665224850177765, |
|
"learning_rate": 1.565096481516427e-05, |
|
"loss": 0.5727, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 8.396624472573839, |
|
"grad_norm": 0.46915140748023987, |
|
"learning_rate": 1.5257688637839484e-05, |
|
"loss": 0.5744, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 8.417721518987342, |
|
"grad_norm": 0.4630158841609955, |
|
"learning_rate": 1.4869008210407243e-05, |
|
"loss": 0.5609, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 8.438818565400844, |
|
"grad_norm": 0.46690833568573, |
|
"learning_rate": 1.4484944611806773e-05, |
|
"loss": 0.5764, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 8.459915611814345, |
|
"grad_norm": 0.46290239691734314, |
|
"learning_rate": 1.410551867059724e-05, |
|
"loss": 0.5817, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 8.481012658227849, |
|
"grad_norm": 0.4760874807834625, |
|
"learning_rate": 1.3730750963828032e-05, |
|
"loss": 0.5704, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 8.50210970464135, |
|
"grad_norm": 0.47287824749946594, |
|
"learning_rate": 1.3360661815922903e-05, |
|
"loss": 0.574, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 8.523206751054852, |
|
"grad_norm": 0.46598172187805176, |
|
"learning_rate": 1.2995271297577816e-05, |
|
"loss": 0.5792, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 8.544303797468354, |
|
"grad_norm": 0.4603840410709381, |
|
"learning_rate": 1.2634599224672294e-05, |
|
"loss": 0.5674, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 8.565400843881857, |
|
"grad_norm": 0.48355668783187866, |
|
"learning_rate": 1.227866515719489e-05, |
|
"loss": 0.5676, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 8.586497890295359, |
|
"grad_norm": 0.4648090898990631, |
|
"learning_rate": 1.1927488398182395e-05, |
|
"loss": 0.5595, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 8.60759493670886, |
|
"grad_norm": 0.47335174679756165, |
|
"learning_rate": 1.1581087992672935e-05, |
|
"loss": 0.5743, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 8.628691983122362, |
|
"grad_norm": 0.46940383315086365, |
|
"learning_rate": 1.1239482726673201e-05, |
|
"loss": 0.5719, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 8.649789029535865, |
|
"grad_norm": 0.46548011898994446, |
|
"learning_rate": 1.0902691126139542e-05, |
|
"loss": 0.5722, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 8.670886075949367, |
|
"grad_norm": 0.4601798355579376, |
|
"learning_rate": 1.0570731455973414e-05, |
|
"loss": 0.5752, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 8.691983122362869, |
|
"grad_norm": 0.47531968355178833, |
|
"learning_rate": 1.024362171903065e-05, |
|
"loss": 0.5833, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 8.713080168776372, |
|
"grad_norm": 0.46817246079444885, |
|
"learning_rate": 9.921379655145313e-06, |
|
"loss": 0.5716, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 8.734177215189874, |
|
"grad_norm": 0.47469767928123474, |
|
"learning_rate": 9.604022740167495e-06, |
|
"loss": 0.5825, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 8.755274261603375, |
|
"grad_norm": 0.4739144444465637, |
|
"learning_rate": 9.29156818501561e-06, |
|
"loss": 0.5669, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 8.776371308016877, |
|
"grad_norm": 0.5070598721504211, |
|
"learning_rate": 8.984032934743026e-06, |
|
"loss": 0.5797, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 8.79746835443038, |
|
"grad_norm": 0.4720567464828491, |
|
"learning_rate": 8.681433667619065e-06, |
|
"loss": 0.5635, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 8.818565400843882, |
|
"grad_norm": 0.45980048179626465, |
|
"learning_rate": 8.383786794224569e-06, |
|
"loss": 0.5715, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 8.839662447257384, |
|
"grad_norm": 0.4796925187110901, |
|
"learning_rate": 8.09110845656187e-06, |
|
"loss": 0.5785, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 8.860759493670885, |
|
"grad_norm": 0.4801785349845886, |
|
"learning_rate": 7.803414527179343e-06, |
|
"loss": 0.5772, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 8.881856540084389, |
|
"grad_norm": 0.4780319631099701, |
|
"learning_rate": 7.520720608310683e-06, |
|
"loss": 0.5726, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 8.90295358649789, |
|
"grad_norm": 0.4621387720108032, |
|
"learning_rate": 7.243042031028713e-06, |
|
"loss": 0.5752, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 8.924050632911392, |
|
"grad_norm": 0.4708462059497833, |
|
"learning_rate": 6.9703938544139706e-06, |
|
"loss": 0.5716, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 8.945147679324894, |
|
"grad_norm": 0.4696125090122223, |
|
"learning_rate": 6.702790864738018e-06, |
|
"loss": 0.5666, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 8.966244725738397, |
|
"grad_norm": 0.47694242000579834, |
|
"learning_rate": 6.440247574661573e-06, |
|
"loss": 0.568, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 8.987341772151899, |
|
"grad_norm": 0.4875074326992035, |
|
"learning_rate": 6.182778222447383e-06, |
|
"loss": 0.5677, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 1.984204888343811, |
|
"eval_runtime": 0.5545, |
|
"eval_samples_per_second": 3.607, |
|
"eval_steps_per_second": 1.804, |
|
"step": 2133 |
|
}, |
|
{ |
|
"epoch": 9.0084388185654, |
|
"grad_norm": 0.4417002499103546, |
|
"learning_rate": 5.930396771188129e-06, |
|
"loss": 0.566, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 9.029535864978904, |
|
"grad_norm": 0.4602307677268982, |
|
"learning_rate": 5.683116908049168e-06, |
|
"loss": 0.5625, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 9.050632911392405, |
|
"grad_norm": 0.4665865898132324, |
|
"learning_rate": 5.440952043526215e-06, |
|
"loss": 0.5584, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 9.071729957805907, |
|
"grad_norm": 0.47397249937057495, |
|
"learning_rate": 5.203915310718099e-06, |
|
"loss": 0.558, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 9.092827004219409, |
|
"grad_norm": 0.47351840138435364, |
|
"learning_rate": 4.972019564614539e-06, |
|
"loss": 0.5516, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 9.113924050632912, |
|
"grad_norm": 0.4746864438056946, |
|
"learning_rate": 4.745277381398938e-06, |
|
"loss": 0.5536, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 9.135021097046414, |
|
"grad_norm": 0.4737743139266968, |
|
"learning_rate": 4.523701057766361e-06, |
|
"loss": 0.5577, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 9.156118143459915, |
|
"grad_norm": 0.4778996706008911, |
|
"learning_rate": 4.307302610256736e-06, |
|
"loss": 0.5541, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 9.177215189873417, |
|
"grad_norm": 0.4771966338157654, |
|
"learning_rate": 4.0960937746030605e-06, |
|
"loss": 0.552, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 9.19831223628692, |
|
"grad_norm": 0.4708782732486725, |
|
"learning_rate": 3.890086005095051e-06, |
|
"loss": 0.5515, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 9.219409282700422, |
|
"grad_norm": 0.49065274000167847, |
|
"learning_rate": 3.6892904739578736e-06, |
|
"loss": 0.5593, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 9.240506329113924, |
|
"grad_norm": 0.47753557562828064, |
|
"learning_rate": 3.493718070746299e-06, |
|
"loss": 0.5558, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 9.261603375527427, |
|
"grad_norm": 0.4750811755657196, |
|
"learning_rate": 3.3033794017541254e-06, |
|
"loss": 0.5588, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 9.282700421940929, |
|
"grad_norm": 0.46534910798072815, |
|
"learning_rate": 3.1182847894389634e-06, |
|
"loss": 0.5567, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 9.30379746835443, |
|
"grad_norm": 0.46732398867607117, |
|
"learning_rate": 2.9384442718624395e-06, |
|
"loss": 0.5712, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 9.324894514767932, |
|
"grad_norm": 0.4703245759010315, |
|
"learning_rate": 2.763867602145842e-06, |
|
"loss": 0.5566, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 9.345991561181435, |
|
"grad_norm": 0.4757389426231384, |
|
"learning_rate": 2.5945642479411448e-06, |
|
"loss": 0.5669, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 9.367088607594937, |
|
"grad_norm": 0.49615395069122314, |
|
"learning_rate": 2.430543390917539e-06, |
|
"loss": 0.5771, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 9.388185654008439, |
|
"grad_norm": 0.4786005914211273, |
|
"learning_rate": 2.2718139262635775e-06, |
|
"loss": 0.5581, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 9.40928270042194, |
|
"grad_norm": 0.4749692380428314, |
|
"learning_rate": 2.1183844622047034e-06, |
|
"loss": 0.5566, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 9.430379746835444, |
|
"grad_norm": 0.4892341196537018, |
|
"learning_rate": 1.9702633195363917e-06, |
|
"loss": 0.5577, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 9.451476793248945, |
|
"grad_norm": 0.485775351524353, |
|
"learning_rate": 1.8274585311729653e-06, |
|
"loss": 0.5724, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 9.472573839662447, |
|
"grad_norm": 0.47523242235183716, |
|
"learning_rate": 1.6899778417118983e-06, |
|
"loss": 0.5472, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 9.49367088607595, |
|
"grad_norm": 0.45913201570510864, |
|
"learning_rate": 1.557828707013831e-06, |
|
"loss": 0.5576, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 9.514767932489452, |
|
"grad_norm": 0.4752641022205353, |
|
"learning_rate": 1.4310182937982141e-06, |
|
"loss": 0.5605, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 9.535864978902953, |
|
"grad_norm": 0.47292277216911316, |
|
"learning_rate": 1.309553479254666e-06, |
|
"loss": 0.5653, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 9.556962025316455, |
|
"grad_norm": 0.4652714133262634, |
|
"learning_rate": 1.1934408506699802e-06, |
|
"loss": 0.5571, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 9.578059071729959, |
|
"grad_norm": 0.4648183584213257, |
|
"learning_rate": 1.0826867050708678e-06, |
|
"loss": 0.5603, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 9.59915611814346, |
|
"grad_norm": 0.4922165274620056, |
|
"learning_rate": 9.772970488825417e-07, |
|
"loss": 0.5627, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 9.620253164556962, |
|
"grad_norm": 0.4720841348171234, |
|
"learning_rate": 8.772775976028546e-07, |
|
"loss": 0.5517, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 9.641350210970463, |
|
"grad_norm": 0.510443389415741, |
|
"learning_rate": 7.826337754924473e-07, |
|
"loss": 0.5641, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 9.662447257383967, |
|
"grad_norm": 0.46088987588882446, |
|
"learning_rate": 6.933707152805058e-07, |
|
"loss": 0.5595, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 9.683544303797468, |
|
"grad_norm": 0.48828017711639404, |
|
"learning_rate": 6.094932578864287e-07, |
|
"loss": 0.565, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 9.70464135021097, |
|
"grad_norm": 0.4722835421562195, |
|
"learning_rate": 5.31005952157304e-07, |
|
"loss": 0.5611, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 9.725738396624472, |
|
"grad_norm": 0.4879083037376404, |
|
"learning_rate": 4.5791305462120625e-07, |
|
"loss": 0.5746, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 9.746835443037975, |
|
"grad_norm": 0.4783223271369934, |
|
"learning_rate": 3.902185292563365e-07, |
|
"loss": 0.569, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 9.767932489451477, |
|
"grad_norm": 0.4648023247718811, |
|
"learning_rate": 3.2792604727608367e-07, |
|
"loss": 0.5503, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 9.789029535864978, |
|
"grad_norm": 0.47712016105651855, |
|
"learning_rate": 2.710389869298946e-07, |
|
"loss": 0.5522, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 9.810126582278482, |
|
"grad_norm": 0.4652068614959717, |
|
"learning_rate": 2.1956043332010955e-07, |
|
"loss": 0.556, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 9.831223628691983, |
|
"grad_norm": 0.48310577869415283, |
|
"learning_rate": 1.7349317823459609e-07, |
|
"loss": 0.5637, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 9.852320675105485, |
|
"grad_norm": 0.5295414924621582, |
|
"learning_rate": 1.3283971999537015e-07, |
|
"loss": 0.5559, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 9.873417721518987, |
|
"grad_norm": 0.4798893928527832, |
|
"learning_rate": 9.76022633231155e-08, |
|
"loss": 0.5543, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 9.89451476793249, |
|
"grad_norm": 0.48038187623023987, |
|
"learning_rate": 6.778271921760171e-08, |
|
"loss": 0.5626, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 9.915611814345992, |
|
"grad_norm": 0.47445496916770935, |
|
"learning_rate": 4.338270485405582e-08, |
|
"loss": 0.5545, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 9.936708860759493, |
|
"grad_norm": 0.47432997822761536, |
|
"learning_rate": 2.4403543495454818e-08, |
|
"loss": 0.5651, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 9.957805907172995, |
|
"grad_norm": 0.4740554094314575, |
|
"learning_rate": 1.0846264420771857e-08, |
|
"loss": 0.5637, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 9.978902953586498, |
|
"grad_norm": 0.4762984812259674, |
|
"learning_rate": 2.7116028691431817e-09, |
|
"loss": 0.5611, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.4564709961414337, |
|
"learning_rate": 0.0, |
|
"loss": 0.5648, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 2.0032131671905518, |
|
"eval_runtime": 0.5807, |
|
"eval_samples_per_second": 3.444, |
|
"eval_steps_per_second": 1.722, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 2370, |
|
"total_flos": 3.5097090775444357e+18, |
|
"train_loss": 0.7390849222110797, |
|
"train_runtime": 8188.9555, |
|
"train_samples_per_second": 9.243, |
|
"train_steps_per_second": 0.289 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2370, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.5097090775444357e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|