|
{ |
|
"best_metric": 0.0030439873225986958, |
|
"best_model_checkpoint": "./Tb_Dataset/checkpoint-1000", |
|
"epoch": 4.0, |
|
"eval_steps": 100, |
|
"global_step": 1304, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03067484662576687, |
|
"grad_norm": 1.666305661201477, |
|
"learning_rate": 0.00019846625766871168, |
|
"loss": 0.4847, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.06134969325153374, |
|
"grad_norm": 1.4896280765533447, |
|
"learning_rate": 0.00019693251533742332, |
|
"loss": 0.278, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.09202453987730061, |
|
"grad_norm": 1.3254238367080688, |
|
"learning_rate": 0.000195398773006135, |
|
"loss": 0.2401, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.12269938650306748, |
|
"grad_norm": 1.1399273872375488, |
|
"learning_rate": 0.00019386503067484663, |
|
"loss": 0.188, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.15337423312883436, |
|
"grad_norm": 2.320486307144165, |
|
"learning_rate": 0.0001923312883435583, |
|
"loss": 0.1944, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.18404907975460122, |
|
"grad_norm": 0.24434129893779755, |
|
"learning_rate": 0.00019079754601226997, |
|
"loss": 0.0738, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.2147239263803681, |
|
"grad_norm": 0.7523425221443176, |
|
"learning_rate": 0.0001892638036809816, |
|
"loss": 0.171, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.24539877300613497, |
|
"grad_norm": 3.6861462593078613, |
|
"learning_rate": 0.00018773006134969328, |
|
"loss": 0.178, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.27607361963190186, |
|
"grad_norm": 2.111417770385742, |
|
"learning_rate": 0.00018619631901840492, |
|
"loss": 0.2488, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3067484662576687, |
|
"grad_norm": 0.8211548328399658, |
|
"learning_rate": 0.00018466257668711656, |
|
"loss": 0.0996, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3067484662576687, |
|
"eval_accuracy": 0.5625, |
|
"eval_loss": 1.0429491996765137, |
|
"eval_runtime": 0.6744, |
|
"eval_samples_per_second": 23.723, |
|
"eval_steps_per_second": 2.965, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3374233128834356, |
|
"grad_norm": 0.12294389307498932, |
|
"learning_rate": 0.00018312883435582823, |
|
"loss": 0.1403, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.36809815950920244, |
|
"grad_norm": 1.0773956775665283, |
|
"learning_rate": 0.00018159509202453987, |
|
"loss": 0.0801, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3987730061349693, |
|
"grad_norm": 1.2252161502838135, |
|
"learning_rate": 0.00018006134969325154, |
|
"loss": 0.1667, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.4294478527607362, |
|
"grad_norm": 0.3200875520706177, |
|
"learning_rate": 0.00017852760736196318, |
|
"loss": 0.1365, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.4601226993865031, |
|
"grad_norm": 0.4385143518447876, |
|
"learning_rate": 0.00017699386503067485, |
|
"loss": 0.1371, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.49079754601226994, |
|
"grad_norm": 0.11226026713848114, |
|
"learning_rate": 0.00017546012269938652, |
|
"loss": 0.042, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5214723926380368, |
|
"grad_norm": 1.9924014806747437, |
|
"learning_rate": 0.00017392638036809816, |
|
"loss": 0.1138, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5521472392638037, |
|
"grad_norm": 2.601227045059204, |
|
"learning_rate": 0.00017239263803680983, |
|
"loss": 0.1729, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5828220858895705, |
|
"grad_norm": 5.278203964233398, |
|
"learning_rate": 0.00017085889570552147, |
|
"loss": 0.1036, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.6134969325153374, |
|
"grad_norm": 1.20023512840271, |
|
"learning_rate": 0.00016932515337423314, |
|
"loss": 0.0481, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6134969325153374, |
|
"eval_accuracy": 0.8125, |
|
"eval_loss": 0.5665359497070312, |
|
"eval_runtime": 0.4807, |
|
"eval_samples_per_second": 33.285, |
|
"eval_steps_per_second": 4.161, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6441717791411042, |
|
"grad_norm": 0.7395023107528687, |
|
"learning_rate": 0.0001677914110429448, |
|
"loss": 0.1225, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.6748466257668712, |
|
"grad_norm": 0.05845380574464798, |
|
"learning_rate": 0.00016625766871165645, |
|
"loss": 0.1034, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.7055214723926381, |
|
"grad_norm": 2.7569024562835693, |
|
"learning_rate": 0.00016472392638036812, |
|
"loss": 0.0538, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.7361963190184049, |
|
"grad_norm": 0.06222417205572128, |
|
"learning_rate": 0.00016319018404907976, |
|
"loss": 0.0958, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.7668711656441718, |
|
"grad_norm": 1.9317396879196167, |
|
"learning_rate": 0.00016165644171779143, |
|
"loss": 0.09, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.7975460122699386, |
|
"grad_norm": 3.952806234359741, |
|
"learning_rate": 0.0001601226993865031, |
|
"loss": 0.0878, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8282208588957055, |
|
"grad_norm": 1.5617386102676392, |
|
"learning_rate": 0.00015858895705521474, |
|
"loss": 0.1304, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.8588957055214724, |
|
"grad_norm": 0.32101425528526306, |
|
"learning_rate": 0.0001570552147239264, |
|
"loss": 0.0684, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.8895705521472392, |
|
"grad_norm": 0.6578021049499512, |
|
"learning_rate": 0.00015552147239263805, |
|
"loss": 0.0917, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.9202453987730062, |
|
"grad_norm": 0.03468528762459755, |
|
"learning_rate": 0.0001539877300613497, |
|
"loss": 0.0391, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.9202453987730062, |
|
"eval_accuracy": 0.6875, |
|
"eval_loss": 1.0036612749099731, |
|
"eval_runtime": 0.4639, |
|
"eval_samples_per_second": 34.488, |
|
"eval_steps_per_second": 4.311, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.950920245398773, |
|
"grad_norm": 0.04961508885025978, |
|
"learning_rate": 0.00015245398773006136, |
|
"loss": 0.0435, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.9815950920245399, |
|
"grad_norm": 0.7167016863822937, |
|
"learning_rate": 0.000150920245398773, |
|
"loss": 0.0667, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.0122699386503067, |
|
"grad_norm": 0.3984834551811218, |
|
"learning_rate": 0.00014938650306748467, |
|
"loss": 0.1054, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.0429447852760736, |
|
"grad_norm": 0.09521777182817459, |
|
"learning_rate": 0.0001478527607361963, |
|
"loss": 0.0565, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.0736196319018405, |
|
"grad_norm": 3.836611032485962, |
|
"learning_rate": 0.00014631901840490798, |
|
"loss": 0.0498, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.1042944785276074, |
|
"grad_norm": 0.6689714789390564, |
|
"learning_rate": 0.00014478527607361964, |
|
"loss": 0.0693, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.1349693251533743, |
|
"grad_norm": 0.42697030305862427, |
|
"learning_rate": 0.00014325153374233129, |
|
"loss": 0.1017, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.165644171779141, |
|
"grad_norm": 0.28322863578796387, |
|
"learning_rate": 0.00014171779141104295, |
|
"loss": 0.0072, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.196319018404908, |
|
"grad_norm": 0.02922147326171398, |
|
"learning_rate": 0.0001401840490797546, |
|
"loss": 0.0355, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.2269938650306749, |
|
"grad_norm": 0.4956642687320709, |
|
"learning_rate": 0.00013865030674846626, |
|
"loss": 0.0711, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.2269938650306749, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.5200170874595642, |
|
"eval_runtime": 0.4602, |
|
"eval_samples_per_second": 34.769, |
|
"eval_steps_per_second": 4.346, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.2576687116564418, |
|
"grad_norm": 0.35631048679351807, |
|
"learning_rate": 0.00013711656441717793, |
|
"loss": 0.018, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.2883435582822087, |
|
"grad_norm": 0.053142815828323364, |
|
"learning_rate": 0.00013558282208588957, |
|
"loss": 0.0114, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.3190184049079754, |
|
"grad_norm": 3.590878963470459, |
|
"learning_rate": 0.00013404907975460124, |
|
"loss": 0.0385, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.3496932515337423, |
|
"grad_norm": 0.0230648685246706, |
|
"learning_rate": 0.00013251533742331288, |
|
"loss": 0.0333, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.3803680981595092, |
|
"grad_norm": 3.86190128326416, |
|
"learning_rate": 0.00013098159509202455, |
|
"loss": 0.0393, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.4110429447852761, |
|
"grad_norm": 0.9012386798858643, |
|
"learning_rate": 0.00012944785276073622, |
|
"loss": 0.0571, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.441717791411043, |
|
"grad_norm": 0.07151665538549423, |
|
"learning_rate": 0.00012791411042944786, |
|
"loss": 0.0104, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.4723926380368098, |
|
"grad_norm": 0.783393919467926, |
|
"learning_rate": 0.00012638036809815953, |
|
"loss": 0.101, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.5030674846625767, |
|
"grad_norm": 0.11429934948682785, |
|
"learning_rate": 0.00012484662576687117, |
|
"loss": 0.0861, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.5337423312883436, |
|
"grad_norm": 0.22566761076450348, |
|
"learning_rate": 0.00012331288343558281, |
|
"loss": 0.0258, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.5337423312883436, |
|
"eval_accuracy": 0.9375, |
|
"eval_loss": 0.38183438777923584, |
|
"eval_runtime": 0.6415, |
|
"eval_samples_per_second": 24.943, |
|
"eval_steps_per_second": 3.118, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.5644171779141103, |
|
"grad_norm": 0.0231198500841856, |
|
"learning_rate": 0.0001217791411042945, |
|
"loss": 0.0112, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.5950920245398774, |
|
"grad_norm": 0.07680622488260269, |
|
"learning_rate": 0.00012024539877300614, |
|
"loss": 0.0332, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.6257668711656441, |
|
"grad_norm": 4.2723798751831055, |
|
"learning_rate": 0.0001187116564417178, |
|
"loss": 0.0615, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.656441717791411, |
|
"grad_norm": 0.020270884037017822, |
|
"learning_rate": 0.00011717791411042945, |
|
"loss": 0.0025, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.687116564417178, |
|
"grad_norm": 0.017487134784460068, |
|
"learning_rate": 0.0001156441717791411, |
|
"loss": 0.0032, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.7177914110429446, |
|
"grad_norm": 3.106546401977539, |
|
"learning_rate": 0.00011411042944785277, |
|
"loss": 0.0776, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.7484662576687118, |
|
"grad_norm": 0.2955380380153656, |
|
"learning_rate": 0.00011257668711656441, |
|
"loss": 0.046, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.7791411042944785, |
|
"grad_norm": 0.03838299587368965, |
|
"learning_rate": 0.00011104294478527608, |
|
"loss": 0.0039, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.8098159509202454, |
|
"grad_norm": 0.02066592499613762, |
|
"learning_rate": 0.00010950920245398772, |
|
"loss": 0.0411, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.8404907975460123, |
|
"grad_norm": 0.029740285128355026, |
|
"learning_rate": 0.00010797546012269939, |
|
"loss": 0.0547, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.8404907975460123, |
|
"eval_accuracy": 0.9375, |
|
"eval_loss": 0.3414860665798187, |
|
"eval_runtime": 0.6402, |
|
"eval_samples_per_second": 24.991, |
|
"eval_steps_per_second": 3.124, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.871165644171779, |
|
"grad_norm": 0.06896264851093292, |
|
"learning_rate": 0.00010644171779141106, |
|
"loss": 0.0412, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.9018404907975461, |
|
"grad_norm": 0.17898960411548615, |
|
"learning_rate": 0.0001049079754601227, |
|
"loss": 0.0059, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.9325153374233128, |
|
"grad_norm": 0.02462293580174446, |
|
"learning_rate": 0.00010337423312883437, |
|
"loss": 0.0334, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.9631901840490797, |
|
"grad_norm": 0.15255212783813477, |
|
"learning_rate": 0.00010184049079754601, |
|
"loss": 0.0484, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.9938650306748467, |
|
"grad_norm": 1.2095363140106201, |
|
"learning_rate": 0.00010030674846625767, |
|
"loss": 0.0409, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.0245398773006134, |
|
"grad_norm": 0.05067542567849159, |
|
"learning_rate": 9.877300613496932e-05, |
|
"loss": 0.0239, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.0552147239263805, |
|
"grad_norm": 0.048488594591617584, |
|
"learning_rate": 9.723926380368099e-05, |
|
"loss": 0.0092, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.085889570552147, |
|
"grad_norm": 0.06358503550291061, |
|
"learning_rate": 9.570552147239264e-05, |
|
"loss": 0.0284, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.116564417177914, |
|
"grad_norm": 0.02291274443268776, |
|
"learning_rate": 9.41717791411043e-05, |
|
"loss": 0.0312, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.147239263803681, |
|
"grad_norm": 0.01959369145333767, |
|
"learning_rate": 9.263803680981595e-05, |
|
"loss": 0.0029, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.147239263803681, |
|
"eval_accuracy": 0.9375, |
|
"eval_loss": 0.06373238563537598, |
|
"eval_runtime": 1.4073, |
|
"eval_samples_per_second": 11.369, |
|
"eval_steps_per_second": 1.421, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.1779141104294477, |
|
"grad_norm": 0.015473265200853348, |
|
"learning_rate": 9.110429447852761e-05, |
|
"loss": 0.0052, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.208588957055215, |
|
"grad_norm": 6.127484321594238, |
|
"learning_rate": 8.957055214723928e-05, |
|
"loss": 0.0215, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.2392638036809815, |
|
"grad_norm": 0.014825492165982723, |
|
"learning_rate": 8.803680981595093e-05, |
|
"loss": 0.0051, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.2699386503067487, |
|
"grad_norm": 0.09054987877607346, |
|
"learning_rate": 8.650306748466259e-05, |
|
"loss": 0.0089, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.3006134969325154, |
|
"grad_norm": 0.012495328672230244, |
|
"learning_rate": 8.496932515337423e-05, |
|
"loss": 0.0035, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.331288343558282, |
|
"grad_norm": 0.012604707852005959, |
|
"learning_rate": 8.343558282208588e-05, |
|
"loss": 0.0046, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.361963190184049, |
|
"grad_norm": 0.012834266759455204, |
|
"learning_rate": 8.190184049079755e-05, |
|
"loss": 0.0685, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.392638036809816, |
|
"grad_norm": 1.8484301567077637, |
|
"learning_rate": 8.036809815950921e-05, |
|
"loss": 0.0058, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.4233128834355826, |
|
"grad_norm": 0.01303753349930048, |
|
"learning_rate": 7.883435582822086e-05, |
|
"loss": 0.0201, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.4539877300613497, |
|
"grad_norm": 0.011059868149459362, |
|
"learning_rate": 7.730061349693252e-05, |
|
"loss": 0.0543, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.4539877300613497, |
|
"eval_accuracy": 0.8125, |
|
"eval_loss": 0.7362374067306519, |
|
"eval_runtime": 0.6252, |
|
"eval_samples_per_second": 25.594, |
|
"eval_steps_per_second": 3.199, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.4846625766871164, |
|
"grad_norm": 0.013651982881128788, |
|
"learning_rate": 7.576687116564417e-05, |
|
"loss": 0.0023, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.5153374233128836, |
|
"grad_norm": 0.013891194015741348, |
|
"learning_rate": 7.423312883435584e-05, |
|
"loss": 0.0035, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.5460122699386503, |
|
"grad_norm": 0.022767823189496994, |
|
"learning_rate": 7.26993865030675e-05, |
|
"loss": 0.0023, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.5766871165644174, |
|
"grad_norm": 0.01230280939489603, |
|
"learning_rate": 7.116564417177914e-05, |
|
"loss": 0.0163, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.607361963190184, |
|
"grad_norm": 0.01010560616850853, |
|
"learning_rate": 6.963190184049079e-05, |
|
"loss": 0.0016, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.638036809815951, |
|
"grad_norm": 0.010164570063352585, |
|
"learning_rate": 6.809815950920245e-05, |
|
"loss": 0.0287, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.668711656441718, |
|
"grad_norm": 0.010293642990291119, |
|
"learning_rate": 6.656441717791412e-05, |
|
"loss": 0.0307, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.6993865030674846, |
|
"grad_norm": 0.01078992523252964, |
|
"learning_rate": 6.503067484662577e-05, |
|
"loss": 0.0106, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.7300613496932513, |
|
"grad_norm": 0.01032053492963314, |
|
"learning_rate": 6.349693251533743e-05, |
|
"loss": 0.0014, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.7607361963190185, |
|
"grad_norm": 0.009690840728580952, |
|
"learning_rate": 6.196319018404908e-05, |
|
"loss": 0.0265, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.7607361963190185, |
|
"eval_accuracy": 0.75, |
|
"eval_loss": 1.0916930437088013, |
|
"eval_runtime": 0.4595, |
|
"eval_samples_per_second": 34.82, |
|
"eval_steps_per_second": 4.352, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.791411042944785, |
|
"grad_norm": 0.014224858023226261, |
|
"learning_rate": 6.0429447852760736e-05, |
|
"loss": 0.0622, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.8220858895705523, |
|
"grad_norm": 0.013891954906284809, |
|
"learning_rate": 5.88957055214724e-05, |
|
"loss": 0.0153, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.852760736196319, |
|
"grad_norm": 0.0167331974953413, |
|
"learning_rate": 5.736196319018405e-05, |
|
"loss": 0.0026, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.883435582822086, |
|
"grad_norm": 0.03341999277472496, |
|
"learning_rate": 5.582822085889571e-05, |
|
"loss": 0.0119, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.914110429447853, |
|
"grad_norm": 0.012064835987985134, |
|
"learning_rate": 5.429447852760736e-05, |
|
"loss": 0.0149, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.9447852760736195, |
|
"grad_norm": 0.012997813522815704, |
|
"learning_rate": 5.276073619631902e-05, |
|
"loss": 0.0034, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.9754601226993866, |
|
"grad_norm": 0.014833126217126846, |
|
"learning_rate": 5.122699386503068e-05, |
|
"loss": 0.0209, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 3.0061349693251533, |
|
"grad_norm": 0.010445049963891506, |
|
"learning_rate": 4.9693251533742335e-05, |
|
"loss": 0.0434, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 3.03680981595092, |
|
"grad_norm": 0.012170241214334965, |
|
"learning_rate": 4.815950920245399e-05, |
|
"loss": 0.0058, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 3.067484662576687, |
|
"grad_norm": 0.010554679669439793, |
|
"learning_rate": 4.6625766871165645e-05, |
|
"loss": 0.0017, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.067484662576687, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.0030439873225986958, |
|
"eval_runtime": 0.4699, |
|
"eval_samples_per_second": 34.049, |
|
"eval_steps_per_second": 4.256, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.098159509202454, |
|
"grad_norm": 0.010963929817080498, |
|
"learning_rate": 4.5092024539877307e-05, |
|
"loss": 0.0241, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 3.128834355828221, |
|
"grad_norm": 0.010549047961831093, |
|
"learning_rate": 4.3558282208588955e-05, |
|
"loss": 0.0015, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 3.1595092024539877, |
|
"grad_norm": 0.012895047664642334, |
|
"learning_rate": 4.2024539877300617e-05, |
|
"loss": 0.0021, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 3.190184049079755, |
|
"grad_norm": 0.009695703163743019, |
|
"learning_rate": 4.049079754601227e-05, |
|
"loss": 0.0013, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 3.2208588957055215, |
|
"grad_norm": 0.01083831675350666, |
|
"learning_rate": 3.895705521472393e-05, |
|
"loss": 0.0018, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 3.2515337423312882, |
|
"grad_norm": 0.030923034995794296, |
|
"learning_rate": 3.742331288343559e-05, |
|
"loss": 0.0012, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 3.2822085889570554, |
|
"grad_norm": 0.009053406305611134, |
|
"learning_rate": 3.5889570552147236e-05, |
|
"loss": 0.0011, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 3.312883435582822, |
|
"grad_norm": 0.009371621534228325, |
|
"learning_rate": 3.43558282208589e-05, |
|
"loss": 0.0011, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 3.3435582822085887, |
|
"grad_norm": 0.009119446389377117, |
|
"learning_rate": 3.282208588957055e-05, |
|
"loss": 0.0011, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 3.374233128834356, |
|
"grad_norm": 0.009182000532746315, |
|
"learning_rate": 3.1288343558282215e-05, |
|
"loss": 0.0054, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.374233128834356, |
|
"eval_accuracy": 1.0, |
|
"eval_loss": 0.036368854343891144, |
|
"eval_runtime": 0.4537, |
|
"eval_samples_per_second": 35.267, |
|
"eval_steps_per_second": 4.408, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.4049079754601226, |
|
"grad_norm": 0.00895740371197462, |
|
"learning_rate": 2.9754601226993867e-05, |
|
"loss": 0.0017, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 3.4355828220858897, |
|
"grad_norm": 0.20501597225666046, |
|
"learning_rate": 2.822085889570552e-05, |
|
"loss": 0.0013, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 3.4662576687116564, |
|
"grad_norm": 0.009406461380422115, |
|
"learning_rate": 2.668711656441718e-05, |
|
"loss": 0.0011, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 3.4969325153374236, |
|
"grad_norm": 0.009192903526127338, |
|
"learning_rate": 2.5153374233128835e-05, |
|
"loss": 0.0012, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 3.5276073619631902, |
|
"grad_norm": 0.009342888370156288, |
|
"learning_rate": 2.361963190184049e-05, |
|
"loss": 0.0362, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 3.558282208588957, |
|
"grad_norm": 0.01002499833703041, |
|
"learning_rate": 2.208588957055215e-05, |
|
"loss": 0.0011, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 3.588957055214724, |
|
"grad_norm": 0.01044855359941721, |
|
"learning_rate": 2.0552147239263807e-05, |
|
"loss": 0.0011, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 3.6196319018404908, |
|
"grad_norm": 0.030983537435531616, |
|
"learning_rate": 1.9018404907975462e-05, |
|
"loss": 0.0011, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 3.6503067484662575, |
|
"grad_norm": 0.009104466997087002, |
|
"learning_rate": 1.7484662576687117e-05, |
|
"loss": 0.0012, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 3.6809815950920246, |
|
"grad_norm": 0.009420313872396946, |
|
"learning_rate": 1.5950920245398772e-05, |
|
"loss": 0.0234, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.6809815950920246, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.23103433847427368, |
|
"eval_runtime": 0.4704, |
|
"eval_samples_per_second": 34.012, |
|
"eval_steps_per_second": 4.252, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.7116564417177913, |
|
"grad_norm": 0.00850239023566246, |
|
"learning_rate": 1.441717791411043e-05, |
|
"loss": 0.001, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 3.7423312883435584, |
|
"grad_norm": 0.008601406589150429, |
|
"learning_rate": 1.2883435582822087e-05, |
|
"loss": 0.0013, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 3.773006134969325, |
|
"grad_norm": 0.011089220643043518, |
|
"learning_rate": 1.1349693251533742e-05, |
|
"loss": 0.0011, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 3.8036809815950923, |
|
"grad_norm": 0.008406179025769234, |
|
"learning_rate": 9.8159509202454e-06, |
|
"loss": 0.001, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 3.834355828220859, |
|
"grad_norm": 0.009516136720776558, |
|
"learning_rate": 8.282208588957055e-06, |
|
"loss": 0.0012, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 3.8650306748466257, |
|
"grad_norm": 0.011050822213292122, |
|
"learning_rate": 6.748466257668712e-06, |
|
"loss": 0.0012, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 3.895705521472393, |
|
"grad_norm": 0.008980349637567997, |
|
"learning_rate": 5.214723926380368e-06, |
|
"loss": 0.0012, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 3.9263803680981595, |
|
"grad_norm": 0.00903311651200056, |
|
"learning_rate": 3.680981595092025e-06, |
|
"loss": 0.0014, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 3.957055214723926, |
|
"grad_norm": 0.00838613323867321, |
|
"learning_rate": 2.147239263803681e-06, |
|
"loss": 0.0253, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 3.9877300613496933, |
|
"grad_norm": 0.008292277343571186, |
|
"learning_rate": 6.134969325153375e-07, |
|
"loss": 0.0076, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 3.9877300613496933, |
|
"eval_accuracy": 0.875, |
|
"eval_loss": 0.40373101830482483, |
|
"eval_runtime": 0.4908, |
|
"eval_samples_per_second": 32.599, |
|
"eval_steps_per_second": 4.075, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"step": 1304, |
|
"total_flos": 1.6167928713188475e+18, |
|
"train_loss": 0.04957773063711799, |
|
"train_runtime": 1223.9715, |
|
"train_samples_per_second": 17.046, |
|
"train_steps_per_second": 1.065 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1304, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.6167928713188475e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|