{ "best_metric": 0.0030439873225986958, "best_model_checkpoint": "./Tb_Dataset/checkpoint-1000", "epoch": 4.0, "eval_steps": 100, "global_step": 1304, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03067484662576687, "grad_norm": 1.666305661201477, "learning_rate": 0.00019846625766871168, "loss": 0.4847, "step": 10 }, { "epoch": 0.06134969325153374, "grad_norm": 1.4896280765533447, "learning_rate": 0.00019693251533742332, "loss": 0.278, "step": 20 }, { "epoch": 0.09202453987730061, "grad_norm": 1.3254238367080688, "learning_rate": 0.000195398773006135, "loss": 0.2401, "step": 30 }, { "epoch": 0.12269938650306748, "grad_norm": 1.1399273872375488, "learning_rate": 0.00019386503067484663, "loss": 0.188, "step": 40 }, { "epoch": 0.15337423312883436, "grad_norm": 2.320486307144165, "learning_rate": 0.0001923312883435583, "loss": 0.1944, "step": 50 }, { "epoch": 0.18404907975460122, "grad_norm": 0.24434129893779755, "learning_rate": 0.00019079754601226997, "loss": 0.0738, "step": 60 }, { "epoch": 0.2147239263803681, "grad_norm": 0.7523425221443176, "learning_rate": 0.0001892638036809816, "loss": 0.171, "step": 70 }, { "epoch": 0.24539877300613497, "grad_norm": 3.6861462593078613, "learning_rate": 0.00018773006134969328, "loss": 0.178, "step": 80 }, { "epoch": 0.27607361963190186, "grad_norm": 2.111417770385742, "learning_rate": 0.00018619631901840492, "loss": 0.2488, "step": 90 }, { "epoch": 0.3067484662576687, "grad_norm": 0.8211548328399658, "learning_rate": 0.00018466257668711656, "loss": 0.0996, "step": 100 }, { "epoch": 0.3067484662576687, "eval_accuracy": 0.5625, "eval_loss": 1.0429491996765137, "eval_runtime": 0.6744, "eval_samples_per_second": 23.723, "eval_steps_per_second": 2.965, "step": 100 }, { "epoch": 0.3374233128834356, "grad_norm": 0.12294389307498932, "learning_rate": 0.00018312883435582823, "loss": 0.1403, "step": 110 }, { "epoch": 0.36809815950920244, "grad_norm": 1.0773956775665283, "learning_rate": 0.00018159509202453987, "loss": 0.0801, "step": 120 }, { "epoch": 0.3987730061349693, "grad_norm": 1.2252161502838135, "learning_rate": 0.00018006134969325154, "loss": 0.1667, "step": 130 }, { "epoch": 0.4294478527607362, "grad_norm": 0.3200875520706177, "learning_rate": 0.00017852760736196318, "loss": 0.1365, "step": 140 }, { "epoch": 0.4601226993865031, "grad_norm": 0.4385143518447876, "learning_rate": 0.00017699386503067485, "loss": 0.1371, "step": 150 }, { "epoch": 0.49079754601226994, "grad_norm": 0.11226026713848114, "learning_rate": 0.00017546012269938652, "loss": 0.042, "step": 160 }, { "epoch": 0.5214723926380368, "grad_norm": 1.9924014806747437, "learning_rate": 0.00017392638036809816, "loss": 0.1138, "step": 170 }, { "epoch": 0.5521472392638037, "grad_norm": 2.601227045059204, "learning_rate": 0.00017239263803680983, "loss": 0.1729, "step": 180 }, { "epoch": 0.5828220858895705, "grad_norm": 5.278203964233398, "learning_rate": 0.00017085889570552147, "loss": 0.1036, "step": 190 }, { "epoch": 0.6134969325153374, "grad_norm": 1.20023512840271, "learning_rate": 0.00016932515337423314, "loss": 0.0481, "step": 200 }, { "epoch": 0.6134969325153374, "eval_accuracy": 0.8125, "eval_loss": 0.5665359497070312, "eval_runtime": 0.4807, "eval_samples_per_second": 33.285, "eval_steps_per_second": 4.161, "step": 200 }, { "epoch": 0.6441717791411042, "grad_norm": 0.7395023107528687, "learning_rate": 0.0001677914110429448, "loss": 0.1225, "step": 210 }, { "epoch": 0.6748466257668712, "grad_norm": 0.05845380574464798, "learning_rate": 0.00016625766871165645, "loss": 0.1034, "step": 220 }, { "epoch": 0.7055214723926381, "grad_norm": 2.7569024562835693, "learning_rate": 0.00016472392638036812, "loss": 0.0538, "step": 230 }, { "epoch": 0.7361963190184049, "grad_norm": 0.06222417205572128, "learning_rate": 0.00016319018404907976, "loss": 0.0958, "step": 240 }, { "epoch": 0.7668711656441718, "grad_norm": 1.9317396879196167, "learning_rate": 0.00016165644171779143, "loss": 0.09, "step": 250 }, { "epoch": 0.7975460122699386, "grad_norm": 3.952806234359741, "learning_rate": 0.0001601226993865031, "loss": 0.0878, "step": 260 }, { "epoch": 0.8282208588957055, "grad_norm": 1.5617386102676392, "learning_rate": 0.00015858895705521474, "loss": 0.1304, "step": 270 }, { "epoch": 0.8588957055214724, "grad_norm": 0.32101425528526306, "learning_rate": 0.0001570552147239264, "loss": 0.0684, "step": 280 }, { "epoch": 0.8895705521472392, "grad_norm": 0.6578021049499512, "learning_rate": 0.00015552147239263805, "loss": 0.0917, "step": 290 }, { "epoch": 0.9202453987730062, "grad_norm": 0.03468528762459755, "learning_rate": 0.0001539877300613497, "loss": 0.0391, "step": 300 }, { "epoch": 0.9202453987730062, "eval_accuracy": 0.6875, "eval_loss": 1.0036612749099731, "eval_runtime": 0.4639, "eval_samples_per_second": 34.488, "eval_steps_per_second": 4.311, "step": 300 }, { "epoch": 0.950920245398773, "grad_norm": 0.04961508885025978, "learning_rate": 0.00015245398773006136, "loss": 0.0435, "step": 310 }, { "epoch": 0.9815950920245399, "grad_norm": 0.7167016863822937, "learning_rate": 0.000150920245398773, "loss": 0.0667, "step": 320 }, { "epoch": 1.0122699386503067, "grad_norm": 0.3984834551811218, "learning_rate": 0.00014938650306748467, "loss": 0.1054, "step": 330 }, { "epoch": 1.0429447852760736, "grad_norm": 0.09521777182817459, "learning_rate": 0.0001478527607361963, "loss": 0.0565, "step": 340 }, { "epoch": 1.0736196319018405, "grad_norm": 3.836611032485962, "learning_rate": 0.00014631901840490798, "loss": 0.0498, "step": 350 }, { "epoch": 1.1042944785276074, "grad_norm": 0.6689714789390564, "learning_rate": 0.00014478527607361964, "loss": 0.0693, "step": 360 }, { "epoch": 1.1349693251533743, "grad_norm": 0.42697030305862427, "learning_rate": 0.00014325153374233129, "loss": 0.1017, "step": 370 }, { "epoch": 1.165644171779141, "grad_norm": 0.28322863578796387, "learning_rate": 0.00014171779141104295, "loss": 0.0072, "step": 380 }, { "epoch": 1.196319018404908, "grad_norm": 0.02922147326171398, "learning_rate": 0.0001401840490797546, "loss": 0.0355, "step": 390 }, { "epoch": 1.2269938650306749, "grad_norm": 0.4956642687320709, "learning_rate": 0.00013865030674846626, "loss": 0.0711, "step": 400 }, { "epoch": 1.2269938650306749, "eval_accuracy": 0.875, "eval_loss": 0.5200170874595642, "eval_runtime": 0.4602, "eval_samples_per_second": 34.769, "eval_steps_per_second": 4.346, "step": 400 }, { "epoch": 1.2576687116564418, "grad_norm": 0.35631048679351807, "learning_rate": 0.00013711656441717793, "loss": 0.018, "step": 410 }, { "epoch": 1.2883435582822087, "grad_norm": 0.053142815828323364, "learning_rate": 0.00013558282208588957, "loss": 0.0114, "step": 420 }, { "epoch": 1.3190184049079754, "grad_norm": 3.590878963470459, "learning_rate": 0.00013404907975460124, "loss": 0.0385, "step": 430 }, { "epoch": 1.3496932515337423, "grad_norm": 0.0230648685246706, "learning_rate": 0.00013251533742331288, "loss": 0.0333, "step": 440 }, { "epoch": 1.3803680981595092, "grad_norm": 3.86190128326416, "learning_rate": 0.00013098159509202455, "loss": 0.0393, "step": 450 }, { "epoch": 1.4110429447852761, "grad_norm": 0.9012386798858643, "learning_rate": 0.00012944785276073622, "loss": 0.0571, "step": 460 }, { "epoch": 1.441717791411043, "grad_norm": 0.07151665538549423, "learning_rate": 0.00012791411042944786, "loss": 0.0104, "step": 470 }, { "epoch": 1.4723926380368098, "grad_norm": 0.783393919467926, "learning_rate": 0.00012638036809815953, "loss": 0.101, "step": 480 }, { "epoch": 1.5030674846625767, "grad_norm": 0.11429934948682785, "learning_rate": 0.00012484662576687117, "loss": 0.0861, "step": 490 }, { "epoch": 1.5337423312883436, "grad_norm": 0.22566761076450348, "learning_rate": 0.00012331288343558281, "loss": 0.0258, "step": 500 }, { "epoch": 1.5337423312883436, "eval_accuracy": 0.9375, "eval_loss": 0.38183438777923584, "eval_runtime": 0.6415, "eval_samples_per_second": 24.943, "eval_steps_per_second": 3.118, "step": 500 }, { "epoch": 1.5644171779141103, "grad_norm": 0.0231198500841856, "learning_rate": 0.0001217791411042945, "loss": 0.0112, "step": 510 }, { "epoch": 1.5950920245398774, "grad_norm": 0.07680622488260269, "learning_rate": 0.00012024539877300614, "loss": 0.0332, "step": 520 }, { "epoch": 1.6257668711656441, "grad_norm": 4.2723798751831055, "learning_rate": 0.0001187116564417178, "loss": 0.0615, "step": 530 }, { "epoch": 1.656441717791411, "grad_norm": 0.020270884037017822, "learning_rate": 0.00011717791411042945, "loss": 0.0025, "step": 540 }, { "epoch": 1.687116564417178, "grad_norm": 0.017487134784460068, "learning_rate": 0.0001156441717791411, "loss": 0.0032, "step": 550 }, { "epoch": 1.7177914110429446, "grad_norm": 3.106546401977539, "learning_rate": 0.00011411042944785277, "loss": 0.0776, "step": 560 }, { "epoch": 1.7484662576687118, "grad_norm": 0.2955380380153656, "learning_rate": 0.00011257668711656441, "loss": 0.046, "step": 570 }, { "epoch": 1.7791411042944785, "grad_norm": 0.03838299587368965, "learning_rate": 0.00011104294478527608, "loss": 0.0039, "step": 580 }, { "epoch": 1.8098159509202454, "grad_norm": 0.02066592499613762, "learning_rate": 0.00010950920245398772, "loss": 0.0411, "step": 590 }, { "epoch": 1.8404907975460123, "grad_norm": 0.029740285128355026, "learning_rate": 0.00010797546012269939, "loss": 0.0547, "step": 600 }, { "epoch": 1.8404907975460123, "eval_accuracy": 0.9375, "eval_loss": 0.3414860665798187, "eval_runtime": 0.6402, "eval_samples_per_second": 24.991, "eval_steps_per_second": 3.124, "step": 600 }, { "epoch": 1.871165644171779, "grad_norm": 0.06896264851093292, "learning_rate": 0.00010644171779141106, "loss": 0.0412, "step": 610 }, { "epoch": 1.9018404907975461, "grad_norm": 0.17898960411548615, "learning_rate": 0.0001049079754601227, "loss": 0.0059, "step": 620 }, { "epoch": 1.9325153374233128, "grad_norm": 0.02462293580174446, "learning_rate": 0.00010337423312883437, "loss": 0.0334, "step": 630 }, { "epoch": 1.9631901840490797, "grad_norm": 0.15255212783813477, "learning_rate": 0.00010184049079754601, "loss": 0.0484, "step": 640 }, { "epoch": 1.9938650306748467, "grad_norm": 1.2095363140106201, "learning_rate": 0.00010030674846625767, "loss": 0.0409, "step": 650 }, { "epoch": 2.0245398773006134, "grad_norm": 0.05067542567849159, "learning_rate": 9.877300613496932e-05, "loss": 0.0239, "step": 660 }, { "epoch": 2.0552147239263805, "grad_norm": 0.048488594591617584, "learning_rate": 9.723926380368099e-05, "loss": 0.0092, "step": 670 }, { "epoch": 2.085889570552147, "grad_norm": 0.06358503550291061, "learning_rate": 9.570552147239264e-05, "loss": 0.0284, "step": 680 }, { "epoch": 2.116564417177914, "grad_norm": 0.02291274443268776, "learning_rate": 9.41717791411043e-05, "loss": 0.0312, "step": 690 }, { "epoch": 2.147239263803681, "grad_norm": 0.01959369145333767, "learning_rate": 9.263803680981595e-05, "loss": 0.0029, "step": 700 }, { "epoch": 2.147239263803681, "eval_accuracy": 0.9375, "eval_loss": 0.06373238563537598, "eval_runtime": 1.4073, "eval_samples_per_second": 11.369, "eval_steps_per_second": 1.421, "step": 700 }, { "epoch": 2.1779141104294477, "grad_norm": 0.015473265200853348, "learning_rate": 9.110429447852761e-05, "loss": 0.0052, "step": 710 }, { "epoch": 2.208588957055215, "grad_norm": 6.127484321594238, "learning_rate": 8.957055214723928e-05, "loss": 0.0215, "step": 720 }, { "epoch": 2.2392638036809815, "grad_norm": 0.014825492165982723, "learning_rate": 8.803680981595093e-05, "loss": 0.0051, "step": 730 }, { "epoch": 2.2699386503067487, "grad_norm": 0.09054987877607346, "learning_rate": 8.650306748466259e-05, "loss": 0.0089, "step": 740 }, { "epoch": 2.3006134969325154, "grad_norm": 0.012495328672230244, "learning_rate": 8.496932515337423e-05, "loss": 0.0035, "step": 750 }, { "epoch": 2.331288343558282, "grad_norm": 0.012604707852005959, "learning_rate": 8.343558282208588e-05, "loss": 0.0046, "step": 760 }, { "epoch": 2.361963190184049, "grad_norm": 0.012834266759455204, "learning_rate": 8.190184049079755e-05, "loss": 0.0685, "step": 770 }, { "epoch": 2.392638036809816, "grad_norm": 1.8484301567077637, "learning_rate": 8.036809815950921e-05, "loss": 0.0058, "step": 780 }, { "epoch": 2.4233128834355826, "grad_norm": 0.01303753349930048, "learning_rate": 7.883435582822086e-05, "loss": 0.0201, "step": 790 }, { "epoch": 2.4539877300613497, "grad_norm": 0.011059868149459362, "learning_rate": 7.730061349693252e-05, "loss": 0.0543, "step": 800 }, { "epoch": 2.4539877300613497, "eval_accuracy": 0.8125, "eval_loss": 0.7362374067306519, "eval_runtime": 0.6252, "eval_samples_per_second": 25.594, "eval_steps_per_second": 3.199, "step": 800 }, { "epoch": 2.4846625766871164, "grad_norm": 0.013651982881128788, "learning_rate": 7.576687116564417e-05, "loss": 0.0023, "step": 810 }, { "epoch": 2.5153374233128836, "grad_norm": 0.013891194015741348, "learning_rate": 7.423312883435584e-05, "loss": 0.0035, "step": 820 }, { "epoch": 2.5460122699386503, "grad_norm": 0.022767823189496994, "learning_rate": 7.26993865030675e-05, "loss": 0.0023, "step": 830 }, { "epoch": 2.5766871165644174, "grad_norm": 0.01230280939489603, "learning_rate": 7.116564417177914e-05, "loss": 0.0163, "step": 840 }, { "epoch": 2.607361963190184, "grad_norm": 0.01010560616850853, "learning_rate": 6.963190184049079e-05, "loss": 0.0016, "step": 850 }, { "epoch": 2.638036809815951, "grad_norm": 0.010164570063352585, "learning_rate": 6.809815950920245e-05, "loss": 0.0287, "step": 860 }, { "epoch": 2.668711656441718, "grad_norm": 0.010293642990291119, "learning_rate": 6.656441717791412e-05, "loss": 0.0307, "step": 870 }, { "epoch": 2.6993865030674846, "grad_norm": 0.01078992523252964, "learning_rate": 6.503067484662577e-05, "loss": 0.0106, "step": 880 }, { "epoch": 2.7300613496932513, "grad_norm": 0.01032053492963314, "learning_rate": 6.349693251533743e-05, "loss": 0.0014, "step": 890 }, { "epoch": 2.7607361963190185, "grad_norm": 0.009690840728580952, "learning_rate": 6.196319018404908e-05, "loss": 0.0265, "step": 900 }, { "epoch": 2.7607361963190185, "eval_accuracy": 0.75, "eval_loss": 1.0916930437088013, "eval_runtime": 0.4595, "eval_samples_per_second": 34.82, "eval_steps_per_second": 4.352, "step": 900 }, { "epoch": 2.791411042944785, "grad_norm": 0.014224858023226261, "learning_rate": 6.0429447852760736e-05, "loss": 0.0622, "step": 910 }, { "epoch": 2.8220858895705523, "grad_norm": 0.013891954906284809, "learning_rate": 5.88957055214724e-05, "loss": 0.0153, "step": 920 }, { "epoch": 2.852760736196319, "grad_norm": 0.0167331974953413, "learning_rate": 5.736196319018405e-05, "loss": 0.0026, "step": 930 }, { "epoch": 2.883435582822086, "grad_norm": 0.03341999277472496, "learning_rate": 5.582822085889571e-05, "loss": 0.0119, "step": 940 }, { "epoch": 2.914110429447853, "grad_norm": 0.012064835987985134, "learning_rate": 5.429447852760736e-05, "loss": 0.0149, "step": 950 }, { "epoch": 2.9447852760736195, "grad_norm": 0.012997813522815704, "learning_rate": 5.276073619631902e-05, "loss": 0.0034, "step": 960 }, { "epoch": 2.9754601226993866, "grad_norm": 0.014833126217126846, "learning_rate": 5.122699386503068e-05, "loss": 0.0209, "step": 970 }, { "epoch": 3.0061349693251533, "grad_norm": 0.010445049963891506, "learning_rate": 4.9693251533742335e-05, "loss": 0.0434, "step": 980 }, { "epoch": 3.03680981595092, "grad_norm": 0.012170241214334965, "learning_rate": 4.815950920245399e-05, "loss": 0.0058, "step": 990 }, { "epoch": 3.067484662576687, "grad_norm": 0.010554679669439793, "learning_rate": 4.6625766871165645e-05, "loss": 0.0017, "step": 1000 }, { "epoch": 3.067484662576687, "eval_accuracy": 1.0, "eval_loss": 0.0030439873225986958, "eval_runtime": 0.4699, "eval_samples_per_second": 34.049, "eval_steps_per_second": 4.256, "step": 1000 }, { "epoch": 3.098159509202454, "grad_norm": 0.010963929817080498, "learning_rate": 4.5092024539877307e-05, "loss": 0.0241, "step": 1010 }, { "epoch": 3.128834355828221, "grad_norm": 0.010549047961831093, "learning_rate": 4.3558282208588955e-05, "loss": 0.0015, "step": 1020 }, { "epoch": 3.1595092024539877, "grad_norm": 0.012895047664642334, "learning_rate": 4.2024539877300617e-05, "loss": 0.0021, "step": 1030 }, { "epoch": 3.190184049079755, "grad_norm": 0.009695703163743019, "learning_rate": 4.049079754601227e-05, "loss": 0.0013, "step": 1040 }, { "epoch": 3.2208588957055215, "grad_norm": 0.01083831675350666, "learning_rate": 3.895705521472393e-05, "loss": 0.0018, "step": 1050 }, { "epoch": 3.2515337423312882, "grad_norm": 0.030923034995794296, "learning_rate": 3.742331288343559e-05, "loss": 0.0012, "step": 1060 }, { "epoch": 3.2822085889570554, "grad_norm": 0.009053406305611134, "learning_rate": 3.5889570552147236e-05, "loss": 0.0011, "step": 1070 }, { "epoch": 3.312883435582822, "grad_norm": 0.009371621534228325, "learning_rate": 3.43558282208589e-05, "loss": 0.0011, "step": 1080 }, { "epoch": 3.3435582822085887, "grad_norm": 0.009119446389377117, "learning_rate": 3.282208588957055e-05, "loss": 0.0011, "step": 1090 }, { "epoch": 3.374233128834356, "grad_norm": 0.009182000532746315, "learning_rate": 3.1288343558282215e-05, "loss": 0.0054, "step": 1100 }, { "epoch": 3.374233128834356, "eval_accuracy": 1.0, "eval_loss": 0.036368854343891144, "eval_runtime": 0.4537, "eval_samples_per_second": 35.267, "eval_steps_per_second": 4.408, "step": 1100 }, { "epoch": 3.4049079754601226, "grad_norm": 0.00895740371197462, "learning_rate": 2.9754601226993867e-05, "loss": 0.0017, "step": 1110 }, { "epoch": 3.4355828220858897, "grad_norm": 0.20501597225666046, "learning_rate": 2.822085889570552e-05, "loss": 0.0013, "step": 1120 }, { "epoch": 3.4662576687116564, "grad_norm": 0.009406461380422115, "learning_rate": 2.668711656441718e-05, "loss": 0.0011, "step": 1130 }, { "epoch": 3.4969325153374236, "grad_norm": 0.009192903526127338, "learning_rate": 2.5153374233128835e-05, "loss": 0.0012, "step": 1140 }, { "epoch": 3.5276073619631902, "grad_norm": 0.009342888370156288, "learning_rate": 2.361963190184049e-05, "loss": 0.0362, "step": 1150 }, { "epoch": 3.558282208588957, "grad_norm": 0.01002499833703041, "learning_rate": 2.208588957055215e-05, "loss": 0.0011, "step": 1160 }, { "epoch": 3.588957055214724, "grad_norm": 0.01044855359941721, "learning_rate": 2.0552147239263807e-05, "loss": 0.0011, "step": 1170 }, { "epoch": 3.6196319018404908, "grad_norm": 0.030983537435531616, "learning_rate": 1.9018404907975462e-05, "loss": 0.0011, "step": 1180 }, { "epoch": 3.6503067484662575, "grad_norm": 0.009104466997087002, "learning_rate": 1.7484662576687117e-05, "loss": 0.0012, "step": 1190 }, { "epoch": 3.6809815950920246, "grad_norm": 0.009420313872396946, "learning_rate": 1.5950920245398772e-05, "loss": 0.0234, "step": 1200 }, { "epoch": 3.6809815950920246, "eval_accuracy": 0.875, "eval_loss": 0.23103433847427368, "eval_runtime": 0.4704, "eval_samples_per_second": 34.012, "eval_steps_per_second": 4.252, "step": 1200 }, { "epoch": 3.7116564417177913, "grad_norm": 0.00850239023566246, "learning_rate": 1.441717791411043e-05, "loss": 0.001, "step": 1210 }, { "epoch": 3.7423312883435584, "grad_norm": 0.008601406589150429, "learning_rate": 1.2883435582822087e-05, "loss": 0.0013, "step": 1220 }, { "epoch": 3.773006134969325, "grad_norm": 0.011089220643043518, "learning_rate": 1.1349693251533742e-05, "loss": 0.0011, "step": 1230 }, { "epoch": 3.8036809815950923, "grad_norm": 0.008406179025769234, "learning_rate": 9.8159509202454e-06, "loss": 0.001, "step": 1240 }, { "epoch": 3.834355828220859, "grad_norm": 0.009516136720776558, "learning_rate": 8.282208588957055e-06, "loss": 0.0012, "step": 1250 }, { "epoch": 3.8650306748466257, "grad_norm": 0.011050822213292122, "learning_rate": 6.748466257668712e-06, "loss": 0.0012, "step": 1260 }, { "epoch": 3.895705521472393, "grad_norm": 0.008980349637567997, "learning_rate": 5.214723926380368e-06, "loss": 0.0012, "step": 1270 }, { "epoch": 3.9263803680981595, "grad_norm": 0.00903311651200056, "learning_rate": 3.680981595092025e-06, "loss": 0.0014, "step": 1280 }, { "epoch": 3.957055214723926, "grad_norm": 0.00838613323867321, "learning_rate": 2.147239263803681e-06, "loss": 0.0253, "step": 1290 }, { "epoch": 3.9877300613496933, "grad_norm": 0.008292277343571186, "learning_rate": 6.134969325153375e-07, "loss": 0.0076, "step": 1300 }, { "epoch": 3.9877300613496933, "eval_accuracy": 0.875, "eval_loss": 0.40373101830482483, "eval_runtime": 0.4908, "eval_samples_per_second": 32.599, "eval_steps_per_second": 4.075, "step": 1300 }, { "epoch": 4.0, "step": 1304, "total_flos": 1.6167928713188475e+18, "train_loss": 0.04957773063711799, "train_runtime": 1223.9715, "train_samples_per_second": 17.046, "train_steps_per_second": 1.065 } ], "logging_steps": 10, "max_steps": 1304, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.6167928713188475e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }