{
  "best_metric": 2.8364877700805664,
  "best_model_checkpoint": "test-hasy-6/checkpoint-1082",
  "epoch": 2.0,
  "eval_steps": 100,
  "global_step": 1082,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02,
      "grad_norm": 3.711106538772583,
      "learning_rate": 1.9815157116451017e-05,
      "loss": 3.6832,
      "step": 10
    },
    {
      "epoch": 0.04,
      "grad_norm": 3.2622244358062744,
      "learning_rate": 1.9630314232902035e-05,
      "loss": 3.6516,
      "step": 20
    },
    {
      "epoch": 0.06,
      "grad_norm": 3.6894755363464355,
      "learning_rate": 1.944547134935305e-05,
      "loss": 3.6016,
      "step": 30
    },
    {
      "epoch": 0.07,
      "grad_norm": 3.1826202869415283,
      "learning_rate": 1.9260628465804068e-05,
      "loss": 3.5555,
      "step": 40
    },
    {
      "epoch": 0.09,
      "grad_norm": 3.9819042682647705,
      "learning_rate": 1.9075785582255083e-05,
      "loss": 3.4945,
      "step": 50
    },
    {
      "epoch": 0.11,
      "grad_norm": 3.705105781555176,
      "learning_rate": 1.88909426987061e-05,
      "loss": 3.5038,
      "step": 60
    },
    {
      "epoch": 0.13,
      "grad_norm": 3.76656174659729,
      "learning_rate": 1.8706099815157116e-05,
      "loss": 3.472,
      "step": 70
    },
    {
      "epoch": 0.15,
      "grad_norm": 3.7350897789001465,
      "learning_rate": 1.8521256931608135e-05,
      "loss": 3.6583,
      "step": 80
    },
    {
      "epoch": 0.17,
      "grad_norm": 3.705801486968994,
      "learning_rate": 1.833641404805915e-05,
      "loss": 3.7701,
      "step": 90
    },
    {
      "epoch": 0.18,
      "grad_norm": 4.12357759475708,
      "learning_rate": 1.8151571164510168e-05,
      "loss": 3.606,
      "step": 100
    },
    {
      "epoch": 0.2,
      "grad_norm": 3.3650107383728027,
      "learning_rate": 1.7966728280961186e-05,
      "loss": 3.5468,
      "step": 110
    },
    {
      "epoch": 0.22,
      "grad_norm": 3.4115757942199707,
      "learning_rate": 1.77818853974122e-05,
      "loss": 3.7078,
      "step": 120
    },
    {
      "epoch": 0.24,
      "grad_norm": 3.238929271697998,
      "learning_rate": 1.759704251386322e-05,
      "loss": 3.497,
      "step": 130
    },
    {
      "epoch": 0.26,
      "grad_norm": 4.156219005584717,
      "learning_rate": 1.7412199630314234e-05,
      "loss": 3.4416,
      "step": 140
    },
    {
      "epoch": 0.28,
      "grad_norm": 2.9438037872314453,
      "learning_rate": 1.7227356746765253e-05,
      "loss": 3.4672,
      "step": 150
    },
    {
      "epoch": 0.3,
      "grad_norm": 4.352924346923828,
      "learning_rate": 1.7042513863216268e-05,
      "loss": 3.6725,
      "step": 160
    },
    {
      "epoch": 0.31,
      "grad_norm": 3.657327890396118,
      "learning_rate": 1.6857670979667286e-05,
      "loss": 3.734,
      "step": 170
    },
    {
      "epoch": 0.33,
      "grad_norm": 3.249286651611328,
      "learning_rate": 1.66728280961183e-05,
      "loss": 3.5018,
      "step": 180
    },
    {
      "epoch": 0.35,
      "grad_norm": 3.500436305999756,
      "learning_rate": 1.6487985212569316e-05,
      "loss": 3.4335,
      "step": 190
    },
    {
      "epoch": 0.37,
      "grad_norm": 3.593245029449463,
      "learning_rate": 1.6303142329020334e-05,
      "loss": 3.4789,
      "step": 200
    },
    {
      "epoch": 0.39,
      "grad_norm": 5.4212646484375,
      "learning_rate": 1.611829944547135e-05,
      "loss": 3.679,
      "step": 210
    },
    {
      "epoch": 0.41,
      "grad_norm": 4.081950664520264,
      "learning_rate": 1.5933456561922367e-05,
      "loss": 3.3923,
      "step": 220
    },
    {
      "epoch": 0.43,
      "grad_norm": 3.5438432693481445,
      "learning_rate": 1.5748613678373382e-05,
      "loss": 3.6173,
      "step": 230
    },
    {
      "epoch": 0.44,
      "grad_norm": 3.3405466079711914,
      "learning_rate": 1.55637707948244e-05,
      "loss": 3.4723,
      "step": 240
    },
    {
      "epoch": 0.46,
      "grad_norm": 4.1731061935424805,
      "learning_rate": 1.5378927911275416e-05,
      "loss": 3.4844,
      "step": 250
    },
    {
      "epoch": 0.48,
      "grad_norm": 3.103394031524658,
      "learning_rate": 1.5194085027726432e-05,
      "loss": 3.3946,
      "step": 260
    },
    {
      "epoch": 0.5,
      "grad_norm": 3.5173914432525635,
      "learning_rate": 1.5009242144177449e-05,
      "loss": 3.5343,
      "step": 270
    },
    {
      "epoch": 0.52,
      "grad_norm": 4.252523899078369,
      "learning_rate": 1.4824399260628467e-05,
      "loss": 3.4245,
      "step": 280
    },
    {
      "epoch": 0.54,
      "grad_norm": 4.194942474365234,
      "learning_rate": 1.4639556377079484e-05,
      "loss": 3.457,
      "step": 290
    },
    {
      "epoch": 0.55,
      "grad_norm": 3.6533360481262207,
      "learning_rate": 1.44547134935305e-05,
      "loss": 3.3204,
      "step": 300
    },
    {
      "epoch": 0.57,
      "grad_norm": 3.9382996559143066,
      "learning_rate": 1.4269870609981517e-05,
      "loss": 3.3361,
      "step": 310
    },
    {
      "epoch": 0.59,
      "grad_norm": 4.06477165222168,
      "learning_rate": 1.4085027726432534e-05,
      "loss": 3.3031,
      "step": 320
    },
    {
      "epoch": 0.61,
      "grad_norm": 4.143244743347168,
      "learning_rate": 1.390018484288355e-05,
      "loss": 3.207,
      "step": 330
    },
    {
      "epoch": 0.63,
      "grad_norm": 4.467826843261719,
      "learning_rate": 1.3715341959334567e-05,
      "loss": 3.3971,
      "step": 340
    },
    {
      "epoch": 0.65,
      "grad_norm": 3.793196678161621,
      "learning_rate": 1.3530499075785584e-05,
      "loss": 3.3889,
      "step": 350
    },
    {
      "epoch": 0.67,
      "grad_norm": 3.1169040203094482,
      "learning_rate": 1.33456561922366e-05,
      "loss": 3.3227,
      "step": 360
    },
    {
      "epoch": 0.68,
      "grad_norm": 4.126261234283447,
      "learning_rate": 1.3160813308687617e-05,
      "loss": 3.4843,
      "step": 370
    },
    {
      "epoch": 0.7,
      "grad_norm": 4.5660223960876465,
      "learning_rate": 1.2975970425138634e-05,
      "loss": 3.3331,
      "step": 380
    },
    {
      "epoch": 0.72,
      "grad_norm": 4.103394985198975,
      "learning_rate": 1.279112754158965e-05,
      "loss": 3.4159,
      "step": 390
    },
    {
      "epoch": 0.74,
      "grad_norm": 4.094814300537109,
      "learning_rate": 1.2606284658040667e-05,
      "loss": 3.2274,
      "step": 400
    },
    {
      "epoch": 0.76,
      "grad_norm": 4.130598068237305,
      "learning_rate": 1.2421441774491683e-05,
      "loss": 3.2463,
      "step": 410
    },
    {
      "epoch": 0.78,
      "grad_norm": 3.7482187747955322,
      "learning_rate": 1.2236598890942698e-05,
      "loss": 3.2457,
      "step": 420
    },
    {
      "epoch": 0.79,
      "grad_norm": 3.43017840385437,
      "learning_rate": 1.2051756007393715e-05,
      "loss": 3.5525,
      "step": 430
    },
    {
      "epoch": 0.81,
      "grad_norm": 3.194152355194092,
      "learning_rate": 1.1866913123844732e-05,
      "loss": 3.5108,
      "step": 440
    },
    {
      "epoch": 0.83,
      "grad_norm": 3.5279622077941895,
      "learning_rate": 1.1682070240295748e-05,
      "loss": 3.3414,
      "step": 450
    },
    {
      "epoch": 0.85,
      "grad_norm": 3.6658153533935547,
      "learning_rate": 1.1497227356746765e-05,
      "loss": 3.4046,
      "step": 460
    },
    {
      "epoch": 0.87,
      "grad_norm": 3.5500826835632324,
      "learning_rate": 1.1312384473197783e-05,
      "loss": 3.0882,
      "step": 470
    },
    {
      "epoch": 0.89,
      "grad_norm": 5.9582953453063965,
      "learning_rate": 1.11275415896488e-05,
      "loss": 3.192,
      "step": 480
    },
    {
      "epoch": 0.91,
      "grad_norm": 5.833331108093262,
      "learning_rate": 1.0942698706099817e-05,
      "loss": 3.5321,
      "step": 490
    },
    {
      "epoch": 0.92,
      "grad_norm": 3.8356950283050537,
      "learning_rate": 1.0757855822550833e-05,
      "loss": 3.6224,
      "step": 500
    },
    {
      "epoch": 0.94,
      "grad_norm": 4.684799671173096,
      "learning_rate": 1.057301293900185e-05,
      "loss": 3.3353,
      "step": 510
    },
    {
      "epoch": 0.96,
      "grad_norm": 3.551720142364502,
      "learning_rate": 1.0388170055452866e-05,
      "loss": 3.495,
      "step": 520
    },
    {
      "epoch": 0.98,
      "grad_norm": 3.6974239349365234,
      "learning_rate": 1.0203327171903883e-05,
      "loss": 3.3445,
      "step": 530
    },
    {
      "epoch": 1.0,
      "grad_norm": 4.8344950675964355,
      "learning_rate": 1.00184842883549e-05,
      "loss": 3.5095,
      "step": 540
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.46361746361746364,
      "eval_loss": 2.99053692817688,
      "eval_runtime": 1.5396,
      "eval_samples_per_second": 312.424,
      "eval_steps_per_second": 39.621,
      "step": 541
    },
    {
      "epoch": 1.02,
      "grad_norm": 4.154804706573486,
      "learning_rate": 9.833641404805916e-06,
      "loss": 3.3211,
      "step": 550
    },
    {
      "epoch": 1.04,
      "grad_norm": 3.6485326290130615,
      "learning_rate": 9.648798521256933e-06,
      "loss": 3.2122,
      "step": 560
    },
    {
      "epoch": 1.05,
      "grad_norm": 4.6978912353515625,
      "learning_rate": 9.46395563770795e-06,
      "loss": 3.3578,
      "step": 570
    },
    {
      "epoch": 1.07,
      "grad_norm": 3.3605403900146484,
      "learning_rate": 9.279112754158966e-06,
      "loss": 3.2178,
      "step": 580
    },
    {
      "epoch": 1.09,
      "grad_norm": 4.516849517822266,
      "learning_rate": 9.094269870609981e-06,
      "loss": 3.3387,
      "step": 590
    },
    {
      "epoch": 1.11,
      "grad_norm": 3.1765856742858887,
      "learning_rate": 8.909426987060998e-06,
      "loss": 3.3164,
      "step": 600
    },
    {
      "epoch": 1.13,
      "grad_norm": 3.9158923625946045,
      "learning_rate": 8.724584103512016e-06,
      "loss": 3.2342,
      "step": 610
    },
    {
      "epoch": 1.15,
      "grad_norm": 3.418714761734009,
      "learning_rate": 8.539741219963033e-06,
      "loss": 3.3066,
      "step": 620
    },
    {
      "epoch": 1.16,
      "grad_norm": 4.705308437347412,
      "learning_rate": 8.35489833641405e-06,
      "loss": 3.0839,
      "step": 630
    },
    {
      "epoch": 1.18,
      "grad_norm": 3.6252079010009766,
      "learning_rate": 8.170055452865066e-06,
      "loss": 3.0772,
      "step": 640
    },
    {
      "epoch": 1.2,
      "grad_norm": 3.9924700260162354,
      "learning_rate": 7.985212569316083e-06,
      "loss": 3.2432,
      "step": 650
    },
    {
      "epoch": 1.22,
      "grad_norm": 4.91044282913208,
      "learning_rate": 7.8003696857671e-06,
      "loss": 3.2282,
      "step": 660
    },
    {
      "epoch": 1.24,
      "grad_norm": 4.189777374267578,
      "learning_rate": 7.615526802218115e-06,
      "loss": 3.1491,
      "step": 670
    },
    {
      "epoch": 1.26,
      "grad_norm": 6.070817947387695,
      "learning_rate": 7.430683918669132e-06,
      "loss": 2.9568,
      "step": 680
    },
    {
      "epoch": 1.28,
      "grad_norm": 3.7259085178375244,
      "learning_rate": 7.245841035120148e-06,
      "loss": 3.3533,
      "step": 690
    },
    {
      "epoch": 1.29,
      "grad_norm": 5.180878639221191,
      "learning_rate": 7.060998151571166e-06,
      "loss": 3.2111,
      "step": 700
    },
    {
      "epoch": 1.31,
      "grad_norm": 3.779390811920166,
      "learning_rate": 6.876155268022182e-06,
      "loss": 3.1691,
      "step": 710
    },
    {
      "epoch": 1.33,
      "grad_norm": 3.938795566558838,
      "learning_rate": 6.691312384473199e-06,
      "loss": 3.0765,
      "step": 720
    },
    {
      "epoch": 1.35,
      "grad_norm": 4.160928726196289,
      "learning_rate": 6.506469500924215e-06,
      "loss": 3.2189,
      "step": 730
    },
    {
      "epoch": 1.37,
      "grad_norm": 4.43211555480957,
      "learning_rate": 6.321626617375231e-06,
      "loss": 3.4135,
      "step": 740
    },
    {
      "epoch": 1.39,
      "grad_norm": 4.024065971374512,
      "learning_rate": 6.136783733826248e-06,
      "loss": 3.0384,
      "step": 750
    },
    {
      "epoch": 1.4,
      "grad_norm": 4.307715892791748,
      "learning_rate": 5.951940850277265e-06,
      "loss": 3.429,
      "step": 760
    },
    {
      "epoch": 1.42,
      "grad_norm": 3.6535394191741943,
      "learning_rate": 5.767097966728281e-06,
      "loss": 3.4501,
      "step": 770
    },
    {
      "epoch": 1.44,
      "grad_norm": 3.8214519023895264,
      "learning_rate": 5.582255083179298e-06,
      "loss": 3.2541,
      "step": 780
    },
    {
      "epoch": 1.46,
      "grad_norm": 4.626838684082031,
      "learning_rate": 5.3974121996303146e-06,
      "loss": 3.2509,
      "step": 790
    },
    {
      "epoch": 1.48,
      "grad_norm": 5.32305383682251,
      "learning_rate": 5.212569316081332e-06,
      "loss": 3.1942,
      "step": 800
    },
    {
      "epoch": 1.5,
      "grad_norm": 6.955519199371338,
      "learning_rate": 5.027726432532349e-06,
      "loss": 3.0025,
      "step": 810
    },
    {
      "epoch": 1.52,
      "grad_norm": 5.317496299743652,
      "learning_rate": 4.8428835489833645e-06,
      "loss": 3.02,
      "step": 820
    },
    {
      "epoch": 1.53,
      "grad_norm": 3.5093274116516113,
      "learning_rate": 4.658040665434381e-06,
      "loss": 3.2135,
      "step": 830
    },
    {
      "epoch": 1.55,
      "grad_norm": 5.330411434173584,
      "learning_rate": 4.473197781885398e-06,
      "loss": 3.1959,
      "step": 840
    },
    {
      "epoch": 1.57,
      "grad_norm": 6.167944431304932,
      "learning_rate": 4.288354898336414e-06,
      "loss": 3.2624,
      "step": 850
    },
    {
      "epoch": 1.59,
      "grad_norm": 5.011616230010986,
      "learning_rate": 4.103512014787431e-06,
      "loss": 3.1496,
      "step": 860
    },
    {
      "epoch": 1.61,
      "grad_norm": 4.2308549880981445,
      "learning_rate": 3.918669131238448e-06,
      "loss": 3.2032,
      "step": 870
    },
    {
      "epoch": 1.63,
      "grad_norm": 4.570101261138916,
      "learning_rate": 3.7338262476894642e-06,
      "loss": 3.1373,
      "step": 880
    },
    {
      "epoch": 1.65,
      "grad_norm": 5.977038860321045,
      "learning_rate": 3.548983364140481e-06,
      "loss": 3.2066,
      "step": 890
    },
    {
      "epoch": 1.66,
      "grad_norm": 3.6211962699890137,
      "learning_rate": 3.3641404805914975e-06,
      "loss": 3.229,
      "step": 900
    },
    {
      "epoch": 1.68,
      "grad_norm": 4.511559963226318,
      "learning_rate": 3.1792975970425146e-06,
      "loss": 3.2265,
      "step": 910
    },
    {
      "epoch": 1.7,
      "grad_norm": 4.407166004180908,
      "learning_rate": 2.9944547134935308e-06,
      "loss": 3.1788,
      "step": 920
    },
    {
      "epoch": 1.72,
      "grad_norm": 3.707716941833496,
      "learning_rate": 2.8096118299445474e-06,
      "loss": 3.0538,
      "step": 930
    },
    {
      "epoch": 1.74,
      "grad_norm": 3.5526440143585205,
      "learning_rate": 2.624768946395564e-06,
      "loss": 3.232,
      "step": 940
    },
    {
      "epoch": 1.76,
      "grad_norm": 4.847479820251465,
      "learning_rate": 2.4399260628465807e-06,
      "loss": 3.1423,
      "step": 950
    },
    {
      "epoch": 1.77,
      "grad_norm": 5.645847797393799,
      "learning_rate": 2.2550831792975973e-06,
      "loss": 3.015,
      "step": 960
    },
    {
      "epoch": 1.79,
      "grad_norm": 4.522593021392822,
      "learning_rate": 2.070240295748614e-06,
      "loss": 3.2805,
      "step": 970
    },
    {
      "epoch": 1.81,
      "grad_norm": 4.721327304840088,
      "learning_rate": 1.8853974121996305e-06,
      "loss": 3.324,
      "step": 980
    },
    {
      "epoch": 1.83,
      "grad_norm": 3.5393543243408203,
      "learning_rate": 1.700554528650647e-06,
      "loss": 3.0815,
      "step": 990
    },
    {
      "epoch": 1.85,
      "grad_norm": 5.153535842895508,
      "learning_rate": 1.5157116451016638e-06,
      "loss": 3.0635,
      "step": 1000
    },
    {
      "epoch": 1.87,
      "grad_norm": 4.251449108123779,
      "learning_rate": 1.3308687615526802e-06,
      "loss": 2.9802,
      "step": 1010
    },
    {
      "epoch": 1.89,
      "grad_norm": 3.994425058364868,
      "learning_rate": 1.1460258780036969e-06,
      "loss": 3.5064,
      "step": 1020
    },
    {
      "epoch": 1.9,
      "grad_norm": 4.248856544494629,
      "learning_rate": 9.611829944547135e-07,
      "loss": 2.9155,
      "step": 1030
    },
    {
      "epoch": 1.92,
      "grad_norm": 5.162292957305908,
      "learning_rate": 7.763401109057302e-07,
      "loss": 3.2488,
      "step": 1040
    },
    {
      "epoch": 1.94,
      "grad_norm": 4.9712443351745605,
      "learning_rate": 5.914972273567468e-07,
      "loss": 3.3756,
      "step": 1050
    },
    {
      "epoch": 1.96,
      "grad_norm": 4.459498405456543,
      "learning_rate": 4.066543438077634e-07,
      "loss": 3.1618,
      "step": 1060
    },
    {
      "epoch": 1.98,
      "grad_norm": 4.354925155639648,
      "learning_rate": 2.2181146025878005e-07,
      "loss": 3.1594,
      "step": 1070
    },
    {
      "epoch": 2.0,
      "grad_norm": 6.441626071929932,
      "learning_rate": 3.696857670979668e-08,
      "loss": 2.8453,
      "step": 1080
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.4968814968814969,
      "eval_loss": 2.8364877700805664,
      "eval_runtime": 1.6024,
      "eval_samples_per_second": 300.173,
      "eval_steps_per_second": 38.068,
      "step": 1082
    },
    {
      "epoch": 2.0,
      "step": 1082,
      "total_flos": 6.710386542096384e+17,
      "train_loss": 3.3293299987884635,
      "train_runtime": 76.1938,
      "train_samples_per_second": 113.526,
      "train_steps_per_second": 14.201
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.4968814968814969,
      "eval_loss": 2.8364877700805664,
      "eval_runtime": 1.534,
      "eval_samples_per_second": 313.565,
      "eval_steps_per_second": 39.766,
      "step": 1082
    }
  ],
  "logging_steps": 10,
  "max_steps": 1082,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 100,
  "total_flos": 6.710386542096384e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}