{
  "best_metric": 0.6764523983001709,
  "best_model_checkpoint": "autotrain-l21an-6mkt7/checkpoint-3000",
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 3000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.025, "grad_norm": 3.444312810897827, "learning_rate": 2.4999999999999998e-06, "loss": 0.6305, "step": 25},
    {"epoch": 0.05, "grad_norm": 4.167919635772705, "learning_rate": 4.9999999999999996e-06, "loss": 0.7491, "step": 50},
    {"epoch": 0.075, "grad_norm": 2.0621402263641357, "learning_rate": 7.5e-06, "loss": 0.6931, "step": 75},
    {"epoch": 0.1, "grad_norm": 2.2826991081237793, "learning_rate": 9.999999999999999e-06, "loss": 0.7093, "step": 100},
    {"epoch": 0.125, "grad_norm": 2.8610947132110596, "learning_rate": 1.25e-05, "loss": 0.631, "step": 125},
    {"epoch": 0.15, "grad_norm": 2.2747015953063965, "learning_rate": 1.5e-05, "loss": 0.6127, "step": 150},
    {"epoch": 0.175, "grad_norm": 1.3792694807052612, "learning_rate": 1.7500000000000002e-05, "loss": 0.5963, "step": 175},
    {"epoch": 0.2, "grad_norm": 2.0398032665252686, "learning_rate": 1.9999999999999998e-05, "loss": 0.6556, "step": 200},
    {"epoch": 0.225, "grad_norm": 1.9805101156234741, "learning_rate": 2.25e-05, "loss": 0.6831, "step": 225},
    {"epoch": 0.25, "grad_norm": 1.3323885202407837, "learning_rate": 2.5e-05, "loss": 0.6389, "step": 250},
    {"epoch": 0.275, "grad_norm": 1.6555352210998535, "learning_rate": 2.75e-05, "loss": 0.6913, "step": 275},
    {"epoch": 0.3, "grad_norm": 1.4770078659057617, "learning_rate": 3e-05, "loss": 0.6447, "step": 300},
    {"epoch": 0.325, "grad_norm": 1.9949244260787964, "learning_rate": 2.9722222222222223e-05, "loss": 0.6993, "step": 325},
    {"epoch": 0.35, "grad_norm": 1.2942368984222412, "learning_rate": 2.9444444444444445e-05, "loss": 0.6581, "step": 350},
    {"epoch": 0.375, "grad_norm": 1.8725063800811768, "learning_rate": 2.9166666666666666e-05, "loss": 0.6695, "step": 375},
    {"epoch": 0.4, "grad_norm": 1.4848814010620117, "learning_rate": 2.8888888888888888e-05, "loss": 0.7076, "step": 400},
    {"epoch": 0.425, "grad_norm": 1.406736969947815, "learning_rate": 2.8611111111111113e-05, "loss": 0.6301, "step": 425},
    {"epoch": 0.45, "grad_norm": 1.2826756238937378, "learning_rate": 2.8333333333333332e-05, "loss": 0.6121, "step": 450},
    {"epoch": 0.475, "grad_norm": 1.0705897808074951, "learning_rate": 2.8055555555555557e-05, "loss": 0.6439, "step": 475},
    {"epoch": 0.5, "grad_norm": 1.7978061437606812, "learning_rate": 2.777777777777778e-05, "loss": 0.6782, "step": 500},
    {"epoch": 0.525, "grad_norm": 1.2017405033111572, "learning_rate": 2.75e-05, "loss": 0.7025, "step": 525},
    {"epoch": 0.55, "grad_norm": 1.4544589519500732, "learning_rate": 2.7222222222222223e-05, "loss": 0.7228, "step": 550},
    {"epoch": 0.575, "grad_norm": 2.094083070755005, "learning_rate": 2.6944444444444445e-05, "loss": 0.6065, "step": 575},
    {"epoch": 0.6, "grad_norm": 1.4134550094604492, "learning_rate": 2.6666666666666667e-05, "loss": 0.6496, "step": 600},
    {"epoch": 0.625, "grad_norm": 1.1640647649765015, "learning_rate": 2.6388888888888892e-05, "loss": 0.6816, "step": 625},
    {"epoch": 0.65, "grad_norm": 1.7982358932495117, "learning_rate": 2.611111111111111e-05, "loss": 0.6302, "step": 650},
    {"epoch": 0.675, "grad_norm": 1.6336771249771118, "learning_rate": 2.5833333333333336e-05, "loss": 0.692, "step": 675},
    {"epoch": 0.7, "grad_norm": 1.2203083038330078, "learning_rate": 2.5555555555555557e-05, "loss": 0.7533, "step": 700},
    {"epoch": 0.725, "grad_norm": 1.4167596101760864, "learning_rate": 2.5277777777777776e-05, "loss": 0.6567, "step": 725},
    {"epoch": 0.75, "grad_norm": 2.341327667236328, "learning_rate": 2.5e-05, "loss": 0.6472, "step": 750},
    {"epoch": 0.775, "grad_norm": 1.1488889455795288, "learning_rate": 2.4722222222222223e-05, "loss": 0.6461, "step": 775},
    {"epoch": 0.8, "grad_norm": 1.6008880138397217, "learning_rate": 2.4444444444444445e-05, "loss": 0.661, "step": 800},
    {"epoch": 0.825, "grad_norm": 1.6397242546081543, "learning_rate": 2.4166666666666667e-05, "loss": 0.6897, "step": 825},
    {"epoch": 0.85, "grad_norm": 1.567724347114563, "learning_rate": 2.388888888888889e-05, "loss": 0.6097, "step": 850},
    {"epoch": 0.875, "grad_norm": 1.290542483329773, "learning_rate": 2.3611111111111114e-05, "loss": 0.6284, "step": 875},
    {"epoch": 0.9, "grad_norm": 1.4457989931106567, "learning_rate": 2.3333333333333336e-05, "loss": 0.5923, "step": 900},
    {"epoch": 0.925, "grad_norm": 1.1782793998718262, "learning_rate": 2.3055555555555554e-05, "loss": 0.6642, "step": 925},
    {"epoch": 0.95, "grad_norm": 1.2478561401367188, "learning_rate": 2.277777777777778e-05, "loss": 0.6531, "step": 950},
    {"epoch": 0.975, "grad_norm": 1.3522217273712158, "learning_rate": 2.25e-05, "loss": 0.6705, "step": 975},
    {"epoch": 1.0, "grad_norm": 1.4155220985412598, "learning_rate": 2.222222222222222e-05, "loss": 0.7137, "step": 1000},
    {"epoch": 1.0, "eval_loss": 0.6765261292457581, "eval_runtime": 37.4092, "eval_samples_per_second": 53.463, "eval_steps_per_second": 3.341, "step": 1000},
    {"epoch": 1.025, "grad_norm": 1.0540518760681152, "learning_rate": 2.1944444444444445e-05, "loss": 0.6601, "step": 1025},
    {"epoch": 1.05, "grad_norm": 1.219468116760254, "learning_rate": 2.1666666666666667e-05, "loss": 0.6739, "step": 1050},
    {"epoch": 1.075, "grad_norm": 1.1928082704544067, "learning_rate": 2.138888888888889e-05, "loss": 0.6487, "step": 1075},
    {"epoch": 1.1, "grad_norm": 1.0191409587860107, "learning_rate": 2.111111111111111e-05, "loss": 0.6864, "step": 1100},
    {"epoch": 1.125, "grad_norm": 1.0731534957885742, "learning_rate": 2.0833333333333333e-05, "loss": 0.7744, "step": 1125},
    {"epoch": 1.15, "grad_norm": 1.1843361854553223, "learning_rate": 2.0555555555555558e-05, "loss": 0.6698, "step": 1150},
    {"epoch": 1.175, "grad_norm": 1.1600492000579834, "learning_rate": 2.027777777777778e-05, "loss": 0.6421, "step": 1175},
    {"epoch": 1.2, "grad_norm": 1.3744503259658813, "learning_rate": 1.9999999999999998e-05, "loss": 0.633, "step": 1200},
    {"epoch": 1.225, "grad_norm": 0.8186588287353516, "learning_rate": 1.9722222222222224e-05, "loss": 0.678, "step": 1225},
    {"epoch": 1.25, "grad_norm": 1.2602007389068604, "learning_rate": 1.9444444444444445e-05, "loss": 0.6264, "step": 1250},
    {"epoch": 1.275, "grad_norm": 1.5430500507354736, "learning_rate": 1.9166666666666667e-05, "loss": 0.721, "step": 1275},
    {"epoch": 1.3, "grad_norm": 1.6438603401184082, "learning_rate": 1.888888888888889e-05, "loss": 0.6736, "step": 1300},
    {"epoch": 1.325, "grad_norm": 1.1491776704788208, "learning_rate": 1.861111111111111e-05, "loss": 0.5332, "step": 1325},
    {"epoch": 1.35, "grad_norm": 1.087183952331543, "learning_rate": 1.8333333333333336e-05, "loss": 0.6576, "step": 1350},
    {"epoch": 1.375, "grad_norm": 1.6351675987243652, "learning_rate": 1.8055555555555555e-05, "loss": 0.6625, "step": 1375},
    {"epoch": 1.4, "grad_norm": 1.5467578172683716, "learning_rate": 1.7777777777777777e-05, "loss": 0.7248, "step": 1400},
    {"epoch": 1.425, "grad_norm": 1.1913565397262573, "learning_rate": 1.7500000000000002e-05, "loss": 0.6188, "step": 1425},
    {"epoch": 1.45, "grad_norm": 1.1346111297607422, "learning_rate": 1.7222222222222224e-05, "loss": 0.6452, "step": 1450},
    {"epoch": 1.475, "grad_norm": 1.3555978536605835, "learning_rate": 1.6944444444444442e-05, "loss": 0.7024, "step": 1475},
    {"epoch": 1.5, "grad_norm": 0.8716872930526733, "learning_rate": 1.6666666666666667e-05, "loss": 0.7005, "step": 1500},
    {"epoch": 1.525, "grad_norm": 0.9957846999168396, "learning_rate": 1.638888888888889e-05, "loss": 0.6219, "step": 1525},
    {"epoch": 1.55, "grad_norm": 2.1997838020324707, "learning_rate": 1.6111111111111115e-05, "loss": 0.6525, "step": 1550},
    {"epoch": 1.575, "grad_norm": 1.3860341310501099, "learning_rate": 1.5833333333333333e-05, "loss": 0.6718, "step": 1575},
    {"epoch": 1.6, "grad_norm": 0.8452956676483154, "learning_rate": 1.5555555555555555e-05, "loss": 0.6738, "step": 1600},
    {"epoch": 1.625, "grad_norm": 0.9731984734535217, "learning_rate": 1.527777777777778e-05, "loss": 0.6558, "step": 1625},
    {"epoch": 1.65, "grad_norm": 1.831750750541687, "learning_rate": 1.5e-05, "loss": 0.6236, "step": 1650},
    {"epoch": 1.675, "grad_norm": 1.6755101680755615, "learning_rate": 1.4722222222222222e-05, "loss": 0.7126, "step": 1675},
    {"epoch": 1.7, "grad_norm": 1.3757505416870117, "learning_rate": 1.4444444444444444e-05, "loss": 0.6822, "step": 1700},
    {"epoch": 1.725, "grad_norm": 1.377435326576233, "learning_rate": 1.4166666666666666e-05, "loss": 0.6324, "step": 1725},
    {"epoch": 1.75, "grad_norm": 1.0001251697540283, "learning_rate": 1.388888888888889e-05, "loss": 0.7036, "step": 1750},
    {"epoch": 1.775, "grad_norm": 1.0013527870178223, "learning_rate": 1.3611111111111111e-05, "loss": 0.6765, "step": 1775},
    {"epoch": 1.8, "grad_norm": 1.0746055841445923, "learning_rate": 1.3333333333333333e-05, "loss": 0.654, "step": 1800},
    {"epoch": 1.825, "grad_norm": 0.6743106842041016, "learning_rate": 1.3055555555555555e-05, "loss": 0.6923, "step": 1825},
    {"epoch": 1.85, "grad_norm": 0.9659077525138855, "learning_rate": 1.2777777777777779e-05, "loss": 0.6976, "step": 1850},
    {"epoch": 1.875, "grad_norm": 0.7309139966964722, "learning_rate": 1.25e-05, "loss": 0.6904, "step": 1875},
    {"epoch": 1.9, "grad_norm": 0.6239315271377563, "learning_rate": 1.2222222222222222e-05, "loss": 0.6307, "step": 1900},
    {"epoch": 1.925, "grad_norm": 1.3958375453948975, "learning_rate": 1.1944444444444444e-05, "loss": 0.6437, "step": 1925},
    {"epoch": 1.95, "grad_norm": 0.7617831230163574, "learning_rate": 1.1666666666666668e-05, "loss": 0.6333, "step": 1950},
    {"epoch": 1.975, "grad_norm": 0.8432300686836243, "learning_rate": 1.138888888888889e-05, "loss": 0.6214, "step": 1975},
    {"epoch": 2.0, "grad_norm": 1.236924409866333, "learning_rate": 1.111111111111111e-05, "loss": 0.6424, "step": 2000},
    {"epoch": 2.0, "eval_loss": 0.6764588952064514, "eval_runtime": 38.544, "eval_samples_per_second": 51.889, "eval_steps_per_second": 3.243, "step": 2000},
    {"epoch": 2.025, "grad_norm": 1.2284152507781982, "learning_rate": 1.0833333333333334e-05, "loss": 0.7041, "step": 2025},
    {"epoch": 2.05, "grad_norm": 0.9430116415023804, "learning_rate": 1.0555555555555555e-05, "loss": 0.7112, "step": 2050},
    {"epoch": 2.075, "grad_norm": 0.5471211075782776, "learning_rate": 1.0277777777777779e-05, "loss": 0.6696, "step": 2075},
    {"epoch": 2.1, "grad_norm": 0.9567949771881104, "learning_rate": 9.999999999999999e-06, "loss": 0.6739, "step": 2100},
    {"epoch": 2.125, "grad_norm": 0.633762001991272, "learning_rate": 9.722222222222223e-06, "loss": 0.6315, "step": 2125},
    {"epoch": 2.15, "grad_norm": 1.0539363622665405, "learning_rate": 9.444444444444445e-06, "loss": 0.7649, "step": 2150},
    {"epoch": 2.175, "grad_norm": 0.9735732078552246, "learning_rate": 9.166666666666668e-06, "loss": 0.7079, "step": 2175},
    {"epoch": 2.2, "grad_norm": 1.3620977401733398, "learning_rate": 8.888888888888888e-06, "loss": 0.6549, "step": 2200},
    {"epoch": 2.225, "grad_norm": 0.9268941879272461, "learning_rate": 8.611111111111112e-06, "loss": 0.6548, "step": 2225},
    {"epoch": 2.25, "grad_norm": 0.4519413709640503, "learning_rate": 8.333333333333334e-06, "loss": 0.6647, "step": 2250},
    {"epoch": 2.275, "grad_norm": 0.8613296747207642, "learning_rate": 8.055555555555557e-06, "loss": 0.7568, "step": 2275},
    {"epoch": 2.3, "grad_norm": 1.1156052350997925, "learning_rate": 7.777777777777777e-06, "loss": 0.6659, "step": 2300},
    {"epoch": 2.325, "grad_norm": 0.8070225119590759, "learning_rate": 7.5e-06, "loss": 0.604, "step": 2325},
    {"epoch": 2.35, "grad_norm": 1.2144221067428589, "learning_rate": 7.222222222222222e-06, "loss": 0.6342, "step": 2350},
    {"epoch": 2.375, "grad_norm": 0.8116927742958069, "learning_rate": 6.944444444444445e-06, "loss": 0.6891, "step": 2375},
    {"epoch": 2.4, "grad_norm": 0.8394978642463684, "learning_rate": 6.666666666666667e-06, "loss": 0.6856, "step": 2400},
    {"epoch": 2.425, "grad_norm": 1.1582024097442627, "learning_rate": 6.388888888888889e-06, "loss": 0.6683, "step": 2425},
    {"epoch": 2.45, "grad_norm": 0.9621151089668274, "learning_rate": 6.111111111111111e-06, "loss": 0.678, "step": 2450},
    {"epoch": 2.475, "grad_norm": 1.0509181022644043, "learning_rate": 5.833333333333334e-06, "loss": 0.7102, "step": 2475},
    {"epoch": 2.5, "grad_norm": 0.7675669193267822, "learning_rate": 5.555555555555555e-06, "loss": 0.6606, "step": 2500},
    {"epoch": 2.525, "grad_norm": 0.9356604218482971, "learning_rate": 5.277777777777778e-06, "loss": 0.6634, "step": 2525},
    {"epoch": 2.55, "grad_norm": 1.1379098892211914, "learning_rate": 4.9999999999999996e-06, "loss": 0.6443, "step": 2550},
    {"epoch": 2.575, "grad_norm": 1.0013926029205322, "learning_rate": 4.722222222222222e-06, "loss": 0.6122, "step": 2575},
    {"epoch": 2.6, "grad_norm": 0.771693229675293, "learning_rate": 4.444444444444444e-06, "loss": 0.6926, "step": 2600},
    {"epoch": 2.625, "grad_norm": 0.7376611232757568, "learning_rate": 4.166666666666667e-06, "loss": 0.5957, "step": 2625},
    {"epoch": 2.65, "grad_norm": 0.7340726256370544, "learning_rate": 3.888888888888889e-06, "loss": 0.6933, "step": 2650},
    {"epoch": 2.675, "grad_norm": 0.7760947942733765, "learning_rate": 3.611111111111111e-06, "loss": 0.691, "step": 2675},
    {"epoch": 2.7, "grad_norm": 0.9809922575950623, "learning_rate": 3.3333333333333333e-06, "loss": 0.7015, "step": 2700},
    {"epoch": 2.725, "grad_norm": 0.9861670732498169, "learning_rate": 3.0555555555555556e-06, "loss": 0.7057, "step": 2725},
    {"epoch": 2.75, "grad_norm": 0.8055828809738159, "learning_rate": 2.7777777777777775e-06, "loss": 0.6386, "step": 2750},
    {"epoch": 2.775, "grad_norm": 1.0951838493347168, "learning_rate": 2.4999999999999998e-06, "loss": 0.6868, "step": 2775},
    {"epoch": 2.8, "grad_norm": 1.086242437362671, "learning_rate": 2.222222222222222e-06, "loss": 0.6992, "step": 2800},
    {"epoch": 2.825, "grad_norm": 0.6613348126411438, "learning_rate": 1.9444444444444444e-06, "loss": 0.6338, "step": 2825},
    {"epoch": 2.85, "grad_norm": 0.944501519203186, "learning_rate": 1.6666666666666667e-06, "loss": 0.6175, "step": 2850},
    {"epoch": 2.875, "grad_norm": 0.5407629609107971, "learning_rate": 1.3888888888888887e-06, "loss": 0.6125, "step": 2875},
    {"epoch": 2.9, "grad_norm": 1.0618243217468262, "learning_rate": 1.111111111111111e-06, "loss": 0.6675, "step": 2900},
    {"epoch": 2.925, "grad_norm": 0.6185476183891296, "learning_rate": 8.333333333333333e-07, "loss": 0.6369, "step": 2925},
    {"epoch": 2.95, "grad_norm": 0.9023645520210266, "learning_rate": 5.555555555555555e-07, "loss": 0.6468, "step": 2950},
    {"epoch": 2.975, "grad_norm": 1.4191973209381104, "learning_rate": 2.7777777777777776e-07, "loss": 0.6627, "step": 2975},
    {"epoch": 3.0, "grad_norm": 0.7791981101036072, "learning_rate": 0.0, "loss": 0.6685, "step": 3000},
    {"epoch": 3.0, "eval_loss": 0.6764523983001709, "eval_runtime": 38.9354, "eval_samples_per_second": 51.367, "eval_steps_per_second": 3.21, "step": 3000}
  ],
  "logging_steps": 25,
  "max_steps": 3000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 5,
        "early_stopping_threshold": 0.01
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}
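
This is the trainer_state.json that the Hugging Face Trainer writes into each checkpoint directory (here autotrain-l21an-6mkt7/checkpoint-3000). A minimal sketch of how one might read it back and summarize the run using only the standard library; STATE_PATH below is an assumption, so point it at wherever the checkpoint actually lives:

import json

# Assumed location of the state file shown above -- adjust as needed.
STATE_PATH = "autotrain-l21an-6mkt7/checkpoint-3000/trainer_state.json"

with open(STATE_PATH) as f:
    state = json.load(f)

# log_history mixes training records (keyed by "loss") with evaluation
# records (keyed by "eval_loss"); split them before summarizing.
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"best checkpoint: {state['best_model_checkpoint']} "
      f"(metric {state['best_metric']:.6f})")
for e in eval_logs:
    print(f"epoch {e['epoch']:.1f}  step {e['step']:>4}  eval_loss {e['eval_loss']:.6f}")

Run against this state, the loop would show the eval loss essentially flat across the three epochs (0.676526, 0.676459, 0.676452): each improvement is far below the EarlyStoppingCallback threshold of 0.01, and training ran through to its max_steps of 3000.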