{ "best_metric": 0.6764523983001709, "best_model_checkpoint": "autotrain-l21an-6mkt7/checkpoint-3000", "epoch": 3.0, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025, "grad_norm": 3.444312810897827, "learning_rate": 2.4999999999999998e-06, "loss": 0.6305, "step": 25 }, { "epoch": 0.05, "grad_norm": 4.167919635772705, "learning_rate": 4.9999999999999996e-06, "loss": 0.7491, "step": 50 }, { "epoch": 0.075, "grad_norm": 2.0621402263641357, "learning_rate": 7.5e-06, "loss": 0.6931, "step": 75 }, { "epoch": 0.1, "grad_norm": 2.2826991081237793, "learning_rate": 9.999999999999999e-06, "loss": 0.7093, "step": 100 }, { "epoch": 0.125, "grad_norm": 2.8610947132110596, "learning_rate": 1.25e-05, "loss": 0.631, "step": 125 }, { "epoch": 0.15, "grad_norm": 2.2747015953063965, "learning_rate": 1.5e-05, "loss": 0.6127, "step": 150 }, { "epoch": 0.175, "grad_norm": 1.3792694807052612, "learning_rate": 1.7500000000000002e-05, "loss": 0.5963, "step": 175 }, { "epoch": 0.2, "grad_norm": 2.0398032665252686, "learning_rate": 1.9999999999999998e-05, "loss": 0.6556, "step": 200 }, { "epoch": 0.225, "grad_norm": 1.9805101156234741, "learning_rate": 2.25e-05, "loss": 0.6831, "step": 225 }, { "epoch": 0.25, "grad_norm": 1.3323885202407837, "learning_rate": 2.5e-05, "loss": 0.6389, "step": 250 }, { "epoch": 0.275, "grad_norm": 1.6555352210998535, "learning_rate": 2.75e-05, "loss": 0.6913, "step": 275 }, { "epoch": 0.3, "grad_norm": 1.4770078659057617, "learning_rate": 3e-05, "loss": 0.6447, "step": 300 }, { "epoch": 0.325, "grad_norm": 1.9949244260787964, "learning_rate": 2.9722222222222223e-05, "loss": 0.6993, "step": 325 }, { "epoch": 0.35, "grad_norm": 1.2942368984222412, "learning_rate": 2.9444444444444445e-05, "loss": 0.6581, "step": 350 }, { "epoch": 0.375, "grad_norm": 1.8725063800811768, "learning_rate": 2.9166666666666666e-05, "loss": 0.6695, "step": 375 }, { "epoch": 0.4, "grad_norm": 1.4848814010620117, "learning_rate": 2.8888888888888888e-05, "loss": 0.7076, "step": 400 }, { "epoch": 0.425, "grad_norm": 1.406736969947815, "learning_rate": 2.8611111111111113e-05, "loss": 0.6301, "step": 425 }, { "epoch": 0.45, "grad_norm": 1.2826756238937378, "learning_rate": 2.8333333333333332e-05, "loss": 0.6121, "step": 450 }, { "epoch": 0.475, "grad_norm": 1.0705897808074951, "learning_rate": 2.8055555555555557e-05, "loss": 0.6439, "step": 475 }, { "epoch": 0.5, "grad_norm": 1.7978061437606812, "learning_rate": 2.777777777777778e-05, "loss": 0.6782, "step": 500 }, { "epoch": 0.525, "grad_norm": 1.2017405033111572, "learning_rate": 2.75e-05, "loss": 0.7025, "step": 525 }, { "epoch": 0.55, "grad_norm": 1.4544589519500732, "learning_rate": 2.7222222222222223e-05, "loss": 0.7228, "step": 550 }, { "epoch": 0.575, "grad_norm": 2.094083070755005, "learning_rate": 2.6944444444444445e-05, "loss": 0.6065, "step": 575 }, { "epoch": 0.6, "grad_norm": 1.4134550094604492, "learning_rate": 2.6666666666666667e-05, "loss": 0.6496, "step": 600 }, { "epoch": 0.625, "grad_norm": 1.1640647649765015, "learning_rate": 2.6388888888888892e-05, "loss": 0.6816, "step": 625 }, { "epoch": 0.65, "grad_norm": 1.7982358932495117, "learning_rate": 2.611111111111111e-05, "loss": 0.6302, "step": 650 }, { "epoch": 0.675, "grad_norm": 1.6336771249771118, "learning_rate": 2.5833333333333336e-05, "loss": 0.692, "step": 675 }, { "epoch": 0.7, "grad_norm": 1.2203083038330078, "learning_rate": 2.5555555555555557e-05, "loss": 0.7533, "step": 700 }, { "epoch": 0.725, "grad_norm": 1.4167596101760864, "learning_rate": 2.5277777777777776e-05, "loss": 0.6567, "step": 725 }, { "epoch": 0.75, "grad_norm": 2.341327667236328, "learning_rate": 2.5e-05, "loss": 0.6472, "step": 750 }, { "epoch": 0.775, "grad_norm": 1.1488889455795288, "learning_rate": 2.4722222222222223e-05, "loss": 0.6461, "step": 775 }, { "epoch": 0.8, "grad_norm": 1.6008880138397217, "learning_rate": 2.4444444444444445e-05, "loss": 0.661, "step": 800 }, { "epoch": 0.825, "grad_norm": 1.6397242546081543, "learning_rate": 2.4166666666666667e-05, "loss": 0.6897, "step": 825 }, { "epoch": 0.85, "grad_norm": 1.567724347114563, "learning_rate": 2.388888888888889e-05, "loss": 0.6097, "step": 850 }, { "epoch": 0.875, "grad_norm": 1.290542483329773, "learning_rate": 2.3611111111111114e-05, "loss": 0.6284, "step": 875 }, { "epoch": 0.9, "grad_norm": 1.4457989931106567, "learning_rate": 2.3333333333333336e-05, "loss": 0.5923, "step": 900 }, { "epoch": 0.925, "grad_norm": 1.1782793998718262, "learning_rate": 2.3055555555555554e-05, "loss": 0.6642, "step": 925 }, { "epoch": 0.95, "grad_norm": 1.2478561401367188, "learning_rate": 2.277777777777778e-05, "loss": 0.6531, "step": 950 }, { "epoch": 0.975, "grad_norm": 1.3522217273712158, "learning_rate": 2.25e-05, "loss": 0.6705, "step": 975 }, { "epoch": 1.0, "grad_norm": 1.4155220985412598, "learning_rate": 2.222222222222222e-05, "loss": 0.7137, "step": 1000 }, { "epoch": 1.0, "eval_loss": 0.6765261292457581, "eval_runtime": 37.4092, "eval_samples_per_second": 53.463, "eval_steps_per_second": 3.341, "step": 1000 }, { "epoch": 1.025, "grad_norm": 1.0540518760681152, "learning_rate": 2.1944444444444445e-05, "loss": 0.6601, "step": 1025 }, { "epoch": 1.05, "grad_norm": 1.219468116760254, "learning_rate": 2.1666666666666667e-05, "loss": 0.6739, "step": 1050 }, { "epoch": 1.075, "grad_norm": 1.1928082704544067, "learning_rate": 2.138888888888889e-05, "loss": 0.6487, "step": 1075 }, { "epoch": 1.1, "grad_norm": 1.0191409587860107, "learning_rate": 2.111111111111111e-05, "loss": 0.6864, "step": 1100 }, { "epoch": 1.125, "grad_norm": 1.0731534957885742, "learning_rate": 2.0833333333333333e-05, "loss": 0.7744, "step": 1125 }, { "epoch": 1.15, "grad_norm": 1.1843361854553223, "learning_rate": 2.0555555555555558e-05, "loss": 0.6698, "step": 1150 }, { "epoch": 1.175, "grad_norm": 1.1600492000579834, "learning_rate": 2.027777777777778e-05, "loss": 0.6421, "step": 1175 }, { "epoch": 1.2, "grad_norm": 1.3744503259658813, "learning_rate": 1.9999999999999998e-05, "loss": 0.633, "step": 1200 }, { "epoch": 1.225, "grad_norm": 0.8186588287353516, "learning_rate": 1.9722222222222224e-05, "loss": 0.678, "step": 1225 }, { "epoch": 1.25, "grad_norm": 1.2602007389068604, "learning_rate": 1.9444444444444445e-05, "loss": 0.6264, "step": 1250 }, { "epoch": 1.275, "grad_norm": 1.5430500507354736, "learning_rate": 1.9166666666666667e-05, "loss": 0.721, "step": 1275 }, { "epoch": 1.3, "grad_norm": 1.6438603401184082, "learning_rate": 1.888888888888889e-05, "loss": 0.6736, "step": 1300 }, { "epoch": 1.325, "grad_norm": 1.1491776704788208, "learning_rate": 1.861111111111111e-05, "loss": 0.5332, "step": 1325 }, { "epoch": 1.35, "grad_norm": 1.087183952331543, "learning_rate": 1.8333333333333336e-05, "loss": 0.6576, "step": 1350 }, { "epoch": 1.375, "grad_norm": 1.6351675987243652, "learning_rate": 1.8055555555555555e-05, "loss": 0.6625, "step": 1375 }, { "epoch": 1.4, "grad_norm": 1.5467578172683716, "learning_rate": 1.7777777777777777e-05, "loss": 0.7248, "step": 1400 }, { "epoch": 1.425, "grad_norm": 1.1913565397262573, "learning_rate": 1.7500000000000002e-05, "loss": 0.6188, "step": 1425 }, { "epoch": 1.45, "grad_norm": 1.1346111297607422, "learning_rate": 1.7222222222222224e-05, "loss": 0.6452, "step": 1450 }, { "epoch": 1.475, "grad_norm": 1.3555978536605835, "learning_rate": 1.6944444444444442e-05, "loss": 0.7024, "step": 1475 }, { "epoch": 1.5, "grad_norm": 0.8716872930526733, "learning_rate": 1.6666666666666667e-05, "loss": 0.7005, "step": 1500 }, { "epoch": 1.525, "grad_norm": 0.9957846999168396, "learning_rate": 1.638888888888889e-05, "loss": 0.6219, "step": 1525 }, { "epoch": 1.55, "grad_norm": 2.1997838020324707, "learning_rate": 1.6111111111111115e-05, "loss": 0.6525, "step": 1550 }, { "epoch": 1.575, "grad_norm": 1.3860341310501099, "learning_rate": 1.5833333333333333e-05, "loss": 0.6718, "step": 1575 }, { "epoch": 1.6, "grad_norm": 0.8452956676483154, "learning_rate": 1.5555555555555555e-05, "loss": 0.6738, "step": 1600 }, { "epoch": 1.625, "grad_norm": 0.9731984734535217, "learning_rate": 1.527777777777778e-05, "loss": 0.6558, "step": 1625 }, { "epoch": 1.65, "grad_norm": 1.831750750541687, "learning_rate": 1.5e-05, "loss": 0.6236, "step": 1650 }, { "epoch": 1.675, "grad_norm": 1.6755101680755615, "learning_rate": 1.4722222222222222e-05, "loss": 0.7126, "step": 1675 }, { "epoch": 1.7, "grad_norm": 1.3757505416870117, "learning_rate": 1.4444444444444444e-05, "loss": 0.6822, "step": 1700 }, { "epoch": 1.725, "grad_norm": 1.377435326576233, "learning_rate": 1.4166666666666666e-05, "loss": 0.6324, "step": 1725 }, { "epoch": 1.75, "grad_norm": 1.0001251697540283, "learning_rate": 1.388888888888889e-05, "loss": 0.7036, "step": 1750 }, { "epoch": 1.775, "grad_norm": 1.0013527870178223, "learning_rate": 1.3611111111111111e-05, "loss": 0.6765, "step": 1775 }, { "epoch": 1.8, "grad_norm": 1.0746055841445923, "learning_rate": 1.3333333333333333e-05, "loss": 0.654, "step": 1800 }, { "epoch": 1.825, "grad_norm": 0.6743106842041016, "learning_rate": 1.3055555555555555e-05, "loss": 0.6923, "step": 1825 }, { "epoch": 1.85, "grad_norm": 0.9659077525138855, "learning_rate": 1.2777777777777779e-05, "loss": 0.6976, "step": 1850 }, { "epoch": 1.875, "grad_norm": 0.7309139966964722, "learning_rate": 1.25e-05, "loss": 0.6904, "step": 1875 }, { "epoch": 1.9, "grad_norm": 0.6239315271377563, "learning_rate": 1.2222222222222222e-05, "loss": 0.6307, "step": 1900 }, { "epoch": 1.925, "grad_norm": 1.3958375453948975, "learning_rate": 1.1944444444444444e-05, "loss": 0.6437, "step": 1925 }, { "epoch": 1.95, "grad_norm": 0.7617831230163574, "learning_rate": 1.1666666666666668e-05, "loss": 0.6333, "step": 1950 }, { "epoch": 1.975, "grad_norm": 0.8432300686836243, "learning_rate": 1.138888888888889e-05, "loss": 0.6214, "step": 1975 }, { "epoch": 2.0, "grad_norm": 1.236924409866333, "learning_rate": 1.111111111111111e-05, "loss": 0.6424, "step": 2000 }, { "epoch": 2.0, "eval_loss": 0.6764588952064514, "eval_runtime": 38.544, "eval_samples_per_second": 51.889, "eval_steps_per_second": 3.243, "step": 2000 }, { "epoch": 2.025, "grad_norm": 1.2284152507781982, "learning_rate": 1.0833333333333334e-05, "loss": 0.7041, "step": 2025 }, { "epoch": 2.05, "grad_norm": 0.9430116415023804, "learning_rate": 1.0555555555555555e-05, "loss": 0.7112, "step": 2050 }, { "epoch": 2.075, "grad_norm": 0.5471211075782776, "learning_rate": 1.0277777777777779e-05, "loss": 0.6696, "step": 2075 }, { "epoch": 2.1, "grad_norm": 0.9567949771881104, "learning_rate": 9.999999999999999e-06, "loss": 0.6739, "step": 2100 }, { "epoch": 2.125, "grad_norm": 0.633762001991272, "learning_rate": 9.722222222222223e-06, "loss": 0.6315, "step": 2125 }, { "epoch": 2.15, "grad_norm": 1.0539363622665405, "learning_rate": 9.444444444444445e-06, "loss": 0.7649, "step": 2150 }, { "epoch": 2.175, "grad_norm": 0.9735732078552246, "learning_rate": 9.166666666666668e-06, "loss": 0.7079, "step": 2175 }, { "epoch": 2.2, "grad_norm": 1.3620977401733398, "learning_rate": 8.888888888888888e-06, "loss": 0.6549, "step": 2200 }, { "epoch": 2.225, "grad_norm": 0.9268941879272461, "learning_rate": 8.611111111111112e-06, "loss": 0.6548, "step": 2225 }, { "epoch": 2.25, "grad_norm": 0.4519413709640503, "learning_rate": 8.333333333333334e-06, "loss": 0.6647, "step": 2250 }, { "epoch": 2.275, "grad_norm": 0.8613296747207642, "learning_rate": 8.055555555555557e-06, "loss": 0.7568, "step": 2275 }, { "epoch": 2.3, "grad_norm": 1.1156052350997925, "learning_rate": 7.777777777777777e-06, "loss": 0.6659, "step": 2300 }, { "epoch": 2.325, "grad_norm": 0.8070225119590759, "learning_rate": 7.5e-06, "loss": 0.604, "step": 2325 }, { "epoch": 2.35, "grad_norm": 1.2144221067428589, "learning_rate": 7.222222222222222e-06, "loss": 0.6342, "step": 2350 }, { "epoch": 2.375, "grad_norm": 0.8116927742958069, "learning_rate": 6.944444444444445e-06, "loss": 0.6891, "step": 2375 }, { "epoch": 2.4, "grad_norm": 0.8394978642463684, "learning_rate": 6.666666666666667e-06, "loss": 0.6856, "step": 2400 }, { "epoch": 2.425, "grad_norm": 1.1582024097442627, "learning_rate": 6.388888888888889e-06, "loss": 0.6683, "step": 2425 }, { "epoch": 2.45, "grad_norm": 0.9621151089668274, "learning_rate": 6.111111111111111e-06, "loss": 0.678, "step": 2450 }, { "epoch": 2.475, "grad_norm": 1.0509181022644043, "learning_rate": 5.833333333333334e-06, "loss": 0.7102, "step": 2475 }, { "epoch": 2.5, "grad_norm": 0.7675669193267822, "learning_rate": 5.555555555555555e-06, "loss": 0.6606, "step": 2500 }, { "epoch": 2.525, "grad_norm": 0.9356604218482971, "learning_rate": 5.277777777777778e-06, "loss": 0.6634, "step": 2525 }, { "epoch": 2.55, "grad_norm": 1.1379098892211914, "learning_rate": 4.9999999999999996e-06, "loss": 0.6443, "step": 2550 }, { "epoch": 2.575, "grad_norm": 1.0013926029205322, "learning_rate": 4.722222222222222e-06, "loss": 0.6122, "step": 2575 }, { "epoch": 2.6, "grad_norm": 0.771693229675293, "learning_rate": 4.444444444444444e-06, "loss": 0.6926, "step": 2600 }, { "epoch": 2.625, "grad_norm": 0.7376611232757568, "learning_rate": 4.166666666666667e-06, "loss": 0.5957, "step": 2625 }, { "epoch": 2.65, "grad_norm": 0.7340726256370544, "learning_rate": 3.888888888888889e-06, "loss": 0.6933, "step": 2650 }, { "epoch": 2.675, "grad_norm": 0.7760947942733765, "learning_rate": 3.611111111111111e-06, "loss": 0.691, "step": 2675 }, { "epoch": 2.7, "grad_norm": 0.9809922575950623, "learning_rate": 3.3333333333333333e-06, "loss": 0.7015, "step": 2700 }, { "epoch": 2.725, "grad_norm": 0.9861670732498169, "learning_rate": 3.0555555555555556e-06, "loss": 0.7057, "step": 2725 }, { "epoch": 2.75, "grad_norm": 0.8055828809738159, "learning_rate": 2.7777777777777775e-06, "loss": 0.6386, "step": 2750 }, { "epoch": 2.775, "grad_norm": 1.0951838493347168, "learning_rate": 2.4999999999999998e-06, "loss": 0.6868, "step": 2775 }, { "epoch": 2.8, "grad_norm": 1.086242437362671, "learning_rate": 2.222222222222222e-06, "loss": 0.6992, "step": 2800 }, { "epoch": 2.825, "grad_norm": 0.6613348126411438, "learning_rate": 1.9444444444444444e-06, "loss": 0.6338, "step": 2825 }, { "epoch": 2.85, "grad_norm": 0.944501519203186, "learning_rate": 1.6666666666666667e-06, "loss": 0.6175, "step": 2850 }, { "epoch": 2.875, "grad_norm": 0.5407629609107971, "learning_rate": 1.3888888888888887e-06, "loss": 0.6125, "step": 2875 }, { "epoch": 2.9, "grad_norm": 1.0618243217468262, "learning_rate": 1.111111111111111e-06, "loss": 0.6675, "step": 2900 }, { "epoch": 2.925, "grad_norm": 0.6185476183891296, "learning_rate": 8.333333333333333e-07, "loss": 0.6369, "step": 2925 }, { "epoch": 2.95, "grad_norm": 0.9023645520210266, "learning_rate": 5.555555555555555e-07, "loss": 0.6468, "step": 2950 }, { "epoch": 2.975, "grad_norm": 1.4191973209381104, "learning_rate": 2.7777777777777776e-07, "loss": 0.6627, "step": 2975 }, { "epoch": 3.0, "grad_norm": 0.7791981101036072, "learning_rate": 0.0, "loss": 0.6685, "step": 3000 }, { "epoch": 3.0, "eval_loss": 0.6764523983001709, "eval_runtime": 38.9354, "eval_samples_per_second": 51.367, "eval_steps_per_second": 3.21, "step": 3000 } ], "logging_steps": 25, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.01 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }