{
"best_metric": 0.6764523983001709,
"best_model_checkpoint": "autotrain-l21an-6mkt7/checkpoint-3000",
"epoch": 3.0,
"eval_steps": 500,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.025,
"grad_norm": 3.444312810897827,
"learning_rate": 2.4999999999999998e-06,
"loss": 0.6305,
"step": 25
},
{
"epoch": 0.05,
"grad_norm": 4.167919635772705,
"learning_rate": 4.9999999999999996e-06,
"loss": 0.7491,
"step": 50
},
{
"epoch": 0.075,
"grad_norm": 2.0621402263641357,
"learning_rate": 7.5e-06,
"loss": 0.6931,
"step": 75
},
{
"epoch": 0.1,
"grad_norm": 2.2826991081237793,
"learning_rate": 9.999999999999999e-06,
"loss": 0.7093,
"step": 100
},
{
"epoch": 0.125,
"grad_norm": 2.8610947132110596,
"learning_rate": 1.25e-05,
"loss": 0.631,
"step": 125
},
{
"epoch": 0.15,
"grad_norm": 2.2747015953063965,
"learning_rate": 1.5e-05,
"loss": 0.6127,
"step": 150
},
{
"epoch": 0.175,
"grad_norm": 1.3792694807052612,
"learning_rate": 1.7500000000000002e-05,
"loss": 0.5963,
"step": 175
},
{
"epoch": 0.2,
"grad_norm": 2.0398032665252686,
"learning_rate": 1.9999999999999998e-05,
"loss": 0.6556,
"step": 200
},
{
"epoch": 0.225,
"grad_norm": 1.9805101156234741,
"learning_rate": 2.25e-05,
"loss": 0.6831,
"step": 225
},
{
"epoch": 0.25,
"grad_norm": 1.3323885202407837,
"learning_rate": 2.5e-05,
"loss": 0.6389,
"step": 250
},
{
"epoch": 0.275,
"grad_norm": 1.6555352210998535,
"learning_rate": 2.75e-05,
"loss": 0.6913,
"step": 275
},
{
"epoch": 0.3,
"grad_norm": 1.4770078659057617,
"learning_rate": 3e-05,
"loss": 0.6447,
"step": 300
},
{
"epoch": 0.325,
"grad_norm": 1.9949244260787964,
"learning_rate": 2.9722222222222223e-05,
"loss": 0.6993,
"step": 325
},
{
"epoch": 0.35,
"grad_norm": 1.2942368984222412,
"learning_rate": 2.9444444444444445e-05,
"loss": 0.6581,
"step": 350
},
{
"epoch": 0.375,
"grad_norm": 1.8725063800811768,
"learning_rate": 2.9166666666666666e-05,
"loss": 0.6695,
"step": 375
},
{
"epoch": 0.4,
"grad_norm": 1.4848814010620117,
"learning_rate": 2.8888888888888888e-05,
"loss": 0.7076,
"step": 400
},
{
"epoch": 0.425,
"grad_norm": 1.406736969947815,
"learning_rate": 2.8611111111111113e-05,
"loss": 0.6301,
"step": 425
},
{
"epoch": 0.45,
"grad_norm": 1.2826756238937378,
"learning_rate": 2.8333333333333332e-05,
"loss": 0.6121,
"step": 450
},
{
"epoch": 0.475,
"grad_norm": 1.0705897808074951,
"learning_rate": 2.8055555555555557e-05,
"loss": 0.6439,
"step": 475
},
{
"epoch": 0.5,
"grad_norm": 1.7978061437606812,
"learning_rate": 2.777777777777778e-05,
"loss": 0.6782,
"step": 500
},
{
"epoch": 0.525,
"grad_norm": 1.2017405033111572,
"learning_rate": 2.75e-05,
"loss": 0.7025,
"step": 525
},
{
"epoch": 0.55,
"grad_norm": 1.4544589519500732,
"learning_rate": 2.7222222222222223e-05,
"loss": 0.7228,
"step": 550
},
{
"epoch": 0.575,
"grad_norm": 2.094083070755005,
"learning_rate": 2.6944444444444445e-05,
"loss": 0.6065,
"step": 575
},
{
"epoch": 0.6,
"grad_norm": 1.4134550094604492,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.6496,
"step": 600
},
{
"epoch": 0.625,
"grad_norm": 1.1640647649765015,
"learning_rate": 2.6388888888888892e-05,
"loss": 0.6816,
"step": 625
},
{
"epoch": 0.65,
"grad_norm": 1.7982358932495117,
"learning_rate": 2.611111111111111e-05,
"loss": 0.6302,
"step": 650
},
{
"epoch": 0.675,
"grad_norm": 1.6336771249771118,
"learning_rate": 2.5833333333333336e-05,
"loss": 0.692,
"step": 675
},
{
"epoch": 0.7,
"grad_norm": 1.2203083038330078,
"learning_rate": 2.5555555555555557e-05,
"loss": 0.7533,
"step": 700
},
{
"epoch": 0.725,
"grad_norm": 1.4167596101760864,
"learning_rate": 2.5277777777777776e-05,
"loss": 0.6567,
"step": 725
},
{
"epoch": 0.75,
"grad_norm": 2.341327667236328,
"learning_rate": 2.5e-05,
"loss": 0.6472,
"step": 750
},
{
"epoch": 0.775,
"grad_norm": 1.1488889455795288,
"learning_rate": 2.4722222222222223e-05,
"loss": 0.6461,
"step": 775
},
{
"epoch": 0.8,
"grad_norm": 1.6008880138397217,
"learning_rate": 2.4444444444444445e-05,
"loss": 0.661,
"step": 800
},
{
"epoch": 0.825,
"grad_norm": 1.6397242546081543,
"learning_rate": 2.4166666666666667e-05,
"loss": 0.6897,
"step": 825
},
{
"epoch": 0.85,
"grad_norm": 1.567724347114563,
"learning_rate": 2.388888888888889e-05,
"loss": 0.6097,
"step": 850
},
{
"epoch": 0.875,
"grad_norm": 1.290542483329773,
"learning_rate": 2.3611111111111114e-05,
"loss": 0.6284,
"step": 875
},
{
"epoch": 0.9,
"grad_norm": 1.4457989931106567,
"learning_rate": 2.3333333333333336e-05,
"loss": 0.5923,
"step": 900
},
{
"epoch": 0.925,
"grad_norm": 1.1782793998718262,
"learning_rate": 2.3055555555555554e-05,
"loss": 0.6642,
"step": 925
},
{
"epoch": 0.95,
"grad_norm": 1.2478561401367188,
"learning_rate": 2.277777777777778e-05,
"loss": 0.6531,
"step": 950
},
{
"epoch": 0.975,
"grad_norm": 1.3522217273712158,
"learning_rate": 2.25e-05,
"loss": 0.6705,
"step": 975
},
{
"epoch": 1.0,
"grad_norm": 1.4155220985412598,
"learning_rate": 2.222222222222222e-05,
"loss": 0.7137,
"step": 1000
},
{
"epoch": 1.0,
"eval_loss": 0.6765261292457581,
"eval_runtime": 37.4092,
"eval_samples_per_second": 53.463,
"eval_steps_per_second": 3.341,
"step": 1000
},
{
"epoch": 1.025,
"grad_norm": 1.0540518760681152,
"learning_rate": 2.1944444444444445e-05,
"loss": 0.6601,
"step": 1025
},
{
"epoch": 1.05,
"grad_norm": 1.219468116760254,
"learning_rate": 2.1666666666666667e-05,
"loss": 0.6739,
"step": 1050
},
{
"epoch": 1.075,
"grad_norm": 1.1928082704544067,
"learning_rate": 2.138888888888889e-05,
"loss": 0.6487,
"step": 1075
},
{
"epoch": 1.1,
"grad_norm": 1.0191409587860107,
"learning_rate": 2.111111111111111e-05,
"loss": 0.6864,
"step": 1100
},
{
"epoch": 1.125,
"grad_norm": 1.0731534957885742,
"learning_rate": 2.0833333333333333e-05,
"loss": 0.7744,
"step": 1125
},
{
"epoch": 1.15,
"grad_norm": 1.1843361854553223,
"learning_rate": 2.0555555555555558e-05,
"loss": 0.6698,
"step": 1150
},
{
"epoch": 1.175,
"grad_norm": 1.1600492000579834,
"learning_rate": 2.027777777777778e-05,
"loss": 0.6421,
"step": 1175
},
{
"epoch": 1.2,
"grad_norm": 1.3744503259658813,
"learning_rate": 1.9999999999999998e-05,
"loss": 0.633,
"step": 1200
},
{
"epoch": 1.225,
"grad_norm": 0.8186588287353516,
"learning_rate": 1.9722222222222224e-05,
"loss": 0.678,
"step": 1225
},
{
"epoch": 1.25,
"grad_norm": 1.2602007389068604,
"learning_rate": 1.9444444444444445e-05,
"loss": 0.6264,
"step": 1250
},
{
"epoch": 1.275,
"grad_norm": 1.5430500507354736,
"learning_rate": 1.9166666666666667e-05,
"loss": 0.721,
"step": 1275
},
{
"epoch": 1.3,
"grad_norm": 1.6438603401184082,
"learning_rate": 1.888888888888889e-05,
"loss": 0.6736,
"step": 1300
},
{
"epoch": 1.325,
"grad_norm": 1.1491776704788208,
"learning_rate": 1.861111111111111e-05,
"loss": 0.5332,
"step": 1325
},
{
"epoch": 1.35,
"grad_norm": 1.087183952331543,
"learning_rate": 1.8333333333333336e-05,
"loss": 0.6576,
"step": 1350
},
{
"epoch": 1.375,
"grad_norm": 1.6351675987243652,
"learning_rate": 1.8055555555555555e-05,
"loss": 0.6625,
"step": 1375
},
{
"epoch": 1.4,
"grad_norm": 1.5467578172683716,
"learning_rate": 1.7777777777777777e-05,
"loss": 0.7248,
"step": 1400
},
{
"epoch": 1.425,
"grad_norm": 1.1913565397262573,
"learning_rate": 1.7500000000000002e-05,
"loss": 0.6188,
"step": 1425
},
{
"epoch": 1.45,
"grad_norm": 1.1346111297607422,
"learning_rate": 1.7222222222222224e-05,
"loss": 0.6452,
"step": 1450
},
{
"epoch": 1.475,
"grad_norm": 1.3555978536605835,
"learning_rate": 1.6944444444444442e-05,
"loss": 0.7024,
"step": 1475
},
{
"epoch": 1.5,
"grad_norm": 0.8716872930526733,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.7005,
"step": 1500
},
{
"epoch": 1.525,
"grad_norm": 0.9957846999168396,
"learning_rate": 1.638888888888889e-05,
"loss": 0.6219,
"step": 1525
},
{
"epoch": 1.55,
"grad_norm": 2.1997838020324707,
"learning_rate": 1.6111111111111115e-05,
"loss": 0.6525,
"step": 1550
},
{
"epoch": 1.575,
"grad_norm": 1.3860341310501099,
"learning_rate": 1.5833333333333333e-05,
"loss": 0.6718,
"step": 1575
},
{
"epoch": 1.6,
"grad_norm": 0.8452956676483154,
"learning_rate": 1.5555555555555555e-05,
"loss": 0.6738,
"step": 1600
},
{
"epoch": 1.625,
"grad_norm": 0.9731984734535217,
"learning_rate": 1.527777777777778e-05,
"loss": 0.6558,
"step": 1625
},
{
"epoch": 1.65,
"grad_norm": 1.831750750541687,
"learning_rate": 1.5e-05,
"loss": 0.6236,
"step": 1650
},
{
"epoch": 1.675,
"grad_norm": 1.6755101680755615,
"learning_rate": 1.4722222222222222e-05,
"loss": 0.7126,
"step": 1675
},
{
"epoch": 1.7,
"grad_norm": 1.3757505416870117,
"learning_rate": 1.4444444444444444e-05,
"loss": 0.6822,
"step": 1700
},
{
"epoch": 1.725,
"grad_norm": 1.377435326576233,
"learning_rate": 1.4166666666666666e-05,
"loss": 0.6324,
"step": 1725
},
{
"epoch": 1.75,
"grad_norm": 1.0001251697540283,
"learning_rate": 1.388888888888889e-05,
"loss": 0.7036,
"step": 1750
},
{
"epoch": 1.775,
"grad_norm": 1.0013527870178223,
"learning_rate": 1.3611111111111111e-05,
"loss": 0.6765,
"step": 1775
},
{
"epoch": 1.8,
"grad_norm": 1.0746055841445923,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.654,
"step": 1800
},
{
"epoch": 1.825,
"grad_norm": 0.6743106842041016,
"learning_rate": 1.3055555555555555e-05,
"loss": 0.6923,
"step": 1825
},
{
"epoch": 1.85,
"grad_norm": 0.9659077525138855,
"learning_rate": 1.2777777777777779e-05,
"loss": 0.6976,
"step": 1850
},
{
"epoch": 1.875,
"grad_norm": 0.7309139966964722,
"learning_rate": 1.25e-05,
"loss": 0.6904,
"step": 1875
},
{
"epoch": 1.9,
"grad_norm": 0.6239315271377563,
"learning_rate": 1.2222222222222222e-05,
"loss": 0.6307,
"step": 1900
},
{
"epoch": 1.925,
"grad_norm": 1.3958375453948975,
"learning_rate": 1.1944444444444444e-05,
"loss": 0.6437,
"step": 1925
},
{
"epoch": 1.95,
"grad_norm": 0.7617831230163574,
"learning_rate": 1.1666666666666668e-05,
"loss": 0.6333,
"step": 1950
},
{
"epoch": 1.975,
"grad_norm": 0.8432300686836243,
"learning_rate": 1.138888888888889e-05,
"loss": 0.6214,
"step": 1975
},
{
"epoch": 2.0,
"grad_norm": 1.236924409866333,
"learning_rate": 1.111111111111111e-05,
"loss": 0.6424,
"step": 2000
},
{
"epoch": 2.0,
"eval_loss": 0.6764588952064514,
"eval_runtime": 38.544,
"eval_samples_per_second": 51.889,
"eval_steps_per_second": 3.243,
"step": 2000
},
{
"epoch": 2.025,
"grad_norm": 1.2284152507781982,
"learning_rate": 1.0833333333333334e-05,
"loss": 0.7041,
"step": 2025
},
{
"epoch": 2.05,
"grad_norm": 0.9430116415023804,
"learning_rate": 1.0555555555555555e-05,
"loss": 0.7112,
"step": 2050
},
{
"epoch": 2.075,
"grad_norm": 0.5471211075782776,
"learning_rate": 1.0277777777777779e-05,
"loss": 0.6696,
"step": 2075
},
{
"epoch": 2.1,
"grad_norm": 0.9567949771881104,
"learning_rate": 9.999999999999999e-06,
"loss": 0.6739,
"step": 2100
},
{
"epoch": 2.125,
"grad_norm": 0.633762001991272,
"learning_rate": 9.722222222222223e-06,
"loss": 0.6315,
"step": 2125
},
{
"epoch": 2.15,
"grad_norm": 1.0539363622665405,
"learning_rate": 9.444444444444445e-06,
"loss": 0.7649,
"step": 2150
},
{
"epoch": 2.175,
"grad_norm": 0.9735732078552246,
"learning_rate": 9.166666666666668e-06,
"loss": 0.7079,
"step": 2175
},
{
"epoch": 2.2,
"grad_norm": 1.3620977401733398,
"learning_rate": 8.888888888888888e-06,
"loss": 0.6549,
"step": 2200
},
{
"epoch": 2.225,
"grad_norm": 0.9268941879272461,
"learning_rate": 8.611111111111112e-06,
"loss": 0.6548,
"step": 2225
},
{
"epoch": 2.25,
"grad_norm": 0.4519413709640503,
"learning_rate": 8.333333333333334e-06,
"loss": 0.6647,
"step": 2250
},
{
"epoch": 2.275,
"grad_norm": 0.8613296747207642,
"learning_rate": 8.055555555555557e-06,
"loss": 0.7568,
"step": 2275
},
{
"epoch": 2.3,
"grad_norm": 1.1156052350997925,
"learning_rate": 7.777777777777777e-06,
"loss": 0.6659,
"step": 2300
},
{
"epoch": 2.325,
"grad_norm": 0.8070225119590759,
"learning_rate": 7.5e-06,
"loss": 0.604,
"step": 2325
},
{
"epoch": 2.35,
"grad_norm": 1.2144221067428589,
"learning_rate": 7.222222222222222e-06,
"loss": 0.6342,
"step": 2350
},
{
"epoch": 2.375,
"grad_norm": 0.8116927742958069,
"learning_rate": 6.944444444444445e-06,
"loss": 0.6891,
"step": 2375
},
{
"epoch": 2.4,
"grad_norm": 0.8394978642463684,
"learning_rate": 6.666666666666667e-06,
"loss": 0.6856,
"step": 2400
},
{
"epoch": 2.425,
"grad_norm": 1.1582024097442627,
"learning_rate": 6.388888888888889e-06,
"loss": 0.6683,
"step": 2425
},
{
"epoch": 2.45,
"grad_norm": 0.9621151089668274,
"learning_rate": 6.111111111111111e-06,
"loss": 0.678,
"step": 2450
},
{
"epoch": 2.475,
"grad_norm": 1.0509181022644043,
"learning_rate": 5.833333333333334e-06,
"loss": 0.7102,
"step": 2475
},
{
"epoch": 2.5,
"grad_norm": 0.7675669193267822,
"learning_rate": 5.555555555555555e-06,
"loss": 0.6606,
"step": 2500
},
{
"epoch": 2.525,
"grad_norm": 0.9356604218482971,
"learning_rate": 5.277777777777778e-06,
"loss": 0.6634,
"step": 2525
},
{
"epoch": 2.55,
"grad_norm": 1.1379098892211914,
"learning_rate": 4.9999999999999996e-06,
"loss": 0.6443,
"step": 2550
},
{
"epoch": 2.575,
"grad_norm": 1.0013926029205322,
"learning_rate": 4.722222222222222e-06,
"loss": 0.6122,
"step": 2575
},
{
"epoch": 2.6,
"grad_norm": 0.771693229675293,
"learning_rate": 4.444444444444444e-06,
"loss": 0.6926,
"step": 2600
},
{
"epoch": 2.625,
"grad_norm": 0.7376611232757568,
"learning_rate": 4.166666666666667e-06,
"loss": 0.5957,
"step": 2625
},
{
"epoch": 2.65,
"grad_norm": 0.7340726256370544,
"learning_rate": 3.888888888888889e-06,
"loss": 0.6933,
"step": 2650
},
{
"epoch": 2.675,
"grad_norm": 0.7760947942733765,
"learning_rate": 3.611111111111111e-06,
"loss": 0.691,
"step": 2675
},
{
"epoch": 2.7,
"grad_norm": 0.9809922575950623,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.7015,
"step": 2700
},
{
"epoch": 2.725,
"grad_norm": 0.9861670732498169,
"learning_rate": 3.0555555555555556e-06,
"loss": 0.7057,
"step": 2725
},
{
"epoch": 2.75,
"grad_norm": 0.8055828809738159,
"learning_rate": 2.7777777777777775e-06,
"loss": 0.6386,
"step": 2750
},
{
"epoch": 2.775,
"grad_norm": 1.0951838493347168,
"learning_rate": 2.4999999999999998e-06,
"loss": 0.6868,
"step": 2775
},
{
"epoch": 2.8,
"grad_norm": 1.086242437362671,
"learning_rate": 2.222222222222222e-06,
"loss": 0.6992,
"step": 2800
},
{
"epoch": 2.825,
"grad_norm": 0.6613348126411438,
"learning_rate": 1.9444444444444444e-06,
"loss": 0.6338,
"step": 2825
},
{
"epoch": 2.85,
"grad_norm": 0.944501519203186,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.6175,
"step": 2850
},
{
"epoch": 2.875,
"grad_norm": 0.5407629609107971,
"learning_rate": 1.3888888888888887e-06,
"loss": 0.6125,
"step": 2875
},
{
"epoch": 2.9,
"grad_norm": 1.0618243217468262,
"learning_rate": 1.111111111111111e-06,
"loss": 0.6675,
"step": 2900
},
{
"epoch": 2.925,
"grad_norm": 0.6185476183891296,
"learning_rate": 8.333333333333333e-07,
"loss": 0.6369,
"step": 2925
},
{
"epoch": 2.95,
"grad_norm": 0.9023645520210266,
"learning_rate": 5.555555555555555e-07,
"loss": 0.6468,
"step": 2950
},
{
"epoch": 2.975,
"grad_norm": 1.4191973209381104,
"learning_rate": 2.7777777777777776e-07,
"loss": 0.6627,
"step": 2975
},
{
"epoch": 3.0,
"grad_norm": 0.7791981101036072,
"learning_rate": 0.0,
"loss": 0.6685,
"step": 3000
},
{
"epoch": 3.0,
"eval_loss": 0.6764523983001709,
"eval_runtime": 38.9354,
"eval_samples_per_second": 51.367,
"eval_steps_per_second": 3.21,
"step": 3000
}
],
"logging_steps": 25,
"max_steps": 3000,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.01
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}