llama3_qlora_finetuned / checkpoint-100 /trainer_state.json

Add fine-tuned LLaMA-3 model

484b80e 3 months ago

17.8 kB

	{
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 8.930232558139535,
	"eval_steps": 500,
	"global_step": 100,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"epoch": 0.09302325581395349,
	"grad_norm": 0.16149793565273285,
	"learning_rate": 2e-05,
	"loss": 1.1134,
	"step": 1
	},
	{
	"epoch": 0.18604651162790697,
	"grad_norm": 0.1648913472890854,
	"learning_rate": 4e-05,
	"loss": 1.1164,
	"step": 2
	},
	{
	"epoch": 0.27906976744186046,
	"grad_norm": 0.17200778424739838,
	"learning_rate": 6e-05,
	"loss": 1.1092,
	"step": 3
	},
	{
	"epoch": 0.37209302325581395,
	"grad_norm": 0.1553676575422287,
	"learning_rate": 8e-05,
	"loss": 1.0805,
	"step": 4
	},
	{
	"epoch": 0.46511627906976744,
	"grad_norm": 0.17939890921115875,
	"learning_rate": 0.0001,
	"loss": 1.0974,
	"step": 5
	},
	{
	"epoch": 0.5581395348837209,
	"grad_norm": 0.18353882431983948,
	"learning_rate": 0.00012,
	"loss": 1.084,
	"step": 6
	},
	{
	"epoch": 0.6511627906976745,
	"grad_norm": 0.10934005677700043,
	"learning_rate": 0.00014,
	"loss": 1.0589,
	"step": 7
	},
	{
	"epoch": 0.7441860465116279,
	"grad_norm": 0.14321544766426086,
	"learning_rate": 0.00016,
	"loss": 1.1106,
	"step": 8
	},
	{
	"epoch": 0.8372093023255814,
	"grad_norm": 0.14364807307720184,
	"learning_rate": 0.00018,
	"loss": 0.9995,
	"step": 9
	},
	{
	"epoch": 0.9302325581395349,
	"grad_norm": 0.11091677844524384,
	"learning_rate": 0.0002,
	"loss": 1.0161,
	"step": 10
	},
	{
	"epoch": 1.0232558139534884,
	"grad_norm": 0.13314443826675415,
	"learning_rate": 0.0001999390827019096,
	"loss": 1.0635,
	"step": 11
	},
	{
	"epoch": 1.069767441860465,
	"grad_norm": 0.15984280407428741,
	"learning_rate": 0.00019975640502598244,
	"loss": 1.0619,
	"step": 12
	},
	{
	"epoch": 1.1627906976744187,
	"grad_norm": 0.09492229670286179,
	"learning_rate": 0.00019945218953682734,
	"loss": 1.0128,
	"step": 13
	},
	{
	"epoch": 1.255813953488372,
	"grad_norm": 0.10073135793209076,
	"learning_rate": 0.00019902680687415705,
	"loss": 1.0284,
	"step": 14
	},
	{
	"epoch": 1.3488372093023255,
	"grad_norm": 0.14430958032608032,
	"learning_rate": 0.00019848077530122083,
	"loss": 0.999,
	"step": 15
	},
	{
	"epoch": 1.441860465116279,
	"grad_norm": 0.11253123730421066,
	"learning_rate": 0.00019781476007338058,
	"loss": 1.0133,
	"step": 16
	},
	{
	"epoch": 1.5348837209302326,
	"grad_norm": 0.08133890479803085,
	"learning_rate": 0.00019702957262759965,
	"loss": 1.0065,
	"step": 17
	},
	{
	"epoch": 1.627906976744186,
	"grad_norm": 0.07986672967672348,
	"learning_rate": 0.0001961261695938319,
	"loss": 1.007,
	"step": 18
	},
	{
	"epoch": 1.7209302325581395,
	"grad_norm": 0.09260742366313934,
	"learning_rate": 0.00019510565162951537,
	"loss": 0.9638,
	"step": 19
	},
	{
	"epoch": 1.8139534883720931,
	"grad_norm": 0.08938893675804138,
	"learning_rate": 0.00019396926207859084,
	"loss": 0.9518,
	"step": 20
	},
	{
	"epoch": 1.9069767441860463,
	"grad_norm": 0.10713282972574234,
	"learning_rate": 0.00019271838545667876,
	"loss": 0.9528,
	"step": 21
	},
	{
	"epoch": 2.0,
	"grad_norm": 0.08607500791549683,
	"learning_rate": 0.0001913545457642601,
	"loss": 0.9854,
	"step": 22
	},
	{
	"epoch": 2.046511627906977,
	"grad_norm": 0.093882717192173,
	"learning_rate": 0.0001898794046299167,
	"loss": 0.9862,
	"step": 23
	},
	{
	"epoch": 2.13953488372093,
	"grad_norm": 0.09035174548625946,
	"learning_rate": 0.00018829475928589271,
	"loss": 0.9672,
	"step": 24
	},
	{
	"epoch": 2.2325581395348837,
	"grad_norm": 0.0946049690246582,
	"learning_rate": 0.00018660254037844388,
	"loss": 0.9532,
	"step": 25
	},
	{
	"epoch": 2.3255813953488373,
	"grad_norm": 0.08834836632013321,
	"learning_rate": 0.0001848048096156426,
	"loss": 0.8906,
	"step": 26
	},
	{
	"epoch": 2.4186046511627906,
	"grad_norm": 0.09713950753211975,
	"learning_rate": 0.00018290375725550417,
	"loss": 0.9557,
	"step": 27
	},
	{
	"epoch": 2.511627906976744,
	"grad_norm": 0.09845568984746933,
	"learning_rate": 0.00018090169943749476,
	"loss": 0.8998,
	"step": 28
	},
	{
	"epoch": 2.604651162790698,
	"grad_norm": 0.09690100699663162,
	"learning_rate": 0.00017880107536067218,
	"loss": 0.8738,
	"step": 29
	},
	{
	"epoch": 2.697674418604651,
	"grad_norm": 0.10925247520208359,
	"learning_rate": 0.0001766044443118978,
	"loss": 0.9368,
	"step": 30
	},
	{
	"epoch": 2.7906976744186047,
	"grad_norm": 0.11018506437540054,
	"learning_rate": 0.00017431448254773944,
	"loss": 0.9053,
	"step": 31
	},
	{
	"epoch": 2.883720930232558,
	"grad_norm": 0.10770343989133835,
	"learning_rate": 0.0001719339800338651,
	"loss": 0.8706,
	"step": 32
	},
	{
	"epoch": 2.9767441860465116,
	"grad_norm": 0.1134146898984909,
	"learning_rate": 0.00016946583704589973,
	"loss": 0.9463,
	"step": 33
	},
	{
	"epoch": 3.0232558139534884,
	"grad_norm": 0.1267908215522766,
	"learning_rate": 0.00016691306063588583,
	"loss": 0.9671,
	"step": 34
	},
	{
	"epoch": 3.116279069767442,
	"grad_norm": 0.12680290639400482,
	"learning_rate": 0.00016427876096865394,
	"loss": 0.8842,
	"step": 35
	},
	{
	"epoch": 3.2093023255813953,
	"grad_norm": 0.12786774337291718,
	"learning_rate": 0.0001615661475325658,
	"loss": 0.8572,
	"step": 36
	},
	{
	"epoch": 3.302325581395349,
	"grad_norm": 0.13284103572368622,
	"learning_rate": 0.00015877852522924732,
	"loss": 0.8159,
	"step": 37
	},
	{
	"epoch": 3.395348837209302,
	"grad_norm": 0.14496278762817383,
	"learning_rate": 0.0001559192903470747,
	"loss": 0.8261,
	"step": 38
	},
	{
	"epoch": 3.488372093023256,
	"grad_norm": 0.14366915822029114,
	"learning_rate": 0.0001529919264233205,
	"loss": 0.8086,
	"step": 39
	},
	{
	"epoch": 3.5813953488372094,
	"grad_norm": 0.15425656735897064,
	"learning_rate": 0.00015000000000000001,
	"loss": 0.8219,
	"step": 40
	},
	{
	"epoch": 3.6744186046511627,
	"grad_norm": 0.15200121700763702,
	"learning_rate": 0.00014694715627858908,
	"loss": 0.7976,
	"step": 41
	},
	{
	"epoch": 3.7674418604651163,
	"grad_norm": 0.16381299495697021,
	"learning_rate": 0.00014383711467890774,
	"loss": 0.8372,
	"step": 42
	},
	{
	"epoch": 3.8604651162790695,
	"grad_norm": 0.18023422360420227,
	"learning_rate": 0.00014067366430758004,
	"loss": 0.8413,
	"step": 43
	},
	{
	"epoch": 3.953488372093023,
	"grad_norm": 0.1712740808725357,
	"learning_rate": 0.00013746065934159123,
	"loss": 0.8042,
	"step": 44
	},
	{
	"epoch": 4.046511627906977,
	"grad_norm": 0.19934940338134766,
	"learning_rate": 0.00013420201433256689,
	"loss": 0.8048,
	"step": 45
	},
	{
	"epoch": 4.093023255813954,
	"grad_norm": 0.2283613681793213,
	"learning_rate": 0.00013090169943749476,
	"loss": 0.7578,
	"step": 46
	},
	{
	"epoch": 4.186046511627907,
	"grad_norm": 0.19298645853996277,
	"learning_rate": 0.0001275637355816999,
	"loss": 0.7187,
	"step": 47
	},
	{
	"epoch": 4.27906976744186,
	"grad_norm": 0.2513829469680786,
	"learning_rate": 0.00012419218955996676,
	"loss": 0.7365,
	"step": 48
	},
	{
	"epoch": 4.372093023255814,
	"grad_norm": 0.24523547291755676,
	"learning_rate": 0.00012079116908177593,
	"loss": 0.7479,
	"step": 49
	},
	{
	"epoch": 4.465116279069767,
	"grad_norm": 0.22233036160469055,
	"learning_rate": 0.00011736481776669306,
	"loss": 0.7316,
	"step": 50
	},
	{
	"epoch": 4.558139534883721,
	"grad_norm": 0.24643278121948242,
	"learning_rate": 0.00011391731009600654,
	"loss": 0.7246,
	"step": 51
	},
	{
	"epoch": 4.651162790697675,
	"grad_norm": 0.25024113059043884,
	"learning_rate": 0.00011045284632676536,
	"loss": 0.7199,
	"step": 52
	},
	{
	"epoch": 4.7441860465116275,
	"grad_norm": 0.2576965093612671,
	"learning_rate": 0.00010697564737441252,
	"loss": 0.7089,
	"step": 53
	},
	{
	"epoch": 4.837209302325581,
	"grad_norm": 0.2569313049316406,
	"learning_rate": 0.00010348994967025012,
	"loss": 0.6886,
	"step": 54
	},
	{
	"epoch": 4.930232558139535,
	"grad_norm": 0.2936149537563324,
	"learning_rate": 0.0001,
	"loss": 0.6954,
	"step": 55
	},
	{
	"epoch": 5.023255813953488,
	"grad_norm": 0.2493412047624588,
	"learning_rate": 9.651005032974994e-05,
	"loss": 0.6572,
	"step": 56
	},
	{
	"epoch": 5.069767441860465,
	"grad_norm": 0.3881013095378876,
	"learning_rate": 9.302435262558747e-05,
	"loss": 0.703,
	"step": 57
	},
	{
	"epoch": 5.162790697674419,
	"grad_norm": 0.3377751410007477,
	"learning_rate": 8.954715367323468e-05,
	"loss": 0.5782,
	"step": 58
	},
	{
	"epoch": 5.2558139534883725,
	"grad_norm": 0.2629767656326294,
	"learning_rate": 8.608268990399349e-05,
	"loss": 0.5818,
	"step": 59
	},
	{
	"epoch": 5.348837209302325,
	"grad_norm": 0.4151897132396698,
	"learning_rate": 8.263518223330697e-05,
	"loss": 0.6153,
	"step": 60
	},
	{
	"epoch": 5.441860465116279,
	"grad_norm": 0.4599984288215637,
	"learning_rate": 7.920883091822408e-05,
	"loss": 0.5997,
	"step": 61
	},
	{
	"epoch": 5.534883720930233,
	"grad_norm": 0.3126599192619324,
	"learning_rate": 7.580781044003324e-05,
	"loss": 0.6259,
	"step": 62
	},
	{
	"epoch": 5.627906976744186,
	"grad_norm": 0.3361060619354248,
	"learning_rate": 7.243626441830009e-05,
	"loss": 0.5956,
	"step": 63
	},
	{
	"epoch": 5.720930232558139,
	"grad_norm": 0.3510780334472656,
	"learning_rate": 6.909830056250527e-05,
	"loss": 0.6112,
	"step": 64
	},
	{
	"epoch": 5.813953488372093,
	"grad_norm": 0.3278762996196747,
	"learning_rate": 6.579798566743314e-05,
	"loss": 0.5896,
	"step": 65
	},
	{
	"epoch": 5.906976744186046,
	"grad_norm": 0.30621784925460815,
	"learning_rate": 6.25393406584088e-05,
	"loss": 0.6069,
	"step": 66
	},
	{
	"epoch": 6.0,
	"grad_norm": 0.3743348717689514,
	"learning_rate": 5.9326335692419995e-05,
	"loss": 0.6006,
	"step": 67
	},
	{
	"epoch": 6.046511627906977,
	"grad_norm": 0.3283383846282959,
	"learning_rate": 5.616288532109225e-05,
	"loss": 0.5486,
	"step": 68
	},
	{
	"epoch": 6.1395348837209305,
	"grad_norm": 0.32962197065353394,
	"learning_rate": 5.305284372141095e-05,
	"loss": 0.5548,
	"step": 69
	},
	{
	"epoch": 6.232558139534884,
	"grad_norm": 0.30023908615112305,
	"learning_rate": 5.000000000000002e-05,
	"loss": 0.5145,
	"step": 70
	},
	{
	"epoch": 6.325581395348837,
	"grad_norm": 0.33415696024894714,
	"learning_rate": 4.700807357667952e-05,
	"loss": 0.5225,
	"step": 71
	},
	{
	"epoch": 6.4186046511627906,
	"grad_norm": 0.3424683213233948,
	"learning_rate": 4.4080709652925336e-05,
	"loss": 0.5188,
	"step": 72
	},
	{
	"epoch": 6.511627906976744,
	"grad_norm": 0.35422009229660034,
	"learning_rate": 4.12214747707527e-05,
	"loss": 0.5075,
	"step": 73
	},
	{
	"epoch": 6.604651162790698,
	"grad_norm": 0.3498677909374237,
	"learning_rate": 3.843385246743417e-05,
	"loss": 0.4983,
	"step": 74
	},
	{
	"epoch": 6.6976744186046515,
	"grad_norm": 0.3385615348815918,
	"learning_rate": 3.5721239031346066e-05,
	"loss": 0.5674,
	"step": 75
	},
	{
	"epoch": 6.790697674418604,
	"grad_norm": 0.3517475724220276,
	"learning_rate": 3.308693936411421e-05,
	"loss": 0.5089,
	"step": 76
	},
	{
	"epoch": 6.883720930232558,
	"grad_norm": 0.3289170563220978,
	"learning_rate": 3.053416295410026e-05,
	"loss": 0.5144,
	"step": 77
	},
	{
	"epoch": 6.976744186046512,
	"grad_norm": 0.32821524143218994,
	"learning_rate": 2.8066019966134904e-05,
	"loss": 0.4931,
	"step": 78
	},
	{
	"epoch": 7.023255813953488,
	"grad_norm": 0.39287298917770386,
	"learning_rate": 2.5685517452260567e-05,
	"loss": 0.4649,
	"step": 79
	},
	{
	"epoch": 7.116279069767442,
	"grad_norm": 0.3749699890613556,
	"learning_rate": 2.339555568810221e-05,
	"loss": 0.4878,
	"step": 80
	},
	{
	"epoch": 7.209302325581396,
	"grad_norm": 0.33682915568351746,
	"learning_rate": 2.119892463932781e-05,
	"loss": 0.4757,
	"step": 81
	},
	{
	"epoch": 7.3023255813953485,
	"grad_norm": 0.3400666117668152,
	"learning_rate": 1.9098300562505266e-05,
	"loss": 0.4723,
	"step": 82
	},
	{
	"epoch": 7.395348837209302,
	"grad_norm": 0.37987035512924194,
	"learning_rate": 1.7096242744495837e-05,
	"loss": 0.5013,
	"step": 83
	},
	{
	"epoch": 7.488372093023256,
	"grad_norm": 0.3450799584388733,
	"learning_rate": 1.5195190384357404e-05,
	"loss": 0.4439,
	"step": 84
	},
	{
	"epoch": 7.5813953488372094,
	"grad_norm": 0.3729305863380432,
	"learning_rate": 1.339745962155613e-05,
	"loss": 0.4542,
	"step": 85
	},
	{
	"epoch": 7.674418604651163,
	"grad_norm": 0.38979628682136536,
	"learning_rate": 1.1705240714107302e-05,
	"loss": 0.4771,
	"step": 86
	},
	{
	"epoch": 7.767441860465116,
	"grad_norm": 0.36468306183815,
	"learning_rate": 1.0120595370083318e-05,
	"loss": 0.4396,
	"step": 87
	},
	{
	"epoch": 7.8604651162790695,
	"grad_norm": 0.35426077246665955,
	"learning_rate": 8.645454235739903e-06,
	"loss": 0.4438,
	"step": 88
	},
	{
	"epoch": 7.953488372093023,
	"grad_norm": 0.35007694363594055,
	"learning_rate": 7.281614543321269e-06,
	"loss": 0.4494,
	"step": 89
	},
	{
	"epoch": 8.046511627906977,
	"grad_norm": 0.3535120189189911,
	"learning_rate": 6.030737921409169e-06,
	"loss": 0.5123,
	"step": 90
	},
	{
	"epoch": 8.093023255813954,
	"grad_norm": 0.3610997796058655,
	"learning_rate": 4.8943483704846475e-06,
	"loss": 0.4599,
	"step": 91
	},
	{
	"epoch": 8.186046511627907,
	"grad_norm": 0.3317432403564453,
	"learning_rate": 3.873830406168111e-06,
	"loss": 0.4375,
	"step": 92
	},
	{
	"epoch": 8.279069767441861,
	"grad_norm": 0.36092883348464966,
	"learning_rate": 2.970427372400353e-06,
	"loss": 0.4346,
	"step": 93
	},
	{
	"epoch": 8.372093023255815,
	"grad_norm": 0.32780343294143677,
	"learning_rate": 2.1852399266194314e-06,
	"loss": 0.444,
	"step": 94
	},
	{
	"epoch": 8.465116279069768,
	"grad_norm": 0.34713441133499146,
	"learning_rate": 1.5192246987791981e-06,
	"loss": 0.4946,
	"step": 95
	},
	{
	"epoch": 8.55813953488372,
	"grad_norm": 0.33522722125053406,
	"learning_rate": 9.731931258429638e-07,
	"loss": 0.4373,
	"step": 96
	},
	{
	"epoch": 8.651162790697674,
	"grad_norm": 0.35264119505882263,
	"learning_rate": 5.478104631726711e-07,
	"loss": 0.4604,
	"step": 97
	},
	{
	"epoch": 8.744186046511627,
	"grad_norm": 0.34864360094070435,
	"learning_rate": 2.4359497401758024e-07,
	"loss": 0.4255,
	"step": 98
	},
	{
	"epoch": 8.837209302325581,
	"grad_norm": 0.3404649794101715,
	"learning_rate": 6.09172980904238e-08,
	"loss": 0.4433,
	"step": 99
	},
	{
	"epoch": 8.930232558139535,
	"grad_norm": 0.3446452021598816,
	"learning_rate": 0.0,
	"loss": 0.4519,
	"step": 100
	}
	],
	"logging_steps": 1,
	"max_steps": 100,
	"num_input_tokens_seen": 0,
	"num_train_epochs": 10,
	"save_steps": 10,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": true
	},
	"attributes": {}
	}
	},
	"total_flos": 1.48456079980757e+17,
	"train_batch_size": 2,
	"trial_name": null,
	"trial_params": null
	}