{
"best_metric": 0.2019248753786087,
"best_model_checkpoint": "miner_id_24/checkpoint-100",
"epoch": 3.0247349823321557,
"eval_steps": 25,
"global_step": 107,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.028268551236749116,
"grad_norm": 0.1496291607618332,
"learning_rate": 2.5e-05,
"loss": 0.2719,
"step": 1
},
{
"epoch": 0.028268551236749116,
"eval_loss": 0.33758026361465454,
"eval_runtime": 2.2677,
"eval_samples_per_second": 22.049,
"eval_steps_per_second": 5.733,
"step": 1
},
{
"epoch": 0.05653710247349823,
"grad_norm": 0.19203834235668182,
"learning_rate": 5e-05,
"loss": 0.3043,
"step": 2
},
{
"epoch": 0.08480565371024736,
"grad_norm": 0.1976221799850464,
"learning_rate": 7.500000000000001e-05,
"loss": 0.3131,
"step": 3
},
{
"epoch": 0.11307420494699646,
"grad_norm": 0.18888598680496216,
"learning_rate": 0.0001,
"loss": 0.3167,
"step": 4
},
{
"epoch": 0.1413427561837456,
"grad_norm": 0.16196908056735992,
"learning_rate": 9.997906976305083e-05,
"loss": 0.3673,
"step": 5
},
{
"epoch": 0.1696113074204947,
"grad_norm": 0.13969330489635468,
"learning_rate": 9.991629852219523e-05,
"loss": 0.3187,
"step": 6
},
{
"epoch": 0.1978798586572438,
"grad_norm": 0.16544975340366364,
"learning_rate": 9.981174466929743e-05,
"loss": 0.3186,
"step": 7
},
{
"epoch": 0.22614840989399293,
"grad_norm": 0.1924961507320404,
"learning_rate": 9.966550546377587e-05,
"loss": 0.3365,
"step": 8
},
{
"epoch": 0.254416961130742,
"grad_norm": 0.12072353810071945,
"learning_rate": 9.947771694212933e-05,
"loss": 0.2508,
"step": 9
},
{
"epoch": 0.2826855123674912,
"grad_norm": 0.12432608753442764,
"learning_rate": 9.924855379139136e-05,
"loss": 0.2776,
"step": 10
},
{
"epoch": 0.31095406360424027,
"grad_norm": 0.10128191113471985,
"learning_rate": 9.897822918663062e-05,
"loss": 0.2447,
"step": 11
},
{
"epoch": 0.3392226148409894,
"grad_norm": 0.08782458305358887,
"learning_rate": 9.866699459264848e-05,
"loss": 0.2468,
"step": 12
},
{
"epoch": 0.3674911660777385,
"grad_norm": 0.07989340275526047,
"learning_rate": 9.831513953005823e-05,
"loss": 0.254,
"step": 13
},
{
"epoch": 0.3957597173144876,
"grad_norm": 0.07268672436475754,
"learning_rate": 9.792299130596348e-05,
"loss": 0.2531,
"step": 14
},
{
"epoch": 0.42402826855123676,
"grad_norm": 0.0746292918920517,
"learning_rate": 9.749091470948645e-05,
"loss": 0.2708,
"step": 15
},
{
"epoch": 0.45229681978798586,
"grad_norm": 0.07829014211893082,
"learning_rate": 9.70193116724291e-05,
"loss": 0.2828,
"step": 16
},
{
"epoch": 0.48056537102473496,
"grad_norm": 0.09487626701593399,
"learning_rate": 9.650862089538307e-05,
"loss": 0.3013,
"step": 17
},
{
"epoch": 0.508833922261484,
"grad_norm": 0.06753429770469666,
"learning_rate": 9.595931743963597e-05,
"loss": 0.2171,
"step": 18
},
{
"epoch": 0.5371024734982333,
"grad_norm": 0.07974890619516373,
"learning_rate": 9.537191228525384e-05,
"loss": 0.2098,
"step": 19
},
{
"epoch": 0.5653710247349824,
"grad_norm": 0.07332316040992737,
"learning_rate": 9.474695185575073e-05,
"loss": 0.2234,
"step": 20
},
{
"epoch": 0.5936395759717314,
"grad_norm": 0.06308283656835556,
"learning_rate": 9.408501750978769e-05,
"loss": 0.2219,
"step": 21
},
{
"epoch": 0.6219081272084805,
"grad_norm": 0.07921251654624939,
"learning_rate": 9.338672500037388e-05,
"loss": 0.237,
"step": 22
},
{
"epoch": 0.6501766784452296,
"grad_norm": 0.0856022983789444,
"learning_rate": 9.26527239020729e-05,
"loss": 0.2579,
"step": 23
},
{
"epoch": 0.6784452296819788,
"grad_norm": 0.0858432948589325,
"learning_rate": 9.188369700674736e-05,
"loss": 0.2633,
"step": 24
},
{
"epoch": 0.7067137809187279,
"grad_norm": 0.08091321587562561,
"learning_rate": 9.108035968840348e-05,
"loss": 0.2571,
"step": 25
},
{
"epoch": 0.7067137809187279,
"eval_loss": 0.22624744474887848,
"eval_runtime": 2.3489,
"eval_samples_per_second": 21.286,
"eval_steps_per_second": 5.534,
"step": 25
},
{
"epoch": 0.734982332155477,
"grad_norm": 0.09223099052906036,
"learning_rate": 9.024345923772673e-05,
"loss": 0.2829,
"step": 26
},
{
"epoch": 0.7632508833922261,
"grad_norm": 0.06381526589393616,
"learning_rate": 8.937377416692753e-05,
"loss": 0.1954,
"step": 27
},
{
"epoch": 0.7915194346289752,
"grad_norm": 0.0784890204668045,
"learning_rate": 8.847211348554383e-05,
"loss": 0.2135,
"step": 28
},
{
"epoch": 0.8197879858657244,
"grad_norm": 0.08194228261709213,
"learning_rate": 8.753931594787381e-05,
"loss": 0.2163,
"step": 29
},
{
"epoch": 0.8480565371024735,
"grad_norm": 0.07212778180837631,
"learning_rate": 8.65762492727392e-05,
"loss": 0.2276,
"step": 30
},
{
"epoch": 0.8763250883392226,
"grad_norm": 0.07101049274206161,
"learning_rate": 8.558380933630481e-05,
"loss": 0.2567,
"step": 31
},
{
"epoch": 0.9045936395759717,
"grad_norm": 0.06946881860494614,
"learning_rate": 8.456291933870523e-05,
"loss": 0.2489,
"step": 32
},
{
"epoch": 0.9328621908127208,
"grad_norm": 0.08646480739116669,
"learning_rate": 8.351452894525369e-05,
"loss": 0.2449,
"step": 33
},
{
"epoch": 0.9611307420494699,
"grad_norm": 0.09382244199514389,
"learning_rate": 8.243961340303246e-05,
"loss": 0.2713,
"step": 34
},
{
"epoch": 0.9893992932862191,
"grad_norm": 0.10727506130933762,
"learning_rate": 8.13391726336859e-05,
"loss": 0.2837,
"step": 35
},
{
"epoch": 1.017667844522968,
"grad_norm": 0.11025042831897736,
"learning_rate": 8.021423030326076e-05,
"loss": 0.3322,
"step": 36
},
{
"epoch": 1.0459363957597174,
"grad_norm": 0.05836005136370659,
"learning_rate": 7.906583286995835e-05,
"loss": 0.1968,
"step": 37
},
{
"epoch": 1.0742049469964665,
"grad_norm": 0.06407664716243744,
"learning_rate": 7.789504861068493e-05,
"loss": 0.1979,
"step": 38
},
{
"epoch": 1.1024734982332156,
"grad_norm": 0.07625511288642883,
"learning_rate": 7.670296662730553e-05,
"loss": 0.2166,
"step": 39
},
{
"epoch": 1.1307420494699647,
"grad_norm": 0.0838479995727539,
"learning_rate": 7.54906958335257e-05,
"loss": 0.2416,
"step": 40
},
{
"epoch": 1.1590106007067138,
"grad_norm": 0.08144625276327133,
"learning_rate": 7.425936392334369e-05,
"loss": 0.2472,
"step": 41
},
{
"epoch": 1.187279151943463,
"grad_norm": 0.06474898755550385,
"learning_rate": 7.301011632203251e-05,
"loss": 0.214,
"step": 42
},
{
"epoch": 1.215547703180212,
"grad_norm": 0.0715680941939354,
"learning_rate": 7.17441151206279e-05,
"loss": 0.2439,
"step": 43
},
{
"epoch": 1.243816254416961,
"grad_norm": 0.0650620236992836,
"learning_rate": 7.046253799491311e-05,
"loss": 0.1902,
"step": 44
},
{
"epoch": 1.2720848056537102,
"grad_norm": 0.12278559058904648,
"learning_rate": 6.916657710990633e-05,
"loss": 0.2531,
"step": 45
},
{
"epoch": 1.3003533568904593,
"grad_norm": 0.07810261845588684,
"learning_rate": 6.785743801086981e-05,
"loss": 0.2229,
"step": 46
},
{
"epoch": 1.3286219081272086,
"grad_norm": 0.06839544326066971,
"learning_rate": 6.653633850187212e-05,
"loss": 0.2014,
"step": 47
},
{
"epoch": 1.3568904593639575,
"grad_norm": 0.07193570584058762,
"learning_rate": 6.520450751294685e-05,
"loss": 0.2103,
"step": 48
},
{
"epoch": 1.3851590106007068,
"grad_norm": 0.09183719009160995,
"learning_rate": 6.386318395690179e-05,
"loss": 0.2542,
"step": 49
},
{
"epoch": 1.4134275618374559,
"grad_norm": 0.09221762418746948,
"learning_rate": 6.25136155768415e-05,
"loss": 0.2299,
"step": 50
},
{
"epoch": 1.4134275618374559,
"eval_loss": 0.21029320359230042,
"eval_runtime": 2.4406,
"eval_samples_per_second": 20.487,
"eval_steps_per_second": 5.327,
"step": 50
},
{
"epoch": 1.441696113074205,
"grad_norm": 0.10233300924301147,
"learning_rate": 6.115705778547597e-05,
"loss": 0.238,
"step": 51
},
{
"epoch": 1.469964664310954,
"grad_norm": 0.08884930610656738,
"learning_rate": 5.979477249729443e-05,
"loss": 0.2443,
"step": 52
},
{
"epoch": 1.4982332155477032,
"grad_norm": 0.06135065108537674,
"learning_rate": 5.842802695469132e-05,
"loss": 0.1793,
"step": 53
},
{
"epoch": 1.5265017667844523,
"grad_norm": 0.0772595927119255,
"learning_rate": 5.705809254913577e-05,
"loss": 0.2317,
"step": 54
},
{
"epoch": 1.5547703180212014,
"grad_norm": 0.07231206446886063,
"learning_rate": 5.568624363848167e-05,
"loss": 0.1857,
"step": 55
},
{
"epoch": 1.5830388692579507,
"grad_norm": 0.07848239690065384,
"learning_rate": 5.431375636151834e-05,
"loss": 0.2049,
"step": 56
},
{
"epoch": 1.6113074204946995,
"grad_norm": 0.07845490425825119,
"learning_rate": 5.294190745086426e-05,
"loss": 0.2081,
"step": 57
},
{
"epoch": 1.6395759717314489,
"grad_norm": 0.0769682452082634,
"learning_rate": 5.15719730453087e-05,
"loss": 0.2242,
"step": 58
},
{
"epoch": 1.6678445229681977,
"grad_norm": 0.07719448208808899,
"learning_rate": 5.020522750270559e-05,
"loss": 0.2138,
"step": 59
},
{
"epoch": 1.696113074204947,
"grad_norm": 0.08276604861021042,
"learning_rate": 4.884294221452406e-05,
"loss": 0.2339,
"step": 60
},
{
"epoch": 1.7243816254416962,
"grad_norm": 0.09187748283147812,
"learning_rate": 4.7486384423158514e-05,
"loss": 0.2585,
"step": 61
},
{
"epoch": 1.7526501766784452,
"grad_norm": 0.06316478550434113,
"learning_rate": 4.613681604309824e-05,
"loss": 0.1936,
"step": 62
},
{
"epoch": 1.7809187279151943,
"grad_norm": 0.06568682938814163,
"learning_rate": 4.479549248705316e-05,
"loss": 0.1895,
"step": 63
},
{
"epoch": 1.8091872791519434,
"grad_norm": 0.07526635378599167,
"learning_rate": 4.346366149812791e-05,
"loss": 0.1954,
"step": 64
},
{
"epoch": 1.8374558303886925,
"grad_norm": 0.07216266542673111,
"learning_rate": 4.2142561989130204e-05,
"loss": 0.1972,
"step": 65
},
{
"epoch": 1.8657243816254416,
"grad_norm": 0.07032765448093414,
"learning_rate": 4.0833422890093684e-05,
"loss": 0.1875,
"step": 66
},
{
"epoch": 1.893992932862191,
"grad_norm": 0.0753110945224762,
"learning_rate": 3.9537462005086936e-05,
"loss": 0.2326,
"step": 67
},
{
"epoch": 1.9222614840989398,
"grad_norm": 0.07524023950099945,
"learning_rate": 3.825588487937212e-05,
"loss": 0.235,
"step": 68
},
{
"epoch": 1.9505300353356891,
"grad_norm": 0.0842115730047226,
"learning_rate": 3.6989883677967485e-05,
"loss": 0.2449,
"step": 69
},
{
"epoch": 1.978798586572438,
"grad_norm": 0.10185158252716064,
"learning_rate": 3.574063607665633e-05,
"loss": 0.2608,
"step": 70
},
{
"epoch": 2.0070671378091873,
"grad_norm": 0.14404942095279694,
"learning_rate": 3.450930416647429e-05,
"loss": 0.3676,
"step": 71
},
{
"epoch": 2.035335689045936,
"grad_norm": 0.060068968683481216,
"learning_rate": 3.3297033372694477e-05,
"loss": 0.1747,
"step": 72
},
{
"epoch": 2.0636042402826855,
"grad_norm": 0.06130439415574074,
"learning_rate": 3.2104951389315077e-05,
"loss": 0.2236,
"step": 73
},
{
"epoch": 2.091872791519435,
"grad_norm": 0.06732094287872314,
"learning_rate": 3.093416713004167e-05,
"loss": 0.1928,
"step": 74
},
{
"epoch": 2.1201413427561837,
"grad_norm": 0.07202989608049393,
"learning_rate": 2.9785769696739264e-05,
"loss": 0.2016,
"step": 75
},
{
"epoch": 2.1201413427561837,
"eval_loss": 0.20411866903305054,
"eval_runtime": 2.3425,
"eval_samples_per_second": 21.345,
"eval_steps_per_second": 5.55,
"step": 75
},
{
"epoch": 2.148409893992933,
"grad_norm": 0.08088778704404831,
"learning_rate": 2.86608273663141e-05,
"loss": 0.2475,
"step": 76
},
{
"epoch": 2.176678445229682,
"grad_norm": 0.07822652906179428,
"learning_rate": 2.7560386596967557e-05,
"loss": 0.2136,
"step": 77
},
{
"epoch": 2.204946996466431,
"grad_norm": 0.080837681889534,
"learning_rate": 2.6485471054746318e-05,
"loss": 0.2256,
"step": 78
},
{
"epoch": 2.23321554770318,
"grad_norm": 0.07773551344871521,
"learning_rate": 2.5437080661294786e-05,
"loss": 0.2158,
"step": 79
},
{
"epoch": 2.2614840989399294,
"grad_norm": 0.07530491799116135,
"learning_rate": 2.4416190663695194e-05,
"loss": 0.206,
"step": 80
},
{
"epoch": 2.2897526501766783,
"grad_norm": 0.07282233983278275,
"learning_rate": 2.3423750727260816e-05,
"loss": 0.1726,
"step": 81
},
{
"epoch": 2.3180212014134276,
"grad_norm": 0.0700765922665596,
"learning_rate": 2.2460684052126197e-05,
"loss": 0.1814,
"step": 82
},
{
"epoch": 2.3462897526501765,
"grad_norm": 0.06933876872062683,
"learning_rate": 2.152788651445618e-05,
"loss": 0.1945,
"step": 83
},
{
"epoch": 2.374558303886926,
"grad_norm": 0.06883740425109863,
"learning_rate": 2.0626225833072487e-05,
"loss": 0.2101,
"step": 84
},
{
"epoch": 2.402826855123675,
"grad_norm": 0.06986892968416214,
"learning_rate": 1.97565407622733e-05,
"loss": 0.1994,
"step": 85
},
{
"epoch": 2.431095406360424,
"grad_norm": 0.07742994278669357,
"learning_rate": 1.891964031159653e-05,
"loss": 0.2147,
"step": 86
},
{
"epoch": 2.4593639575971733,
"grad_norm": 0.08387456834316254,
"learning_rate": 1.8116302993252637e-05,
"loss": 0.2197,
"step": 87
},
{
"epoch": 2.487632508833922,
"grad_norm": 0.08557935804128647,
"learning_rate": 1.7347276097927105e-05,
"loss": 0.2097,
"step": 88
},
{
"epoch": 2.5159010600706715,
"grad_norm": 0.06514272093772888,
"learning_rate": 1.6613274999626137e-05,
"loss": 0.2136,
"step": 89
},
{
"epoch": 2.5441696113074204,
"grad_norm": 0.05643463507294655,
"learning_rate": 1.5914982490212312e-05,
"loss": 0.1907,
"step": 90
},
{
"epoch": 2.5724381625441697,
"grad_norm": 0.06008900701999664,
"learning_rate": 1.5253048144249275e-05,
"loss": 0.1835,
"step": 91
},
{
"epoch": 2.6007067137809186,
"grad_norm": 0.06434128433465958,
"learning_rate": 1.4628087714746172e-05,
"loss": 0.189,
"step": 92
},
{
"epoch": 2.628975265017668,
"grad_norm": 0.07678020000457764,
"learning_rate": 1.4040682560364033e-05,
"loss": 0.2275,
"step": 93
},
{
"epoch": 2.657243816254417,
"grad_norm": 0.07552898675203323,
"learning_rate": 1.3491379104616938e-05,
"loss": 0.2218,
"step": 94
},
{
"epoch": 2.685512367491166,
"grad_norm": 0.0758337527513504,
"learning_rate": 1.2980688327570905e-05,
"loss": 0.2201,
"step": 95
},
{
"epoch": 2.713780918727915,
"grad_norm": 0.08446727693080902,
"learning_rate": 1.2509085290513564e-05,
"loss": 0.2429,
"step": 96
},
{
"epoch": 2.7420494699646643,
"grad_norm": 0.08306104689836502,
"learning_rate": 1.2077008694036528e-05,
"loss": 0.2056,
"step": 97
},
{
"epoch": 2.7703180212014136,
"grad_norm": 0.0664680078625679,
"learning_rate": 1.1684860469941786e-05,
"loss": 0.1927,
"step": 98
},
{
"epoch": 2.7985865724381624,
"grad_norm": 0.06700006872415543,
"learning_rate": 1.1333005407351517e-05,
"loss": 0.1953,
"step": 99
},
{
"epoch": 2.8268551236749118,
"grad_norm": 0.06202859431505203,
"learning_rate": 1.1021770813369377e-05,
"loss": 0.1882,
"step": 100
},
{
"epoch": 2.8268551236749118,
"eval_loss": 0.2019248753786087,
"eval_runtime": 2.346,
"eval_samples_per_second": 21.313,
"eval_steps_per_second": 5.541,
"step": 100
},
{
"epoch": 2.8551236749116606,
"grad_norm": 0.06783071905374527,
"learning_rate": 1.0751446208608642e-05,
"loss": 0.2021,
"step": 101
},
{
"epoch": 2.88339222614841,
"grad_norm": 0.06956081092357635,
"learning_rate": 1.0522283057870676e-05,
"loss": 0.2095,
"step": 102
},
{
"epoch": 2.9116607773851593,
"grad_norm": 0.07598631829023361,
"learning_rate": 1.0334494536224147e-05,
"loss": 0.2283,
"step": 103
},
{
"epoch": 2.939929328621908,
"grad_norm": 0.07840535044670105,
"learning_rate": 1.0188255330702585e-05,
"loss": 0.2234,
"step": 104
},
{
"epoch": 2.968197879858657,
"grad_norm": 0.08693535625934601,
"learning_rate": 1.008370147780478e-05,
"loss": 0.2282,
"step": 105
},
{
"epoch": 2.9964664310954063,
"grad_norm": 0.12355560064315796,
"learning_rate": 1.0020930236949183e-05,
"loss": 0.3288,
"step": 106
},
{
"epoch": 3.0247349823321557,
"grad_norm": 0.06719771027565002,
"learning_rate": 1e-05,
"loss": 0.2029,
"step": 107
}
],
"logging_steps": 1,
"max_steps": 107,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 30,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.4028515042797814e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}