{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9991245987744383,
"eval_steps": 36,
"global_step": 107,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009337613072658301,
"grad_norm": 1.3116651734261826,
"learning_rate": 1.25e-06,
"loss": 11.8241,
"step": 1
},
{
"epoch": 0.009337613072658301,
"eval_loss": 11.651493072509766,
"eval_runtime": 43.8088,
"eval_samples_per_second": 6.3,
"eval_steps_per_second": 0.799,
"step": 1
},
{
"epoch": 0.018675226145316602,
"grad_norm": 1.3220486767280257,
"learning_rate": 2.5e-06,
"loss": 11.8134,
"step": 2
},
{
"epoch": 0.028012839217974907,
"grad_norm": 1.323342086038301,
"learning_rate": 3.7500000000000005e-06,
"loss": 11.7915,
"step": 3
},
{
"epoch": 0.037350452290633204,
"grad_norm": 1.32468270774218,
"learning_rate": 5e-06,
"loss": 11.7306,
"step": 4
},
{
"epoch": 0.04668806536329151,
"grad_norm": 1.3580804351059697,
"learning_rate": 6.25e-06,
"loss": 11.4493,
"step": 5
},
{
"epoch": 0.05602567843594981,
"grad_norm": 1.4657782226093532,
"learning_rate": 7.500000000000001e-06,
"loss": 10.5146,
"step": 6
},
{
"epoch": 0.06536329150860812,
"grad_norm": 1.4570176977534868,
"learning_rate": 8.750000000000001e-06,
"loss": 7.5687,
"step": 7
},
{
"epoch": 0.07470090458126641,
"grad_norm": 1.3101839933421826,
"learning_rate": 1e-05,
"loss": 6.6374,
"step": 8
},
{
"epoch": 0.08403851765392471,
"grad_norm": 0.39797650661474604,
"learning_rate": 1.125e-05,
"loss": 4.0048,
"step": 9
},
{
"epoch": 0.09337613072658302,
"grad_norm": 0.29538196193252036,
"learning_rate": 1.25e-05,
"loss": 3.7439,
"step": 10
},
{
"epoch": 0.10271374379924132,
"grad_norm": 0.21371312404721451,
"learning_rate": 1.375e-05,
"loss": 3.5406,
"step": 11
},
{
"epoch": 0.11205135687189963,
"grad_norm": 0.36485008261730323,
"learning_rate": 1.5000000000000002e-05,
"loss": 3.3211,
"step": 12
},
{
"epoch": 0.12138896994455792,
"grad_norm": 0.2805852343578138,
"learning_rate": 1.6250000000000002e-05,
"loss": 3.1236,
"step": 13
},
{
"epoch": 0.13072658301721624,
"grad_norm": 0.2021156853252266,
"learning_rate": 1.7500000000000002e-05,
"loss": 2.8668,
"step": 14
},
{
"epoch": 0.14006419608987453,
"grad_norm": 0.15215118244217535,
"learning_rate": 1.8750000000000002e-05,
"loss": 2.6099,
"step": 15
},
{
"epoch": 0.14940180916253282,
"grad_norm": 0.13687350701334702,
"learning_rate": 2e-05,
"loss": 2.3842,
"step": 16
},
{
"epoch": 0.15873942223519114,
"grad_norm": 0.13750612099868206,
"learning_rate": 1.9999469523400122e-05,
"loss": 2.1991,
"step": 17
},
{
"epoch": 0.16807703530784943,
"grad_norm": 0.12651196118653027,
"learning_rate": 1.9997878149881576e-05,
"loss": 2.0405,
"step": 18
},
{
"epoch": 0.17741464838050774,
"grad_norm": 0.12217963693020709,
"learning_rate": 1.999522604828164e-05,
"loss": 1.867,
"step": 19
},
{
"epoch": 0.18675226145316604,
"grad_norm": 0.11161741227643726,
"learning_rate": 1.9991513499975883e-05,
"loss": 1.7081,
"step": 20
},
{
"epoch": 0.19608987452582433,
"grad_norm": 0.09431005577364608,
"learning_rate": 1.9986740898848306e-05,
"loss": 1.6022,
"step": 21
},
{
"epoch": 0.20542748759848264,
"grad_norm": 0.07544503987145361,
"learning_rate": 1.9980908751249556e-05,
"loss": 1.5179,
"step": 22
},
{
"epoch": 0.21476510067114093,
"grad_norm": 0.06853247429967851,
"learning_rate": 1.997401767594319e-05,
"loss": 1.4395,
"step": 23
},
{
"epoch": 0.22410271374379925,
"grad_norm": 0.05892769062015809,
"learning_rate": 1.996606840404006e-05,
"loss": 1.3577,
"step": 24
},
{
"epoch": 0.23344032681645754,
"grad_norm": 0.05337539608721018,
"learning_rate": 1.9957061778920703e-05,
"loss": 1.3032,
"step": 25
},
{
"epoch": 0.24277793988911583,
"grad_norm": 0.05279602192024426,
"learning_rate": 1.9946998756145894e-05,
"loss": 1.2436,
"step": 26
},
{
"epoch": 0.2521155529617741,
"grad_norm": 0.04928056172617358,
"learning_rate": 1.9935880403355255e-05,
"loss": 1.1964,
"step": 27
},
{
"epoch": 0.26145316603443247,
"grad_norm": 0.046581318628423386,
"learning_rate": 1.9923707900153984e-05,
"loss": 1.1161,
"step": 28
},
{
"epoch": 0.27079077910709076,
"grad_norm": 0.04485518736789617,
"learning_rate": 1.9910482537987704e-05,
"loss": 1.1,
"step": 29
},
{
"epoch": 0.28012839217974905,
"grad_norm": 0.03813950781953075,
"learning_rate": 1.989620572000544e-05,
"loss": 1.0508,
"step": 30
},
{
"epoch": 0.28946600525240734,
"grad_norm": 0.036622498559561,
"learning_rate": 1.9880878960910772e-05,
"loss": 1.0364,
"step": 31
},
{
"epoch": 0.29880361832506563,
"grad_norm": 0.03579080814636808,
"learning_rate": 1.9864503886801108e-05,
"loss": 1.0058,
"step": 32
},
{
"epoch": 0.308141231397724,
"grad_norm": 0.03146608814811976,
"learning_rate": 1.9847082234995172e-05,
"loss": 0.9954,
"step": 33
},
{
"epoch": 0.31747884447038227,
"grad_norm": 0.027371561053018377,
"learning_rate": 1.982861585384869e-05,
"loss": 0.9445,
"step": 34
},
{
"epoch": 0.32681645754304056,
"grad_norm": 0.024499632261339315,
"learning_rate": 1.9809106702558277e-05,
"loss": 0.938,
"step": 35
},
{
"epoch": 0.33615407061569885,
"grad_norm": 0.022391297579903467,
"learning_rate": 1.978855685095358e-05,
"loss": 0.9141,
"step": 36
},
{
"epoch": 0.33615407061569885,
"eval_loss": 0.8478350639343262,
"eval_runtime": 43.6583,
"eval_samples_per_second": 6.322,
"eval_steps_per_second": 0.802,
"step": 36
},
{
"epoch": 0.34549168368835714,
"grad_norm": 0.021886974051633205,
"learning_rate": 1.9766968479277684e-05,
"loss": 0.8907,
"step": 37
},
{
"epoch": 0.3548292967610155,
"grad_norm": 0.023142495322344372,
"learning_rate": 1.974434387795579e-05,
"loss": 0.9146,
"step": 38
},
{
"epoch": 0.3641669098336738,
"grad_norm": 0.021168627691228573,
"learning_rate": 1.972068544735221e-05,
"loss": 0.8564,
"step": 39
},
{
"epoch": 0.37350452290633207,
"grad_norm": 0.019500940158346962,
"learning_rate": 1.969599569751571e-05,
"loss": 0.8691,
"step": 40
},
{
"epoch": 0.38284213597899036,
"grad_norm": 0.017342490295839202,
"learning_rate": 1.9670277247913205e-05,
"loss": 0.8169,
"step": 41
},
{
"epoch": 0.39217974905164865,
"grad_norm": 0.01834359475842193,
"learning_rate": 1.964353282715183e-05,
"loss": 0.8285,
"step": 42
},
{
"epoch": 0.401517362124307,
"grad_norm": 0.01743926340900274,
"learning_rate": 1.961576527268946e-05,
"loss": 0.8185,
"step": 43
},
{
"epoch": 0.4108549751969653,
"grad_norm": 0.015589382188249705,
"learning_rate": 1.9586977530533677e-05,
"loss": 0.7824,
"step": 44
},
{
"epoch": 0.4201925882696236,
"grad_norm": 0.015465707621862244,
"learning_rate": 1.95571726549292e-05,
"loss": 0.7994,
"step": 45
},
{
"epoch": 0.42953020134228187,
"grad_norm": 0.01348325969968343,
"learning_rate": 1.9526353808033827e-05,
"loss": 0.786,
"step": 46
},
{
"epoch": 0.43886781441494016,
"grad_norm": 0.01248344639342667,
"learning_rate": 1.9494524259582994e-05,
"loss": 0.7841,
"step": 47
},
{
"epoch": 0.4482054274875985,
"grad_norm": 0.012287865038500807,
"learning_rate": 1.9461687386542826e-05,
"loss": 0.7703,
"step": 48
},
{
"epoch": 0.4575430405602568,
"grad_norm": 0.013137558740636913,
"learning_rate": 1.9427846672751873e-05,
"loss": 0.7522,
"step": 49
},
{
"epoch": 0.4668806536329151,
"grad_norm": 0.012405785040338313,
"learning_rate": 1.93930057085515e-05,
"loss": 0.7513,
"step": 50
},
{
"epoch": 0.4762182667055734,
"grad_norm": 0.012489962634362767,
"learning_rate": 1.9357168190404937e-05,
"loss": 0.743,
"step": 51
},
{
"epoch": 0.48555587977823167,
"grad_norm": 0.011604499619002773,
"learning_rate": 1.932033792050515e-05,
"loss": 0.7377,
"step": 52
},
{
"epoch": 0.49489349285089,
"grad_norm": 0.010226240702119492,
"learning_rate": 1.928251880637141e-05,
"loss": 0.723,
"step": 53
},
{
"epoch": 0.5042311059235483,
"grad_norm": 0.010262055155701614,
"learning_rate": 1.924371486043473e-05,
"loss": 0.7371,
"step": 54
},
{
"epoch": 0.5135687189962066,
"grad_norm": 0.009630481842938549,
"learning_rate": 1.920393019961217e-05,
"loss": 0.7295,
"step": 55
},
{
"epoch": 0.5229063320688649,
"grad_norm": 0.0093680195601059,
"learning_rate": 1.916316904487005e-05,
"loss": 0.712,
"step": 56
},
{
"epoch": 0.5322439451415232,
"grad_norm": 0.009110104257000179,
"learning_rate": 1.9121435720776122e-05,
"loss": 0.6959,
"step": 57
},
{
"epoch": 0.5415815582141815,
"grad_norm": 0.008883791285743962,
"learning_rate": 1.9078734655040763e-05,
"loss": 0.7056,
"step": 58
},
{
"epoch": 0.5509191712868398,
"grad_norm": 0.008661290168572833,
"learning_rate": 1.9035070378047204e-05,
"loss": 0.7031,
"step": 59
},
{
"epoch": 0.5602567843594981,
"grad_norm": 0.008574085915641946,
"learning_rate": 1.8990447522370886e-05,
"loss": 0.6973,
"step": 60
},
{
"epoch": 0.5695943974321565,
"grad_norm": 0.008014005266988974,
"learning_rate": 1.8944870822287957e-05,
"loss": 0.7075,
"step": 61
},
{
"epoch": 0.5789320105048147,
"grad_norm": 0.007587538749978625,
"learning_rate": 1.8898345113273e-05,
"loss": 0.6736,
"step": 62
},
{
"epoch": 0.588269623577473,
"grad_norm": 0.007458374217548868,
"learning_rate": 1.8850875331485996e-05,
"loss": 0.6811,
"step": 63
},
{
"epoch": 0.5976072366501313,
"grad_norm": 0.007383373412606801,
"learning_rate": 1.8802466513248635e-05,
"loss": 0.682,
"step": 64
},
{
"epoch": 0.6069448497227896,
"grad_norm": 0.007069434956330871,
"learning_rate": 1.8753123794509974e-05,
"loss": 0.6723,
"step": 65
},
{
"epoch": 0.616282462795448,
"grad_norm": 0.007152713173479847,
"learning_rate": 1.8702852410301556e-05,
"loss": 0.6663,
"step": 66
},
{
"epoch": 0.6256200758681062,
"grad_norm": 0.00764154365674685,
"learning_rate": 1.865165769418196e-05,
"loss": 0.6954,
"step": 67
},
{
"epoch": 0.6349576889407645,
"grad_norm": 0.007184416838244711,
"learning_rate": 1.8599545077670983e-05,
"loss": 0.6738,
"step": 68
},
{
"epoch": 0.6442953020134228,
"grad_norm": 0.006849637138186313,
"learning_rate": 1.854652008967335e-05,
"loss": 0.6679,
"step": 69
},
{
"epoch": 0.6536329150860811,
"grad_norm": 0.007061882929419886,
"learning_rate": 1.8492588355892125e-05,
"loss": 0.6765,
"step": 70
},
{
"epoch": 0.6629705281587395,
"grad_norm": 0.006338175952472431,
"learning_rate": 1.8437755598231857e-05,
"loss": 0.6695,
"step": 71
},
{
"epoch": 0.6723081412313977,
"grad_norm": 0.0060579092781975295,
"learning_rate": 1.8382027634191523e-05,
"loss": 0.6484,
"step": 72
},
{
"epoch": 0.6723081412313977,
"eval_loss": 0.6214176416397095,
"eval_runtime": 43.7617,
"eval_samples_per_second": 6.307,
"eval_steps_per_second": 0.8,
"step": 72
},
{
"epoch": 0.681645754304056,
"grad_norm": 0.006328927046317208,
"learning_rate": 1.8325410376247295e-05,
"loss": 0.6537,
"step": 73
},
{
"epoch": 0.6909833673767143,
"grad_norm": 0.006322879070385963,
"learning_rate": 1.826790983122527e-05,
"loss": 0.6792,
"step": 74
},
{
"epoch": 0.7003209804493726,
"grad_norm": 0.0059267041402792315,
"learning_rate": 1.8209532099664177e-05,
"loss": 0.6497,
"step": 75
},
{
"epoch": 0.709658593522031,
"grad_norm": 0.006265125029941557,
"learning_rate": 1.8150283375168112e-05,
"loss": 0.6375,
"step": 76
},
{
"epoch": 0.7189962065946892,
"grad_norm": 0.0057329086343299585,
"learning_rate": 1.8090169943749477e-05,
"loss": 0.6505,
"step": 77
},
{
"epoch": 0.7283338196673476,
"grad_norm": 0.006315125437389684,
"learning_rate": 1.8029198183162e-05,
"loss": 0.641,
"step": 78
},
{
"epoch": 0.7376714327400058,
"grad_norm": 0.00602643377500652,
"learning_rate": 1.796737456222413e-05,
"loss": 0.6702,
"step": 79
},
{
"epoch": 0.7470090458126641,
"grad_norm": 0.00618264758307804,
"learning_rate": 1.7904705640132717e-05,
"loss": 0.6414,
"step": 80
},
{
"epoch": 0.7563466588853225,
"grad_norm": 0.0058530581998478,
"learning_rate": 1.7841198065767107e-05,
"loss": 0.65,
"step": 81
},
{
"epoch": 0.7656842719579807,
"grad_norm": 0.005741561897188635,
"learning_rate": 1.7776858576983713e-05,
"loss": 0.659,
"step": 82
},
{
"epoch": 0.7750218850306391,
"grad_norm": 0.005350843870415206,
"learning_rate": 1.771169399990119e-05,
"loss": 0.6464,
"step": 83
},
{
"epoch": 0.7843594981032973,
"grad_norm": 0.005628603669293866,
"learning_rate": 1.7645711248176198e-05,
"loss": 0.6347,
"step": 84
},
{
"epoch": 0.7936971111759556,
"grad_norm": 0.006024684364906471,
"learning_rate": 1.7578917322269885e-05,
"loss": 0.6133,
"step": 85
},
{
"epoch": 0.803034724248614,
"grad_norm": 0.005557283692065632,
"learning_rate": 1.7511319308705198e-05,
"loss": 0.6244,
"step": 86
},
{
"epoch": 0.8123723373212722,
"grad_norm": 0.00553799083367584,
"learning_rate": 1.744292437931502e-05,
"loss": 0.6528,
"step": 87
},
{
"epoch": 0.8217099503939306,
"grad_norm": 0.005384068308149398,
"learning_rate": 1.7373739790481263e-05,
"loss": 0.6337,
"step": 88
},
{
"epoch": 0.8310475634665888,
"grad_norm": 0.005255083208402985,
"learning_rate": 1.7303772882365018e-05,
"loss": 0.6063,
"step": 89
},
{
"epoch": 0.8403851765392472,
"grad_norm": 0.005656017120071371,
"learning_rate": 1.723303107812779e-05,
"loss": 0.6194,
"step": 90
},
{
"epoch": 0.8497227896119055,
"grad_norm": 0.005727064114913842,
"learning_rate": 1.7161521883143936e-05,
"loss": 0.6359,
"step": 91
},
{
"epoch": 0.8590604026845637,
"grad_norm": 0.005332007932524027,
"learning_rate": 1.7089252884204376e-05,
"loss": 0.6188,
"step": 92
},
{
"epoch": 0.8683980157572221,
"grad_norm": 0.004895963878943312,
"learning_rate": 1.701623174871168e-05,
"loss": 0.6156,
"step": 93
},
{
"epoch": 0.8777356288298803,
"grad_norm": 0.0053198813356673084,
"learning_rate": 1.6942466223866582e-05,
"loss": 0.6165,
"step": 94
},
{
"epoch": 0.8870732419025387,
"grad_norm": 0.005384472223204587,
"learning_rate": 1.6867964135846043e-05,
"loss": 0.6182,
"step": 95
},
{
"epoch": 0.896410854975197,
"grad_norm": 0.005145515044871062,
"learning_rate": 1.679273338897293e-05,
"loss": 0.622,
"step": 96
},
{
"epoch": 0.9057484680478552,
"grad_norm": 0.005287999487944072,
"learning_rate": 1.6716781964877413e-05,
"loss": 0.6346,
"step": 97
},
{
"epoch": 0.9150860811205136,
"grad_norm": 0.005190281800590422,
"learning_rate": 1.664011792165012e-05,
"loss": 0.6219,
"step": 98
},
{
"epoch": 0.9244236941931718,
"grad_norm": 0.0051116136333854015,
"learning_rate": 1.6562749392987255e-05,
"loss": 0.6234,
"step": 99
},
{
"epoch": 0.9337613072658302,
"grad_norm": 0.005528069555146713,
"learning_rate": 1.648468458732762e-05,
"loss": 0.6238,
"step": 100
},
{
"epoch": 0.9430989203384885,
"grad_norm": 0.00505378298539984,
"learning_rate": 1.6405931786981753e-05,
"loss": 0.611,
"step": 101
},
{
"epoch": 0.9524365334111468,
"grad_norm": 0.005251284958033568,
"learning_rate": 1.6326499347253206e-05,
"loss": 0.6016,
"step": 102
},
{
"epoch": 0.9617741464838051,
"grad_norm": 0.0052283843925876325,
"learning_rate": 1.6246395695552086e-05,
"loss": 0.6224,
"step": 103
},
{
"epoch": 0.9711117595564633,
"grad_norm": 0.005202512120139698,
"learning_rate": 1.6165629330500952e-05,
"loss": 0.6082,
"step": 104
},
{
"epoch": 0.9804493726291217,
"grad_norm": 0.004915735280290327,
"learning_rate": 1.6084208821033152e-05,
"loss": 0.5931,
"step": 105
},
{
"epoch": 0.98978698570178,
"grad_norm": 0.0050879698990566845,
"learning_rate": 1.6002142805483686e-05,
"loss": 0.5945,
"step": 106
},
{
"epoch": 0.9991245987744383,
"grad_norm": 0.005267364898779865,
"learning_rate": 1.591943999067273e-05,
"loss": 0.6108,
"step": 107
}
],
"logging_steps": 1,
"max_steps": 321,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 107,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.6387261951962513e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}