{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.980132450331126,
"eval_steps": 500,
"global_step": 225,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013245033112582781,
"grad_norm": 0.8096176088552035,
"learning_rate": 8.695652173913044e-06,
"loss": 1.2541,
"step": 1
},
{
"epoch": 0.026490066225165563,
"grad_norm": 0.8050822017472643,
"learning_rate": 1.739130434782609e-05,
"loss": 1.227,
"step": 2
},
{
"epoch": 0.039735099337748346,
"grad_norm": 0.7944772711887119,
"learning_rate": 2.608695652173913e-05,
"loss": 1.2415,
"step": 3
},
{
"epoch": 0.052980132450331126,
"grad_norm": 0.7598134845438774,
"learning_rate": 3.478260869565218e-05,
"loss": 1.1949,
"step": 4
},
{
"epoch": 0.06622516556291391,
"grad_norm": 0.7683127560022982,
"learning_rate": 4.347826086956522e-05,
"loss": 1.2093,
"step": 5
},
{
"epoch": 0.07947019867549669,
"grad_norm": 0.5344525931760804,
"learning_rate": 5.217391304347826e-05,
"loss": 1.1036,
"step": 6
},
{
"epoch": 0.09271523178807947,
"grad_norm": 0.4587044664340658,
"learning_rate": 6.086956521739131e-05,
"loss": 1.0166,
"step": 7
},
{
"epoch": 0.10596026490066225,
"grad_norm": 0.4868625164917359,
"learning_rate": 6.956521739130436e-05,
"loss": 0.955,
"step": 8
},
{
"epoch": 0.11920529801324503,
"grad_norm": 0.5418471125188639,
"learning_rate": 7.82608695652174e-05,
"loss": 0.8997,
"step": 9
},
{
"epoch": 0.13245033112582782,
"grad_norm": 0.5223349521251892,
"learning_rate": 8.695652173913044e-05,
"loss": 0.8113,
"step": 10
},
{
"epoch": 0.1456953642384106,
"grad_norm": 0.4786982568033246,
"learning_rate": 9.565217391304348e-05,
"loss": 0.7325,
"step": 11
},
{
"epoch": 0.15894039735099338,
"grad_norm": 0.46957216029807536,
"learning_rate": 0.00010434782608695653,
"loss": 0.6606,
"step": 12
},
{
"epoch": 0.17218543046357615,
"grad_norm": 0.38029367288689914,
"learning_rate": 0.00011304347826086956,
"loss": 0.5808,
"step": 13
},
{
"epoch": 0.18543046357615894,
"grad_norm": 0.24720582418095602,
"learning_rate": 0.00012173913043478263,
"loss": 0.5613,
"step": 14
},
{
"epoch": 0.1986754966887417,
"grad_norm": 0.23099067802861695,
"learning_rate": 0.00013043478260869567,
"loss": 0.5391,
"step": 15
},
{
"epoch": 0.2119205298013245,
"grad_norm": 0.20957820248410008,
"learning_rate": 0.0001391304347826087,
"loss": 0.539,
"step": 16
},
{
"epoch": 0.2251655629139073,
"grad_norm": 0.21711931182463448,
"learning_rate": 0.00014782608695652173,
"loss": 0.5268,
"step": 17
},
{
"epoch": 0.23841059602649006,
"grad_norm": 0.1951790595421549,
"learning_rate": 0.0001565217391304348,
"loss": 0.4963,
"step": 18
},
{
"epoch": 0.25165562913907286,
"grad_norm": 0.1826409685431601,
"learning_rate": 0.00016521739130434784,
"loss": 0.4952,
"step": 19
},
{
"epoch": 0.26490066225165565,
"grad_norm": 0.14373385619543355,
"learning_rate": 0.00017391304347826088,
"loss": 0.4837,
"step": 20
},
{
"epoch": 0.2781456953642384,
"grad_norm": 0.12173908533781636,
"learning_rate": 0.00018260869565217392,
"loss": 0.4634,
"step": 21
},
{
"epoch": 0.2913907284768212,
"grad_norm": 0.12297735060498352,
"learning_rate": 0.00019130434782608697,
"loss": 0.4573,
"step": 22
},
{
"epoch": 0.304635761589404,
"grad_norm": 0.10994270746188307,
"learning_rate": 0.0002,
"loss": 0.4683,
"step": 23
},
{
"epoch": 0.31788079470198677,
"grad_norm": 0.11351044281096902,
"learning_rate": 0.00019998790632601496,
"loss": 0.4322,
"step": 24
},
{
"epoch": 0.33112582781456956,
"grad_norm": 0.11243087776192183,
"learning_rate": 0.00019995162822919883,
"loss": 0.4516,
"step": 25
},
{
"epoch": 0.3443708609271523,
"grad_norm": 0.11510175208476785,
"learning_rate": 0.00019989117448426108,
"loss": 0.4499,
"step": 26
},
{
"epoch": 0.3576158940397351,
"grad_norm": 0.11693433753737806,
"learning_rate": 0.00019980655971335945,
"loss": 0.4542,
"step": 27
},
{
"epoch": 0.3708609271523179,
"grad_norm": 0.11467246423231502,
"learning_rate": 0.00019969780438256293,
"loss": 0.4337,
"step": 28
},
{
"epoch": 0.3841059602649007,
"grad_norm": 0.11115653137915112,
"learning_rate": 0.0001995649347969019,
"loss": 0.4263,
"step": 29
},
{
"epoch": 0.3973509933774834,
"grad_norm": 0.11024786542483019,
"learning_rate": 0.00019940798309400526,
"loss": 0.4342,
"step": 30
},
{
"epoch": 0.4105960264900662,
"grad_norm": 0.10312580553142063,
"learning_rate": 0.00019922698723632767,
"loss": 0.4267,
"step": 31
},
{
"epoch": 0.423841059602649,
"grad_norm": 0.11074151337400631,
"learning_rate": 0.00019902199100196697,
"loss": 0.4286,
"step": 32
},
{
"epoch": 0.4370860927152318,
"grad_norm": 0.09029943151079976,
"learning_rate": 0.0001987930439740757,
"loss": 0.4152,
"step": 33
},
{
"epoch": 0.4503311258278146,
"grad_norm": 0.09101826700354056,
"learning_rate": 0.00019854020152886814,
"loss": 0.4313,
"step": 34
},
{
"epoch": 0.46357615894039733,
"grad_norm": 0.0914630983642065,
"learning_rate": 0.00019826352482222638,
"loss": 0.4117,
"step": 35
},
{
"epoch": 0.4768211920529801,
"grad_norm": 0.09219697877770537,
"learning_rate": 0.00019796308077490817,
"loss": 0.4175,
"step": 36
},
{
"epoch": 0.4900662251655629,
"grad_norm": 0.08852002864296264,
"learning_rate": 0.00019763894205636072,
"loss": 0.4041,
"step": 37
},
{
"epoch": 0.5033112582781457,
"grad_norm": 0.08580676378486166,
"learning_rate": 0.00019729118706714375,
"loss": 0.404,
"step": 38
},
{
"epoch": 0.5165562913907285,
"grad_norm": 0.08598698501328113,
"learning_rate": 0.00019691989991996663,
"loss": 0.4087,
"step": 39
},
{
"epoch": 0.5298013245033113,
"grad_norm": 0.08961053716539952,
"learning_rate": 0.00019652517041934356,
"loss": 0.4014,
"step": 40
},
{
"epoch": 0.543046357615894,
"grad_norm": 0.08443482401797175,
"learning_rate": 0.00019610709403987246,
"loss": 0.4137,
"step": 41
},
{
"epoch": 0.5562913907284768,
"grad_norm": 0.08466021640310874,
"learning_rate": 0.00019566577190314197,
"loss": 0.4071,
"step": 42
},
{
"epoch": 0.5695364238410596,
"grad_norm": 0.08784527020927076,
"learning_rate": 0.00019520131075327298,
"loss": 0.4061,
"step": 43
},
{
"epoch": 0.5827814569536424,
"grad_norm": 0.08325332082087357,
"learning_rate": 0.00019471382293110003,
"loss": 0.3957,
"step": 44
},
{
"epoch": 0.5960264900662252,
"grad_norm": 0.08614805595781429,
"learning_rate": 0.0001942034263469989,
"loss": 0.4053,
"step": 45
},
{
"epoch": 0.609271523178808,
"grad_norm": 0.07902174863469037,
"learning_rate": 0.00019367024445236754,
"loss": 0.3987,
"step": 46
},
{
"epoch": 0.6225165562913907,
"grad_norm": 0.08133695710941313,
"learning_rate": 0.00019311440620976597,
"loss": 0.3942,
"step": 47
},
{
"epoch": 0.6357615894039735,
"grad_norm": 0.08276360028919133,
"learning_rate": 0.00019253604606172417,
"loss": 0.3951,
"step": 48
},
{
"epoch": 0.6490066225165563,
"grad_norm": 0.08194802489692825,
"learning_rate": 0.00019193530389822363,
"loss": 0.3917,
"step": 49
},
{
"epoch": 0.6622516556291391,
"grad_norm": 0.08159974959706186,
"learning_rate": 0.00019131232502286188,
"loss": 0.3934,
"step": 50
},
{
"epoch": 0.6754966887417219,
"grad_norm": 0.08170998905157066,
"learning_rate": 0.00019066726011770726,
"loss": 0.3851,
"step": 51
},
{
"epoch": 0.6887417218543046,
"grad_norm": 0.08020907094953274,
"learning_rate": 0.00019000026520685302,
"loss": 0.3893,
"step": 52
},
{
"epoch": 0.7019867549668874,
"grad_norm": 0.08034981466771474,
"learning_rate": 0.00018931150161867916,
"loss": 0.381,
"step": 53
},
{
"epoch": 0.7152317880794702,
"grad_norm": 0.08444845993593682,
"learning_rate": 0.00018860113594683148,
"loss": 0.3915,
"step": 54
},
{
"epoch": 0.7284768211920529,
"grad_norm": 0.08015215412606266,
"learning_rate": 0.00018786934000992688,
"loss": 0.3833,
"step": 55
},
{
"epoch": 0.7417218543046358,
"grad_norm": 0.08464858931007045,
"learning_rate": 0.00018711629080999504,
"loss": 0.3826,
"step": 56
},
{
"epoch": 0.7549668874172185,
"grad_norm": 0.08291520407405459,
"learning_rate": 0.00018634217048966637,
"loss": 0.3738,
"step": 57
},
{
"epoch": 0.7682119205298014,
"grad_norm": 0.08660040487398858,
"learning_rate": 0.0001855471662881164,
"loss": 0.3856,
"step": 58
},
{
"epoch": 0.7814569536423841,
"grad_norm": 0.0857196214995308,
"learning_rate": 0.00018473147049577774,
"loss": 0.3779,
"step": 59
},
{
"epoch": 0.7947019867549668,
"grad_norm": 0.07987880371713715,
"learning_rate": 0.00018389528040783012,
"loss": 0.3766,
"step": 60
},
{
"epoch": 0.8079470198675497,
"grad_norm": 0.08369440099668185,
"learning_rate": 0.00018303879827647975,
"loss": 0.3835,
"step": 61
},
{
"epoch": 0.8211920529801324,
"grad_norm": 0.08373532556639413,
"learning_rate": 0.00018216223126204007,
"loss": 0.3745,
"step": 62
},
{
"epoch": 0.8344370860927153,
"grad_norm": 0.08073536197157054,
"learning_rate": 0.00018126579138282503,
"loss": 0.3687,
"step": 63
},
{
"epoch": 0.847682119205298,
"grad_norm": 0.08284465509601228,
"learning_rate": 0.00018034969546386757,
"loss": 0.3787,
"step": 64
},
{
"epoch": 0.8609271523178808,
"grad_norm": 0.0842934427371451,
"learning_rate": 0.00017941416508447536,
"loss": 0.3873,
"step": 65
},
{
"epoch": 0.8741721854304636,
"grad_norm": 0.08355593713327628,
"learning_rate": 0.0001784594265246366,
"loss": 0.3778,
"step": 66
},
{
"epoch": 0.8874172185430463,
"grad_norm": 0.08950539941436171,
"learning_rate": 0.000177485710710289,
"loss": 0.3727,
"step": 67
},
{
"epoch": 0.9006622516556292,
"grad_norm": 0.08710263548451828,
"learning_rate": 0.00017649325315746478,
"loss": 0.3808,
"step": 68
},
{
"epoch": 0.9139072847682119,
"grad_norm": 0.0887614198652171,
"learning_rate": 0.00017548229391532572,
"loss": 0.3789,
"step": 69
},
{
"epoch": 0.9271523178807947,
"grad_norm": 0.08666661250569707,
"learning_rate": 0.0001744530775081015,
"loss": 0.3732,
"step": 70
},
{
"epoch": 0.9403973509933775,
"grad_norm": 0.0849525268450149,
"learning_rate": 0.00017340585287594604,
"loss": 0.3712,
"step": 71
},
{
"epoch": 0.9536423841059603,
"grad_norm": 0.08625788315304235,
"learning_rate": 0.00017234087331472497,
"loss": 0.3597,
"step": 72
},
{
"epoch": 0.9668874172185431,
"grad_norm": 0.07851130512605926,
"learning_rate": 0.00017125839641475072,
"loss": 0.3639,
"step": 73
},
{
"epoch": 0.9801324503311258,
"grad_norm": 0.08964240238751611,
"learning_rate": 0.00017015868399847768,
"loss": 0.3844,
"step": 74
},
{
"epoch": 0.9933774834437086,
"grad_norm": 0.08516340365396252,
"learning_rate": 0.0001690420020571747,
"loss": 0.372,
"step": 75
},
{
"epoch": 0.9933774834437086,
"eval_loss": 0.3703567683696747,
"eval_runtime": 46.123,
"eval_samples_per_second": 21.941,
"eval_steps_per_second": 0.694,
"step": 75
},
{
"epoch": 1.0066225165562914,
"grad_norm": 0.07944382362889917,
"learning_rate": 0.0001679086206865886,
"loss": 0.3697,
"step": 76
},
{
"epoch": 1.0198675496688743,
"grad_norm": 0.08265930361903498,
"learning_rate": 0.00016675881402161536,
"loss": 0.3551,
"step": 77
},
{
"epoch": 1.033112582781457,
"grad_norm": 0.08703614399996357,
"learning_rate": 0.000165592860169994,
"loss": 0.3442,
"step": 78
},
{
"epoch": 1.0463576158940397,
"grad_norm": 0.08916319509375828,
"learning_rate": 0.0001644110411450398,
"loss": 0.365,
"step": 79
},
{
"epoch": 1.0596026490066226,
"grad_norm": 0.08703848127871557,
"learning_rate": 0.00016321364279743266,
"loss": 0.3611,
"step": 80
},
{
"epoch": 1.0728476821192052,
"grad_norm": 0.09052558000694078,
"learning_rate": 0.00016200095474607753,
"loss": 0.3615,
"step": 81
},
{
"epoch": 1.086092715231788,
"grad_norm": 0.08918100371610707,
"learning_rate": 0.0001607732703080532,
"loss": 0.342,
"step": 82
},
{
"epoch": 1.099337748344371,
"grad_norm": 0.08576575268439565,
"learning_rate": 0.0001595308864276666,
"loss": 0.3598,
"step": 83
},
{
"epoch": 1.1125827814569536,
"grad_norm": 0.08585017464402006,
"learning_rate": 0.0001582741036046301,
"loss": 0.3504,
"step": 84
},
{
"epoch": 1.1258278145695364,
"grad_norm": 0.08593452414859805,
"learning_rate": 0.00015700322582137827,
"loss": 0.3432,
"step": 85
},
{
"epoch": 1.1390728476821192,
"grad_norm": 0.08731970510720415,
"learning_rate": 0.00015571856046954285,
"loss": 0.3457,
"step": 86
},
{
"epoch": 1.152317880794702,
"grad_norm": 0.0921843418842424,
"learning_rate": 0.00015442041827560274,
"loss": 0.3507,
"step": 87
},
{
"epoch": 1.1655629139072847,
"grad_norm": 0.09651961400159455,
"learning_rate": 0.00015310911322572753,
"loss": 0.3596,
"step": 88
},
{
"epoch": 1.1788079470198676,
"grad_norm": 0.08524005048376013,
"learning_rate": 0.00015178496248983254,
"loss": 0.3554,
"step": 89
},
{
"epoch": 1.1920529801324504,
"grad_norm": 0.08859594152270273,
"learning_rate": 0.000150448286344864,
"loss": 0.3551,
"step": 90
},
{
"epoch": 1.205298013245033,
"grad_norm": 0.0924808469627539,
"learning_rate": 0.00014909940809733222,
"loss": 0.3525,
"step": 91
},
{
"epoch": 1.218543046357616,
"grad_norm": 0.08644059805052462,
"learning_rate": 0.00014773865400511272,
"loss": 0.3503,
"step": 92
},
{
"epoch": 1.2317880794701987,
"grad_norm": 0.09131894341880005,
"learning_rate": 0.00014636635319853275,
"loss": 0.3571,
"step": 93
},
{
"epoch": 1.2450331125827814,
"grad_norm": 0.08393682045402433,
"learning_rate": 0.0001449828376007636,
"loss": 0.3476,
"step": 94
},
{
"epoch": 1.2582781456953642,
"grad_norm": 0.08696313045637266,
"learning_rate": 0.00014358844184753712,
"loss": 0.3594,
"step": 95
},
{
"epoch": 1.271523178807947,
"grad_norm": 0.09458041630505085,
"learning_rate": 0.00014218350320620624,
"loss": 0.3626,
"step": 96
},
{
"epoch": 1.2847682119205297,
"grad_norm": 0.08823303635376296,
"learning_rate": 0.00014076836149416887,
"loss": 0.3499,
"step": 97
},
{
"epoch": 1.2980132450331126,
"grad_norm": 0.09294675372857181,
"learning_rate": 0.00013934335899667527,
"loss": 0.3539,
"step": 98
},
{
"epoch": 1.3112582781456954,
"grad_norm": 0.08824268036877034,
"learning_rate": 0.00013790884038403795,
"loss": 0.3514,
"step": 99
},
{
"epoch": 1.3245033112582782,
"grad_norm": 0.08535480262896947,
"learning_rate": 0.00013646515262826552,
"loss": 0.345,
"step": 100
},
{
"epoch": 1.3377483443708609,
"grad_norm": 0.08847562725166169,
"learning_rate": 0.00013501264491913906,
"loss": 0.3616,
"step": 101
},
{
"epoch": 1.3509933774834437,
"grad_norm": 0.08859058434854095,
"learning_rate": 0.0001335516685797525,
"loss": 0.3562,
"step": 102
},
{
"epoch": 1.3642384105960264,
"grad_norm": 0.08715025975746184,
"learning_rate": 0.00013208257698153677,
"loss": 0.3455,
"step": 103
},
{
"epoch": 1.3774834437086092,
"grad_norm": 0.0853594568437305,
"learning_rate": 0.00013060572545878875,
"loss": 0.346,
"step": 104
},
{
"epoch": 1.390728476821192,
"grad_norm": 0.08722491192064814,
"learning_rate": 0.00012912147122272523,
"loss": 0.3555,
"step": 105
},
{
"epoch": 1.403973509933775,
"grad_norm": 0.0871433664730764,
"learning_rate": 0.00012763017327508305,
"loss": 0.3556,
"step": 106
},
{
"epoch": 1.4172185430463577,
"grad_norm": 0.08803547541904783,
"learning_rate": 0.00012613219232128608,
"loss": 0.3534,
"step": 107
},
{
"epoch": 1.4304635761589404,
"grad_norm": 0.09122226233927531,
"learning_rate": 0.00012462789068320017,
"loss": 0.3569,
"step": 108
},
{
"epoch": 1.4437086092715232,
"grad_norm": 0.09822341257641279,
"learning_rate": 0.000123117632211497,
"loss": 0.3633,
"step": 109
},
{
"epoch": 1.4569536423841059,
"grad_norm": 0.09270090775666746,
"learning_rate": 0.00012160178219764837,
"loss": 0.3453,
"step": 110
},
{
"epoch": 1.4701986754966887,
"grad_norm": 0.08925565696630358,
"learning_rate": 0.00012008070728557186,
"loss": 0.3508,
"step": 111
},
{
"epoch": 1.4834437086092715,
"grad_norm": 0.09170653617303556,
"learning_rate": 0.00011855477538294935,
"loss": 0.3534,
"step": 112
},
{
"epoch": 1.4966887417218544,
"grad_norm": 0.08583635619816832,
"learning_rate": 0.00011702435557223987,
"loss": 0.3463,
"step": 113
},
{
"epoch": 1.5099337748344372,
"grad_norm": 0.08058809711878263,
"learning_rate": 0.00011548981802140848,
"loss": 0.3477,
"step": 114
},
{
"epoch": 1.5231788079470199,
"grad_norm": 0.09093533643868798,
"learning_rate": 0.00011395153389439233,
"loss": 0.3512,
"step": 115
},
{
"epoch": 1.5364238410596025,
"grad_norm": 0.09171376470501859,
"learning_rate": 0.00011240987526132594,
"loss": 0.3544,
"step": 116
},
{
"epoch": 1.5496688741721854,
"grad_norm": 0.08586078909940174,
"learning_rate": 0.00011086521500854745,
"loss": 0.3694,
"step": 117
},
{
"epoch": 1.5629139072847682,
"grad_norm": 0.08632019045566638,
"learning_rate": 0.00010931792674840718,
"loss": 0.3453,
"step": 118
},
{
"epoch": 1.576158940397351,
"grad_norm": 0.09269094674353331,
"learning_rate": 0.00010776838472890065,
"loss": 0.3587,
"step": 119
},
{
"epoch": 1.589403973509934,
"grad_norm": 0.08779002368050795,
"learning_rate": 0.00010621696374314807,
"loss": 0.3478,
"step": 120
},
{
"epoch": 1.6026490066225165,
"grad_norm": 0.08586261022719192,
"learning_rate": 0.00010466403903874176,
"loss": 0.341,
"step": 121
},
{
"epoch": 1.6158940397350994,
"grad_norm": 0.08611577193250892,
"learning_rate": 0.0001031099862269837,
"loss": 0.3558,
"step": 122
},
{
"epoch": 1.629139072847682,
"grad_norm": 0.09316621499512412,
"learning_rate": 0.0001015551811920351,
"loss": 0.3541,
"step": 123
},
{
"epoch": 1.6423841059602649,
"grad_norm": 0.08404147766450029,
"learning_rate": 0.0001,
"loss": 0.3489,
"step": 124
},
{
"epoch": 1.6556291390728477,
"grad_norm": 0.08524287111150772,
"learning_rate": 9.844481880796491e-05,
"loss": 0.3541,
"step": 125
},
{
"epoch": 1.6688741721854305,
"grad_norm": 0.08369196863657465,
"learning_rate": 9.689001377301633e-05,
"loss": 0.3421,
"step": 126
},
{
"epoch": 1.6821192052980134,
"grad_norm": 0.08831018354579961,
"learning_rate": 9.533596096125825e-05,
"loss": 0.3484,
"step": 127
},
{
"epoch": 1.695364238410596,
"grad_norm": 0.08931583825994703,
"learning_rate": 9.378303625685195e-05,
"loss": 0.3418,
"step": 128
},
{
"epoch": 1.7086092715231787,
"grad_norm": 0.0920976409870365,
"learning_rate": 9.223161527109937e-05,
"loss": 0.3477,
"step": 129
},
{
"epoch": 1.7218543046357615,
"grad_norm": 0.0866166191323527,
"learning_rate": 9.068207325159284e-05,
"loss": 0.3422,
"step": 130
},
{
"epoch": 1.7350993377483444,
"grad_norm": 0.08394672431065998,
"learning_rate": 8.913478499145254e-05,
"loss": 0.337,
"step": 131
},
{
"epoch": 1.7483443708609272,
"grad_norm": 0.08368403453651165,
"learning_rate": 8.759012473867407e-05,
"loss": 0.3487,
"step": 132
},
{
"epoch": 1.76158940397351,
"grad_norm": 0.08503534775674756,
"learning_rate": 8.604846610560771e-05,
"loss": 0.3463,
"step": 133
},
{
"epoch": 1.7748344370860927,
"grad_norm": 0.08495442186575057,
"learning_rate": 8.451018197859153e-05,
"loss": 0.3506,
"step": 134
},
{
"epoch": 1.7880794701986755,
"grad_norm": 0.08766338307723749,
"learning_rate": 8.297564442776014e-05,
"loss": 0.3423,
"step": 135
},
{
"epoch": 1.8013245033112582,
"grad_norm": 0.08162961612606438,
"learning_rate": 8.144522461705067e-05,
"loss": 0.3316,
"step": 136
},
{
"epoch": 1.814569536423841,
"grad_norm": 0.08852249330426205,
"learning_rate": 7.991929271442817e-05,
"loss": 0.3483,
"step": 137
},
{
"epoch": 1.8278145695364238,
"grad_norm": 0.08788889130608463,
"learning_rate": 7.839821780235168e-05,
"loss": 0.3554,
"step": 138
},
{
"epoch": 1.8410596026490067,
"grad_norm": 0.08567621661342421,
"learning_rate": 7.688236778850306e-05,
"loss": 0.3333,
"step": 139
},
{
"epoch": 1.8543046357615895,
"grad_norm": 0.09025227183243908,
"learning_rate": 7.537210931679987e-05,
"loss": 0.3461,
"step": 140
},
{
"epoch": 1.8675496688741722,
"grad_norm": 0.0887176743957205,
"learning_rate": 7.386780767871397e-05,
"loss": 0.3459,
"step": 141
},
{
"epoch": 1.8807947019867548,
"grad_norm": 0.08665996940712498,
"learning_rate": 7.236982672491698e-05,
"loss": 0.3539,
"step": 142
},
{
"epoch": 1.8940397350993377,
"grad_norm": 0.08608862013105582,
"learning_rate": 7.087852877727481e-05,
"loss": 0.3418,
"step": 143
},
{
"epoch": 1.9072847682119205,
"grad_norm": 0.08420947731369693,
"learning_rate": 6.939427454121128e-05,
"loss": 0.3385,
"step": 144
},
{
"epoch": 1.9205298013245033,
"grad_norm": 0.08687771570570416,
"learning_rate": 6.791742301846326e-05,
"loss": 0.3484,
"step": 145
},
{
"epoch": 1.9337748344370862,
"grad_norm": 0.09001811775951214,
"learning_rate": 6.644833142024751e-05,
"loss": 0.3482,
"step": 146
},
{
"epoch": 1.9470198675496688,
"grad_norm": 0.08461468347282106,
"learning_rate": 6.498735508086093e-05,
"loss": 0.3384,
"step": 147
},
{
"epoch": 1.9602649006622517,
"grad_norm": 0.08353611993941902,
"learning_rate": 6.35348473717345e-05,
"loss": 0.343,
"step": 148
},
{
"epoch": 1.9735099337748343,
"grad_norm": 0.0834738694275141,
"learning_rate": 6.209115961596208e-05,
"loss": 0.3431,
"step": 149
},
{
"epoch": 1.9867549668874172,
"grad_norm": 0.08599845820919347,
"learning_rate": 6.065664100332478e-05,
"loss": 0.3381,
"step": 150
},
{
"epoch": 2.0,
"grad_norm": 0.08781968968497832,
"learning_rate": 5.923163850583113e-05,
"loss": 0.3361,
"step": 151
},
{
"epoch": 2.0,
"eval_loss": 0.35156726837158203,
"eval_runtime": 38.8035,
"eval_samples_per_second": 26.08,
"eval_steps_per_second": 0.825,
"step": 151
},
{
"epoch": 2.013245033112583,
"grad_norm": 0.08189131042429836,
"learning_rate": 5.781649679379378e-05,
"loss": 0.3168,
"step": 152
},
{
"epoch": 2.0264900662251657,
"grad_norm": 0.08590965338671859,
"learning_rate": 5.6411558152462894e-05,
"loss": 0.3327,
"step": 153
},
{
"epoch": 2.0397350993377485,
"grad_norm": 0.08632653329140866,
"learning_rate": 5.501716239923642e-05,
"loss": 0.331,
"step": 154
},
{
"epoch": 2.052980132450331,
"grad_norm": 0.08516842826462703,
"learning_rate": 5.363364680146725e-05,
"loss": 0.3306,
"step": 155
},
{
"epoch": 2.066225165562914,
"grad_norm": 0.08496401039658237,
"learning_rate": 5.226134599488728e-05,
"loss": 0.3248,
"step": 156
},
{
"epoch": 2.0794701986754967,
"grad_norm": 0.08826525390483432,
"learning_rate": 5.090059190266779e-05,
"loss": 0.3308,
"step": 157
},
{
"epoch": 2.0927152317880795,
"grad_norm": 0.08487280637626197,
"learning_rate": 4.955171365513603e-05,
"loss": 0.3211,
"step": 158
},
{
"epoch": 2.1059602649006623,
"grad_norm": 0.09382764910639449,
"learning_rate": 4.821503751016746e-05,
"loss": 0.3354,
"step": 159
},
{
"epoch": 2.119205298013245,
"grad_norm": 0.08732672940741114,
"learning_rate": 4.689088677427249e-05,
"loss": 0.3315,
"step": 160
},
{
"epoch": 2.1324503311258276,
"grad_norm": 0.09541697755263766,
"learning_rate": 4.5579581724397255e-05,
"loss": 0.3373,
"step": 161
},
{
"epoch": 2.1456953642384105,
"grad_norm": 0.08867554361971618,
"learning_rate": 4.428143953045717e-05,
"loss": 0.3383,
"step": 162
},
{
"epoch": 2.1589403973509933,
"grad_norm": 0.09288456090060858,
"learning_rate": 4.2996774178621736e-05,
"loss": 0.331,
"step": 163
},
{
"epoch": 2.172185430463576,
"grad_norm": 0.08808813047917079,
"learning_rate": 4.172589639536991e-05,
"loss": 0.3223,
"step": 164
},
{
"epoch": 2.185430463576159,
"grad_norm": 0.09275105554751231,
"learning_rate": 4.046911357233343e-05,
"loss": 0.3301,
"step": 165
},
{
"epoch": 2.198675496688742,
"grad_norm": 0.09353735027294084,
"learning_rate": 3.922672969194686e-05,
"loss": 0.3295,
"step": 166
},
{
"epoch": 2.2119205298013247,
"grad_norm": 0.09234588799290942,
"learning_rate": 3.79990452539225e-05,
"loss": 0.3214,
"step": 167
},
{
"epoch": 2.225165562913907,
"grad_norm": 0.09179773375765557,
"learning_rate": 3.678635720256737e-05,
"loss": 0.3241,
"step": 168
},
{
"epoch": 2.23841059602649,
"grad_norm": 0.08971692725792768,
"learning_rate": 3.558895885496023e-05,
"loss": 0.3175,
"step": 169
},
{
"epoch": 2.251655629139073,
"grad_norm": 0.08939100980866099,
"learning_rate": 3.440713983000601e-05,
"loss": 0.3252,
"step": 170
},
{
"epoch": 2.2649006622516556,
"grad_norm": 0.09306831321980909,
"learning_rate": 3.324118597838464e-05,
"loss": 0.3225,
"step": 171
},
{
"epoch": 2.2781456953642385,
"grad_norm": 0.09091774211009096,
"learning_rate": 3.209137931341143e-05,
"loss": 0.3215,
"step": 172
},
{
"epoch": 2.2913907284768213,
"grad_norm": 0.08998835153295978,
"learning_rate": 3.0957997942825336e-05,
"loss": 0.3332,
"step": 173
},
{
"epoch": 2.304635761589404,
"grad_norm": 0.08999871518726542,
"learning_rate": 2.9841316001522347e-05,
"loss": 0.3265,
"step": 174
},
{
"epoch": 2.3178807947019866,
"grad_norm": 0.08874688997641272,
"learning_rate": 2.874160358524931e-05,
"loss": 0.328,
"step": 175
},
{
"epoch": 2.3311258278145695,
"grad_norm": 0.08979245895359222,
"learning_rate": 2.7659126685275027e-05,
"loss": 0.3288,
"step": 176
},
{
"epoch": 2.3443708609271523,
"grad_norm": 0.09322170086883196,
"learning_rate": 2.659414712405398e-05,
"loss": 0.3264,
"step": 177
},
{
"epoch": 2.357615894039735,
"grad_norm": 0.0873785964065595,
"learning_rate": 2.5546922491898495e-05,
"loss": 0.3283,
"step": 178
},
{
"epoch": 2.370860927152318,
"grad_norm": 0.09137697607964013,
"learning_rate": 2.451770608467432e-05,
"loss": 0.3265,
"step": 179
},
{
"epoch": 2.384105960264901,
"grad_norm": 0.08934971281847022,
"learning_rate": 2.3506746842535242e-05,
"loss": 0.3197,
"step": 180
},
{
"epoch": 2.3973509933774833,
"grad_norm": 0.09226380851297578,
"learning_rate": 2.251428928971102e-05,
"loss": 0.3303,
"step": 181
},
{
"epoch": 2.410596026490066,
"grad_norm": 0.08813038828978075,
"learning_rate": 2.1540573475363402e-05,
"loss": 0.3147,
"step": 182
},
{
"epoch": 2.423841059602649,
"grad_norm": 0.09148478249319783,
"learning_rate": 2.058583491552465e-05,
"loss": 0.3304,
"step": 183
},
{
"epoch": 2.437086092715232,
"grad_norm": 0.08970976007155415,
"learning_rate": 1.9650304536132426e-05,
"loss": 0.3142,
"step": 184
},
{
"epoch": 2.4503311258278146,
"grad_norm": 0.0914061480835884,
"learning_rate": 1.8734208617174988e-05,
"loss": 0.3332,
"step": 185
},
{
"epoch": 2.4635761589403975,
"grad_norm": 0.09223482668849642,
"learning_rate": 1.783776873795994e-05,
"loss": 0.3235,
"step": 186
},
{
"epoch": 2.47682119205298,
"grad_norm": 0.09218058790384615,
"learning_rate": 1.696120172352025e-05,
"loss": 0.3281,
"step": 187
},
{
"epoch": 2.4900662251655628,
"grad_norm": 0.09120288324314661,
"learning_rate": 1.6104719592169902e-05,
"loss": 0.323,
"step": 188
},
{
"epoch": 2.5033112582781456,
"grad_norm": 0.09425838170079778,
"learning_rate": 1.526852950422226e-05,
"loss": 0.3214,
"step": 189
},
{
"epoch": 2.5165562913907285,
"grad_norm": 0.09259911612664488,
"learning_rate": 1.4452833711883628e-05,
"loss": 0.3172,
"step": 190
},
{
"epoch": 2.5298013245033113,
"grad_norm": 0.08967866399346999,
"learning_rate": 1.3657829510333654e-05,
"loss": 0.314,
"step": 191
},
{
"epoch": 2.543046357615894,
"grad_norm": 0.09263981141490185,
"learning_rate": 1.2883709190004955e-05,
"loss": 0.3306,
"step": 192
},
{
"epoch": 2.556291390728477,
"grad_norm": 0.0924041757651034,
"learning_rate": 1.2130659990073146e-05,
"loss": 0.3238,
"step": 193
},
{
"epoch": 2.5695364238410594,
"grad_norm": 0.08680414784000516,
"learning_rate": 1.1398864053168534e-05,
"loss": 0.3172,
"step": 194
},
{
"epoch": 2.5827814569536423,
"grad_norm": 0.08927214818010673,
"learning_rate": 1.0688498381320855e-05,
"loss": 0.3148,
"step": 195
},
{
"epoch": 2.596026490066225,
"grad_norm": 0.09039528377033235,
"learning_rate": 9.999734793146998e-06,
"loss": 0.3212,
"step": 196
},
{
"epoch": 2.609271523178808,
"grad_norm": 0.08907654916187858,
"learning_rate": 9.332739882292752e-06,
"loss": 0.3124,
"step": 197
},
{
"epoch": 2.622516556291391,
"grad_norm": 0.09035973348094353,
"learning_rate": 8.687674977138116e-06,
"loss": 0.3246,
"step": 198
},
{
"epoch": 2.6357615894039736,
"grad_norm": 0.08737713823497803,
"learning_rate": 8.064696101776358e-06,
"loss": 0.3143,
"step": 199
},
{
"epoch": 2.6490066225165565,
"grad_norm": 0.08814135175802748,
"learning_rate": 7.463953938275858e-06,
"loss": 0.3094,
"step": 200
},
{
"epoch": 2.662251655629139,
"grad_norm": 0.08889240634697596,
"learning_rate": 6.8855937902340576e-06,
"loss": 0.3214,
"step": 201
},
{
"epoch": 2.6754966887417218,
"grad_norm": 0.09012485234682949,
"learning_rate": 6.329755547632499e-06,
"loss": 0.3169,
"step": 202
},
{
"epoch": 2.6887417218543046,
"grad_norm": 0.09076602960863962,
"learning_rate": 5.7965736530010916e-06,
"loss": 0.3218,
"step": 203
},
{
"epoch": 2.7019867549668874,
"grad_norm": 0.09128692637997875,
"learning_rate": 5.286177068899989e-06,
"loss": 0.3224,
"step": 204
},
{
"epoch": 2.7152317880794703,
"grad_norm": 0.08980696390068593,
"learning_rate": 4.798689246727006e-06,
"loss": 0.3255,
"step": 205
},
{
"epoch": 2.7284768211920527,
"grad_norm": 0.08721555286082,
"learning_rate": 4.3342280968580285e-06,
"loss": 0.3056,
"step": 206
},
{
"epoch": 2.741721854304636,
"grad_norm": 0.09013962844918878,
"learning_rate": 3.892905960127546e-06,
"loss": 0.3198,
"step": 207
},
{
"epoch": 2.7549668874172184,
"grad_norm": 0.09102568370124482,
"learning_rate": 3.4748295806564356e-06,
"loss": 0.3192,
"step": 208
},
{
"epoch": 2.7682119205298013,
"grad_norm": 0.09384836363080047,
"learning_rate": 3.0801000800333877e-06,
"loss": 0.3269,
"step": 209
},
{
"epoch": 2.781456953642384,
"grad_norm": 0.09126268422899254,
"learning_rate": 2.708812932856253e-06,
"loss": 0.3302,
"step": 210
},
{
"epoch": 2.794701986754967,
"grad_norm": 0.08781813338797502,
"learning_rate": 2.3610579436393e-06,
"loss": 0.3272,
"step": 211
},
{
"epoch": 2.80794701986755,
"grad_norm": 0.09110065248669541,
"learning_rate": 2.036919225091827e-06,
"loss": 0.3206,
"step": 212
},
{
"epoch": 2.821192052980132,
"grad_norm": 0.09086421544518553,
"learning_rate": 1.7364751777736332e-06,
"loss": 0.3245,
"step": 213
},
{
"epoch": 2.8344370860927155,
"grad_norm": 0.08855581117736014,
"learning_rate": 1.459798471131868e-06,
"loss": 0.3118,
"step": 214
},
{
"epoch": 2.847682119205298,
"grad_norm": 0.08936995804191887,
"learning_rate": 1.2069560259243328e-06,
"loss": 0.3215,
"step": 215
},
{
"epoch": 2.8609271523178808,
"grad_norm": 0.0921595910113618,
"learning_rate": 9.780089980330642e-07,
"loss": 0.3174,
"step": 216
},
{
"epoch": 2.8741721854304636,
"grad_norm": 0.08711718437070236,
"learning_rate": 7.730127636723539e-07,
"loss": 0.3177,
"step": 217
},
{
"epoch": 2.8874172185430464,
"grad_norm": 0.09131775721484407,
"learning_rate": 5.920169059947411e-07,
"loss": 0.3232,
"step": 218
},
{
"epoch": 2.9006622516556293,
"grad_norm": 0.08947994407470564,
"learning_rate": 4.3506520309813947e-07,
"loss": 0.3204,
"step": 219
},
{
"epoch": 2.9139072847682117,
"grad_norm": 0.08743216843583222,
"learning_rate": 3.0219561743707326e-07,
"loss": 0.3231,
"step": 220
},
{
"epoch": 2.9271523178807946,
"grad_norm": 0.09204563273581286,
"learning_rate": 1.9344028664056713e-07,
"loss": 0.3206,
"step": 221
},
{
"epoch": 2.9403973509933774,
"grad_norm": 0.08928755161531188,
"learning_rate": 1.0882551573891953e-07,
"loss": 0.3258,
"step": 222
},
{
"epoch": 2.9536423841059603,
"grad_norm": 0.09055680073868443,
"learning_rate": 4.837177080119215e-08,
"loss": 0.3207,
"step": 223
},
{
"epoch": 2.966887417218543,
"grad_norm": 0.0882029082304654,
"learning_rate": 1.209367398504746e-08,
"loss": 0.314,
"step": 224
},
{
"epoch": 2.980132450331126,
"grad_norm": 0.09307741342290024,
"learning_rate": 0.0,
"loss": 0.3346,
"step": 225
},
{
"epoch": 2.980132450331126,
"eval_loss": 0.3478808104991913,
"eval_runtime": 37.4367,
"eval_samples_per_second": 27.032,
"eval_steps_per_second": 0.855,
"step": 225
},
{
"epoch": 2.980132450331126,
"step": 225,
"total_flos": 1.002324572158034e+17,
"train_loss": 0.3962253777186076,
"train_runtime": 3220.2895,
"train_samples_per_second": 8.951,
"train_steps_per_second": 0.07
}
],
"logging_steps": 1,
"max_steps": 225,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.002324572158034e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}