{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.980132450331126,
"eval_steps": 500,
"global_step": 225,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013245033112582781,
"grad_norm": 0.8096176088552035,
"learning_rate": 8.695652173913044e-06,
"loss": 1.2541,
"step": 1
},
{
"epoch": 0.026490066225165563,
"grad_norm": 0.8050822017472643,
"learning_rate": 1.739130434782609e-05,
"loss": 1.227,
"step": 2
},
{
"epoch": 0.039735099337748346,
"grad_norm": 0.7944772711887119,
"learning_rate": 2.608695652173913e-05,
"loss": 1.2415,
"step": 3
},
{
"epoch": 0.052980132450331126,
"grad_norm": 0.7598134845438774,
"learning_rate": 3.478260869565218e-05,
"loss": 1.1949,
"step": 4
},
{
"epoch": 0.06622516556291391,
"grad_norm": 0.7683127560022982,
"learning_rate": 4.347826086956522e-05,
"loss": 1.2093,
"step": 5
},
{
"epoch": 0.07947019867549669,
"grad_norm": 0.5344525931760804,
"learning_rate": 5.217391304347826e-05,
"loss": 1.1036,
"step": 6
},
{
"epoch": 0.09271523178807947,
"grad_norm": 0.4587044664340658,
"learning_rate": 6.086956521739131e-05,
"loss": 1.0166,
"step": 7
},
{
"epoch": 0.10596026490066225,
"grad_norm": 0.4868625164917359,
"learning_rate": 6.956521739130436e-05,
"loss": 0.955,
"step": 8
},
{
"epoch": 0.11920529801324503,
"grad_norm": 0.5418471125188639,
"learning_rate": 7.82608695652174e-05,
"loss": 0.8997,
"step": 9
},
{
"epoch": 0.13245033112582782,
"grad_norm": 0.5223349521251892,
"learning_rate": 8.695652173913044e-05,
"loss": 0.8113,
"step": 10
},
{
"epoch": 0.1456953642384106,
"grad_norm": 0.4786982568033246,
"learning_rate": 9.565217391304348e-05,
"loss": 0.7325,
"step": 11
},
{
"epoch": 0.15894039735099338,
"grad_norm": 0.46957216029807536,
"learning_rate": 0.00010434782608695653,
"loss": 0.6606,
"step": 12
},
{
"epoch": 0.17218543046357615,
"grad_norm": 0.38029367288689914,
"learning_rate": 0.00011304347826086956,
"loss": 0.5808,
"step": 13
},
{
"epoch": 0.18543046357615894,
"grad_norm": 0.24720582418095602,
"learning_rate": 0.00012173913043478263,
"loss": 0.5613,
"step": 14
},
{
"epoch": 0.1986754966887417,
"grad_norm": 0.23099067802861695,
"learning_rate": 0.00013043478260869567,
"loss": 0.5391,
"step": 15
},
{
"epoch": 0.2119205298013245,
"grad_norm": 0.20957820248410008,
"learning_rate": 0.0001391304347826087,
"loss": 0.539,
"step": 16
},
{
"epoch": 0.2251655629139073,
"grad_norm": 0.21711931182463448,
"learning_rate": 0.00014782608695652173,
"loss": 0.5268,
"step": 17
},
{
"epoch": 0.23841059602649006,
"grad_norm": 0.1951790595421549,
"learning_rate": 0.0001565217391304348,
"loss": 0.4963,
"step": 18
},
{
"epoch": 0.25165562913907286,
"grad_norm": 0.1826409685431601,
"learning_rate": 0.00016521739130434784,
"loss": 0.4952,
"step": 19
},
{
"epoch": 0.26490066225165565,
"grad_norm": 0.14373385619543355,
"learning_rate": 0.00017391304347826088,
"loss": 0.4837,
"step": 20
},
{
"epoch": 0.2781456953642384,
"grad_norm": 0.12173908533781636,
"learning_rate": 0.00018260869565217392,
"loss": 0.4634,
"step": 21
},
{
"epoch": 0.2913907284768212,
"grad_norm": 0.12297735060498352,
"learning_rate": 0.00019130434782608697,
"loss": 0.4573,
"step": 22
},
{
"epoch": 0.304635761589404,
"grad_norm": 0.10994270746188307,
"learning_rate": 0.0002,
"loss": 0.4683,
"step": 23
},
{
"epoch": 0.31788079470198677,
"grad_norm": 0.11351044281096902,
"learning_rate": 0.00019998790632601496,
"loss": 0.4322,
"step": 24
},
{
"epoch": 0.33112582781456956,
"grad_norm": 0.11243087776192183,
"learning_rate": 0.00019995162822919883,
"loss": 0.4516,
"step": 25
},
{
"epoch": 0.3443708609271523,
"grad_norm": 0.11510175208476785,
"learning_rate": 0.00019989117448426108,
"loss": 0.4499,
"step": 26
},
{
"epoch": 0.3576158940397351,
"grad_norm": 0.11693433753737806,
"learning_rate": 0.00019980655971335945,
"loss": 0.4542,
"step": 27
},
{
"epoch": 0.3708609271523179,
"grad_norm": 0.11467246423231502,
"learning_rate": 0.00019969780438256293,
"loss": 0.4337,
"step": 28
},
{
"epoch": 0.3841059602649007,
"grad_norm": 0.11115653137915112,
"learning_rate": 0.0001995649347969019,
"loss": 0.4263,
"step": 29
},
{
"epoch": 0.3973509933774834,
"grad_norm": 0.11024786542483019,
"learning_rate": 0.00019940798309400526,
"loss": 0.4342,
"step": 30
},
{
"epoch": 0.4105960264900662,
"grad_norm": 0.10312580553142063,
"learning_rate": 0.00019922698723632767,
"loss": 0.4267,
"step": 31
},
{
"epoch": 0.423841059602649,
"grad_norm": 0.11074151337400631,
"learning_rate": 0.00019902199100196697,
"loss": 0.4286,
"step": 32
},
{
"epoch": 0.4370860927152318,
"grad_norm": 0.09029943151079976,
"learning_rate": 0.0001987930439740757,
"loss": 0.4152,
"step": 33
},
{
"epoch": 0.4503311258278146,
"grad_norm": 0.09101826700354056,
"learning_rate": 0.00019854020152886814,
"loss": 0.4313,
"step": 34
},
{
"epoch": 0.46357615894039733,
"grad_norm": 0.0914630983642065,
"learning_rate": 0.00019826352482222638,
"loss": 0.4117,
"step": 35
},
{
"epoch": 0.4768211920529801,
"grad_norm": 0.09219697877770537,
"learning_rate": 0.00019796308077490817,
"loss": 0.4175,
"step": 36
},
{
"epoch": 0.4900662251655629,
"grad_norm": 0.08852002864296264,
"learning_rate": 0.00019763894205636072,
"loss": 0.4041,
"step": 37
},
{
"epoch": 0.5033112582781457,
"grad_norm": 0.08580676378486166,
"learning_rate": 0.00019729118706714375,
"loss": 0.404,
"step": 38
},
{
"epoch": 0.5165562913907285,
"grad_norm": 0.08598698501328113,
"learning_rate": 0.00019691989991996663,
"loss": 0.4087,
"step": 39
},
{
"epoch": 0.5298013245033113,
"grad_norm": 0.08961053716539952,
"learning_rate": 0.00019652517041934356,
"loss": 0.4014,
"step": 40
},
{
"epoch": 0.543046357615894,
"grad_norm": 0.08443482401797175,
"learning_rate": 0.00019610709403987246,
"loss": 0.4137,
"step": 41
},
{
"epoch": 0.5562913907284768,
"grad_norm": 0.08466021640310874,
"learning_rate": 0.00019566577190314197,
"loss": 0.4071,
"step": 42
},
{
"epoch": 0.5695364238410596,
"grad_norm": 0.08784527020927076,
"learning_rate": 0.00019520131075327298,
"loss": 0.4061,
"step": 43
},
{
"epoch": 0.5827814569536424,
"grad_norm": 0.08325332082087357,
"learning_rate": 0.00019471382293110003,
"loss": 0.3957,
"step": 44
},
{
"epoch": 0.5960264900662252,
"grad_norm": 0.08614805595781429,
"learning_rate": 0.0001942034263469989,
"loss": 0.4053,
"step": 45
},
{
"epoch": 0.609271523178808,
"grad_norm": 0.07902174863469037,
"learning_rate": 0.00019367024445236754,
"loss": 0.3987,
"step": 46
},
{
"epoch": 0.6225165562913907,
"grad_norm": 0.08133695710941313,
"learning_rate": 0.00019311440620976597,
"loss": 0.3942,
"step": 47
},
{
"epoch": 0.6357615894039735,
"grad_norm": 0.08276360028919133,
"learning_rate": 0.00019253604606172417,
"loss": 0.3951,
"step": 48
},
{
"epoch": 0.6490066225165563,
"grad_norm": 0.08194802489692825,
"learning_rate": 0.00019193530389822363,
"loss": 0.3917,
"step": 49
},
{
"epoch": 0.6622516556291391,
"grad_norm": 0.08159974959706186,
"learning_rate": 0.00019131232502286188,
"loss": 0.3934,
"step": 50
},
{
"epoch": 0.6754966887417219,
"grad_norm": 0.08170998905157066,
"learning_rate": 0.00019066726011770726,
"loss": 0.3851,
"step": 51
},
{
"epoch": 0.6887417218543046,
"grad_norm": 0.08020907094953274,
"learning_rate": 0.00019000026520685302,
"loss": 0.3893,
"step": 52
},
{
"epoch": 0.7019867549668874,
"grad_norm": 0.08034981466771474,
"learning_rate": 0.00018931150161867916,
"loss": 0.381,
"step": 53
},
{
"epoch": 0.7152317880794702,
"grad_norm": 0.08444845993593682,
"learning_rate": 0.00018860113594683148,
"loss": 0.3915,
"step": 54
},
{
"epoch": 0.7284768211920529,
"grad_norm": 0.08015215412606266,
"learning_rate": 0.00018786934000992688,
"loss": 0.3833,
"step": 55
},
{
"epoch": 0.7417218543046358,
"grad_norm": 0.08464858931007045,
"learning_rate": 0.00018711629080999504,
"loss": 0.3826,
"step": 56
},
{
"epoch": 0.7549668874172185,
"grad_norm": 0.08291520407405459,
"learning_rate": 0.00018634217048966637,
"loss": 0.3738,
"step": 57
},
{
"epoch": 0.7682119205298014,
"grad_norm": 0.08660040487398858,
"learning_rate": 0.0001855471662881164,
"loss": 0.3856,
"step": 58
},
{
"epoch": 0.7814569536423841,
"grad_norm": 0.0857196214995308,
"learning_rate": 0.00018473147049577774,
"loss": 0.3779,
"step": 59
},
{
"epoch": 0.7947019867549668,
"grad_norm": 0.07987880371713715,
"learning_rate": 0.00018389528040783012,
"loss": 0.3766,
"step": 60
},
{
"epoch": 0.8079470198675497,
"grad_norm": 0.08369440099668185,
"learning_rate": 0.00018303879827647975,
"loss": 0.3835,
"step": 61
},
{
"epoch": 0.8211920529801324,
"grad_norm": 0.08373532556639413,
"learning_rate": 0.00018216223126204007,
"loss": 0.3745,
"step": 62
},
{
"epoch": 0.8344370860927153,
"grad_norm": 0.08073536197157054,
"learning_rate": 0.00018126579138282503,
"loss": 0.3687,
"step": 63
},
{
"epoch": 0.847682119205298,
"grad_norm": 0.08284465509601228,
"learning_rate": 0.00018034969546386757,
"loss": 0.3787,
"step": 64
},
{
"epoch": 0.8609271523178808,
"grad_norm": 0.0842934427371451,
"learning_rate": 0.00017941416508447536,
"loss": 0.3873,
"step": 65
},
{
"epoch": 0.8741721854304636,
"grad_norm": 0.08355593713327628,
"learning_rate": 0.0001784594265246366,
"loss": 0.3778,
"step": 66
},
{
"epoch": 0.8874172185430463,
"grad_norm": 0.08950539941436171,
"learning_rate": 0.000177485710710289,
"loss": 0.3727,
"step": 67
},
{
"epoch": 0.9006622516556292,
"grad_norm": 0.08710263548451828,
"learning_rate": 0.00017649325315746478,
"loss": 0.3808,
"step": 68
},
{
"epoch": 0.9139072847682119,
"grad_norm": 0.0887614198652171,
"learning_rate": 0.00017548229391532572,
"loss": 0.3789,
"step": 69
},
{
"epoch": 0.9271523178807947,
"grad_norm": 0.08666661250569707,
"learning_rate": 0.0001744530775081015,
"loss": 0.3732,
"step": 70
},
{
"epoch": 0.9403973509933775,
"grad_norm": 0.0849525268450149,
"learning_rate": 0.00017340585287594604,
"loss": 0.3712,
"step": 71
},
{
"epoch": 0.9536423841059603,
"grad_norm": 0.08625788315304235,
"learning_rate": 0.00017234087331472497,
"loss": 0.3597,
"step": 72
},
{
"epoch": 0.9668874172185431,
"grad_norm": 0.07851130512605926,
"learning_rate": 0.00017125839641475072,
"loss": 0.3639,
"step": 73
},
{
"epoch": 0.9801324503311258,
"grad_norm": 0.08964240238751611,
"learning_rate": 0.00017015868399847768,
"loss": 0.3844,
"step": 74
},
{
"epoch": 0.9933774834437086,
"grad_norm": 0.08516340365396252,
"learning_rate": 0.0001690420020571747,
"loss": 0.372,
"step": 75
},
{
"epoch": 0.9933774834437086,
"eval_loss": 0.3703567683696747,
"eval_runtime": 46.123,
"eval_samples_per_second": 21.941,
"eval_steps_per_second": 0.694,
"step": 75
},
{
"epoch": 1.0066225165562914,
"grad_norm": 0.07944382362889917,
"learning_rate": 0.0001679086206865886,
"loss": 0.3697,
"step": 76
},
{
"epoch": 1.0198675496688743,
"grad_norm": 0.08265930361903498,
"learning_rate": 0.00016675881402161536,
"loss": 0.3551,
"step": 77
},
{
"epoch": 1.033112582781457,
"grad_norm": 0.08703614399996357,
"learning_rate": 0.000165592860169994,
"loss": 0.3442,
"step": 78
},
{
"epoch": 1.0463576158940397,
"grad_norm": 0.08916319509375828,
"learning_rate": 0.0001644110411450398,
"loss": 0.365,
"step": 79
},
{
"epoch": 1.0596026490066226,
"grad_norm": 0.08703848127871557,
"learning_rate": 0.00016321364279743266,
"loss": 0.3611,
"step": 80
},
{
"epoch": 1.0728476821192052,
"grad_norm": 0.09052558000694078,
"learning_rate": 0.00016200095474607753,
"loss": 0.3615,
"step": 81
},
{
"epoch": 1.086092715231788,
"grad_norm": 0.08918100371610707,
"learning_rate": 0.0001607732703080532,
"loss": 0.342,
"step": 82
},
{
"epoch": 1.099337748344371,
"grad_norm": 0.08576575268439565,
"learning_rate": 0.0001595308864276666,
"loss": 0.3598,
"step": 83
},
{
"epoch": 1.1125827814569536,
"grad_norm": 0.08585017464402006,
"learning_rate": 0.0001582741036046301,
"loss": 0.3504,
"step": 84
},
{
"epoch": 1.1258278145695364,
"grad_norm": 0.08593452414859805,
"learning_rate": 0.00015700322582137827,
"loss": 0.3432,
"step": 85
},
{
"epoch": 1.1390728476821192,
"grad_norm": 0.08731970510720415,
"learning_rate": 0.00015571856046954285,
"loss": 0.3457,
"step": 86
},
{
"epoch": 1.152317880794702,
"grad_norm": 0.0921843418842424,
"learning_rate": 0.00015442041827560274,
"loss": 0.3507,
"step": 87
},
{
"epoch": 1.1655629139072847,
"grad_norm": 0.09651961400159455,
"learning_rate": 0.00015310911322572753,
"loss": 0.3596,
"step": 88
},
{
"epoch": 1.1788079470198676,
"grad_norm": 0.08524005048376013,
"learning_rate": 0.00015178496248983254,
"loss": 0.3554,
"step": 89
},
{
"epoch": 1.1920529801324504,
"grad_norm": 0.08859594152270273,
"learning_rate": 0.000150448286344864,
"loss": 0.3551,
"step": 90
},
{
"epoch": 1.205298013245033,
"grad_norm": 0.0924808469627539,
"learning_rate": 0.00014909940809733222,
"loss": 0.3525,
"step": 91
},
{
"epoch": 1.218543046357616,
"grad_norm": 0.08644059805052462,
"learning_rate": 0.00014773865400511272,
"loss": 0.3503,
"step": 92
},
{
"epoch": 1.2317880794701987,
"grad_norm": 0.09131894341880005,
"learning_rate": 0.00014636635319853275,
"loss": 0.3571,
"step": 93
},
{
"epoch": 1.2450331125827814,
"grad_norm": 0.08393682045402433,
"learning_rate": 0.0001449828376007636,
"loss": 0.3476,
"step": 94
},
{
"epoch": 1.2582781456953642,
"grad_norm": 0.08696313045637266,
"learning_rate": 0.00014358844184753712,
"loss": 0.3594,
"step": 95
},
{
"epoch": 1.271523178807947,
"grad_norm": 0.09458041630505085,
"learning_rate": 0.00014218350320620624,
"loss": 0.3626,
"step": 96
},
{
"epoch": 1.2847682119205297,
"grad_norm": 0.08823303635376296,
"learning_rate": 0.00014076836149416887,
"loss": 0.3499,
"step": 97
},
{
"epoch": 1.2980132450331126,
"grad_norm": 0.09294675372857181,
"learning_rate": 0.00013934335899667527,
"loss": 0.3539,
"step": 98
},
{
"epoch": 1.3112582781456954,
"grad_norm": 0.08824268036877034,
"learning_rate": 0.00013790884038403795,
"loss": 0.3514,
"step": 99
},
{
"epoch": 1.3245033112582782,
"grad_norm": 0.08535480262896947,
"learning_rate": 0.00013646515262826552,
"loss": 0.345,
"step": 100
},
{
"epoch": 1.3377483443708609,
"grad_norm": 0.08847562725166169,
"learning_rate": 0.00013501264491913906,
"loss": 0.3616,
"step": 101
},
{
"epoch": 1.3509933774834437,
"grad_norm": 0.08859058434854095,
"learning_rate": 0.0001335516685797525,
"loss": 0.3562,
"step": 102
},
{
"epoch": 1.3642384105960264,
"grad_norm": 0.08715025975746184,
"learning_rate": 0.00013208257698153677,
"loss": 0.3455,
"step": 103
},
{
"epoch": 1.3774834437086092,
"grad_norm": 0.0853594568437305,
"learning_rate": 0.00013060572545878875,
"loss": 0.346,
"step": 104
},
{
"epoch": 1.390728476821192,
"grad_norm": 0.08722491192064814,
"learning_rate": 0.00012912147122272523,
"loss": 0.3555,
"step": 105
},
{
"epoch": 1.403973509933775,
"grad_norm": 0.0871433664730764,
"learning_rate": 0.00012763017327508305,
"loss": 0.3556,
"step": 106
},
{
"epoch": 1.4172185430463577,
"grad_norm": 0.08803547541904783,
"learning_rate": 0.00012613219232128608,
"loss": 0.3534,
"step": 107
},
{
"epoch": 1.4304635761589404,
"grad_norm": 0.09122226233927531,
"learning_rate": 0.00012462789068320017,
"loss": 0.3569,
"step": 108
},
{
"epoch": 1.4437086092715232,
"grad_norm": 0.09822341257641279,
"learning_rate": 0.000123117632211497,
"loss": 0.3633,
"step": 109
},
{
"epoch": 1.4569536423841059,
"grad_norm": 0.09270090775666746,
"learning_rate": 0.00012160178219764837,
"loss": 0.3453,
"step": 110
},
{
"epoch": 1.4701986754966887,
"grad_norm": 0.08925565696630358,
"learning_rate": 0.00012008070728557186,
"loss": 0.3508,
"step": 111
},
{
"epoch": 1.4834437086092715,
"grad_norm": 0.09170653617303556,
"learning_rate": 0.00011855477538294935,
"loss": 0.3534,
"step": 112
},
{
"epoch": 1.4966887417218544,
"grad_norm": 0.08583635619816832,
"learning_rate": 0.00011702435557223987,
"loss": 0.3463,
"step": 113
},
{
"epoch": 1.5099337748344372,
"grad_norm": 0.08058809711878263,
"learning_rate": 0.00011548981802140848,
"loss": 0.3477,
"step": 114
},
{
"epoch": 1.5231788079470199,
"grad_norm": 0.09093533643868798,
"learning_rate": 0.00011395153389439233,
"loss": 0.3512,
"step": 115
},
{
"epoch": 1.5364238410596025,
"grad_norm": 0.09171376470501859,
"learning_rate": 0.00011240987526132594,
"loss": 0.3544,
"step": 116
},
{
"epoch": 1.5496688741721854,
"grad_norm": 0.08586078909940174,
"learning_rate": 0.00011086521500854745,
"loss": 0.3694,
"step": 117
},
{
"epoch": 1.5629139072847682,
"grad_norm": 0.08632019045566638,
"learning_rate": 0.00010931792674840718,
"loss": 0.3453,
"step": 118
},
{
"epoch": 1.576158940397351,
"grad_norm": 0.09269094674353331,
"learning_rate": 0.00010776838472890065,
"loss": 0.3587,
"step": 119
},
{
"epoch": 1.589403973509934,
"grad_norm": 0.08779002368050795,
"learning_rate": 0.00010621696374314807,
"loss": 0.3478,
"step": 120
},
{
"epoch": 1.6026490066225165,
"grad_norm": 0.08586261022719192,
"learning_rate": 0.00010466403903874176,
"loss": 0.341,
"step": 121
},
{
"epoch": 1.6158940397350994,
"grad_norm": 0.08611577193250892,
"learning_rate": 0.0001031099862269837,
"loss": 0.3558,
"step": 122
},
{
"epoch": 1.629139072847682,
"grad_norm": 0.09316621499512412,
"learning_rate": 0.0001015551811920351,
"loss": 0.3541,
"step": 123
},
{
"epoch": 1.6423841059602649,
"grad_norm": 0.08404147766450029,
"learning_rate": 0.0001,
"loss": 0.3489,
"step": 124
},
{
"epoch": 1.6556291390728477,
"grad_norm": 0.08524287111150772,
"learning_rate": 9.844481880796491e-05,
"loss": 0.3541,
"step": 125
},
{
"epoch": 1.6688741721854305,
"grad_norm": 0.08369196863657465,
"learning_rate": 9.689001377301633e-05,
"loss": 0.3421,
"step": 126
},
{
"epoch": 1.6821192052980134,
"grad_norm": 0.08831018354579961,
"learning_rate": 9.533596096125825e-05,
"loss": 0.3484,
"step": 127
},
{
"epoch": 1.695364238410596,
"grad_norm": 0.08931583825994703,
"learning_rate": 9.378303625685195e-05,
"loss": 0.3418,
"step": 128
},
{
"epoch": 1.7086092715231787,
"grad_norm": 0.0920976409870365,
"learning_rate": 9.223161527109937e-05,
"loss": 0.3477,
"step": 129
},
{
"epoch": 1.7218543046357615,
"grad_norm": 0.0866166191323527,
"learning_rate": 9.068207325159284e-05,
"loss": 0.3422,
"step": 130
},
{
"epoch": 1.7350993377483444,
"grad_norm": 0.08394672431065998,
"learning_rate": 8.913478499145254e-05,
"loss": 0.337,
"step": 131
},
{
"epoch": 1.7483443708609272,
"grad_norm": 0.08368403453651165,
"learning_rate": 8.759012473867407e-05,
"loss": 0.3487,
"step": 132
},
{
"epoch": 1.76158940397351,
"grad_norm": 0.08503534775674756,
"learning_rate": 8.604846610560771e-05,
"loss": 0.3463,
"step": 133
},
{
"epoch": 1.7748344370860927,
"grad_norm": 0.08495442186575057,
"learning_rate": 8.451018197859153e-05,
"loss": 0.3506,
"step": 134
},
{
"epoch": 1.7880794701986755,
"grad_norm": 0.08766338307723749,
"learning_rate": 8.297564442776014e-05,
"loss": 0.3423,
"step": 135
},
{
"epoch": 1.8013245033112582,
"grad_norm": 0.08162961612606438,
"learning_rate": 8.144522461705067e-05,
"loss": 0.3316,
"step": 136
},
{
"epoch": 1.814569536423841,
"grad_norm": 0.08852249330426205,
"learning_rate": 7.991929271442817e-05,
"loss": 0.3483,
"step": 137
},
{
"epoch": 1.8278145695364238,
"grad_norm": 0.08788889130608463,
"learning_rate": 7.839821780235168e-05,
"loss": 0.3554,
"step": 138
},
{
"epoch": 1.8410596026490067,
"grad_norm": 0.08567621661342421,
"learning_rate": 7.688236778850306e-05,
"loss": 0.3333,
"step": 139
},
{
"epoch": 1.8543046357615895,
"grad_norm": 0.09025227183243908,
"learning_rate": 7.537210931679987e-05,
"loss": 0.3461,
"step": 140
},
{
"epoch": 1.8675496688741722,
"grad_norm": 0.0887176743957205,
"learning_rate": 7.386780767871397e-05,
"loss": 0.3459,
"step": 141
},
{
"epoch": 1.8807947019867548,
"grad_norm": 0.08665996940712498,
"learning_rate": 7.236982672491698e-05,
"loss": 0.3539,
"step": 142
},
{
"epoch": 1.8940397350993377,
"grad_norm": 0.08608862013105582,
"learning_rate": 7.087852877727481e-05,
"loss": 0.3418,
"step": 143
},
{
"epoch": 1.9072847682119205,
"grad_norm": 0.08420947731369693,
"learning_rate": 6.939427454121128e-05,
"loss": 0.3385,
"step": 144
},
{
"epoch": 1.9205298013245033,
"grad_norm": 0.08687771570570416,
"learning_rate": 6.791742301846326e-05,
"loss": 0.3484,
"step": 145
},
{
"epoch": 1.9337748344370862,
"grad_norm": 0.09001811775951214,
"learning_rate": 6.644833142024751e-05,
"loss": 0.3482,
"step": 146
},
{
"epoch": 1.9470198675496688,
"grad_norm": 0.08461468347282106,
"learning_rate": 6.498735508086093e-05,
"loss": 0.3384,
"step": 147
},
{
"epoch": 1.9602649006622517,
"grad_norm": 0.08353611993941902,
"learning_rate": 6.35348473717345e-05,
"loss": 0.343,
"step": 148
},
{
"epoch": 1.9735099337748343,
"grad_norm": 0.0834738694275141,
"learning_rate": 6.209115961596208e-05,
"loss": 0.3431,
"step": 149
},
{
"epoch": 1.9867549668874172,
"grad_norm": 0.08599845820919347,
"learning_rate": 6.065664100332478e-05,
"loss": 0.3381,
"step": 150
},
{
"epoch": 2.0,
"grad_norm": 0.08781968968497832,
"learning_rate": 5.923163850583113e-05,
"loss": 0.3361,
"step": 151
},
{
"epoch": 2.0,
"eval_loss": 0.35156726837158203,
"eval_runtime": 38.8035,
"eval_samples_per_second": 26.08,
"eval_steps_per_second": 0.825,
"step": 151
},
{
"epoch": 2.013245033112583,
"grad_norm": 0.08189131042429836,
"learning_rate": 5.781649679379378e-05,
"loss": 0.3168,
"step": 152
},
{
"epoch": 2.0264900662251657,
"grad_norm": 0.08590965338671859,
"learning_rate": 5.6411558152462894e-05,
"loss": 0.3327,
"step": 153
},
{
"epoch": 2.0397350993377485,
"grad_norm": 0.08632653329140866,
"learning_rate": 5.501716239923642e-05,
"loss": 0.331,
"step": 154
},
{
"epoch": 2.052980132450331,
"grad_norm": 0.08516842826462703,
"learning_rate": 5.363364680146725e-05,
"loss": 0.3306,
"step": 155
},
{
"epoch": 2.066225165562914,
"grad_norm": 0.08496401039658237,
"learning_rate": 5.226134599488728e-05,
"loss": 0.3248,
"step": 156
},
{
"epoch": 2.0794701986754967,
"grad_norm": 0.08826525390483432,
"learning_rate": 5.090059190266779e-05,
"loss": 0.3308,
"step": 157
},
{
"epoch": 2.0927152317880795,
"grad_norm": 0.08487280637626197,
"learning_rate": 4.955171365513603e-05,
"loss": 0.3211,
"step": 158
},
{
"epoch": 2.1059602649006623,
"grad_norm": 0.09382764910639449,
"learning_rate": 4.821503751016746e-05,
"loss": 0.3354,
"step": 159
},
{
"epoch": 2.119205298013245,
"grad_norm": 0.08732672940741114,
"learning_rate": 4.689088677427249e-05,
"loss": 0.3315,
"step": 160
},
{
"epoch": 2.1324503311258276,
"grad_norm": 0.09541697755263766,
"learning_rate": 4.5579581724397255e-05,
"loss": 0.3373,
"step": 161
},
{
"epoch": 2.1456953642384105,
"grad_norm": 0.08867554361971618,
"learning_rate": 4.428143953045717e-05,
"loss": 0.3383,
"step": 162
},
{
"epoch": 2.1589403973509933,
"grad_norm": 0.09288456090060858,
"learning_rate": 4.2996774178621736e-05,
"loss": 0.331,
"step": 163
},
{
"epoch": 2.172185430463576,
"grad_norm": 0.08808813047917079,
"learning_rate": 4.172589639536991e-05,
"loss": 0.3223,
"step": 164
},
{
"epoch": 2.185430463576159,
"grad_norm": 0.09275105554751231,
"learning_rate": 4.046911357233343e-05,
"loss": 0.3301,
"step": 165
},
{
"epoch": 2.198675496688742,
"grad_norm": 0.09353735027294084,
"learning_rate": 3.922672969194686e-05,
"loss": 0.3295,
"step": 166
},
{
"epoch": 2.2119205298013247,
"grad_norm": 0.09234588799290942,
"learning_rate": 3.79990452539225e-05,
"loss": 0.3214,
"step": 167
},
{
"epoch": 2.225165562913907,
"grad_norm": 0.09179773375765557,
"learning_rate": 3.678635720256737e-05,
"loss": 0.3241,
"step": 168
},
{
"epoch": 2.23841059602649,
"grad_norm": 0.08971692725792768,
"learning_rate": 3.558895885496023e-05,
"loss": 0.3175,
"step": 169
},
{
"epoch": 2.251655629139073,
"grad_norm": 0.08939100980866099,
"learning_rate": 3.440713983000601e-05,
"loss": 0.3252,
"step": 170
},
{
"epoch": 2.2649006622516556,
"grad_norm": 0.09306831321980909,
"learning_rate": 3.324118597838464e-05,
"loss": 0.3225,
"step": 171
},
{
"epoch": 2.2781456953642385,
"grad_norm": 0.09091774211009096,
"learning_rate": 3.209137931341143e-05,
"loss": 0.3215,
"step": 172
},
{
"epoch": 2.2913907284768213,
"grad_norm": 0.08998835153295978,
"learning_rate": 3.0957997942825336e-05,
"loss": 0.3332,
"step": 173
},
{
"epoch": 2.304635761589404,
"grad_norm": 0.08999871518726542,
"learning_rate": 2.9841316001522347e-05,
"loss": 0.3265,
"step": 174
},
{
"epoch": 2.3178807947019866,
"grad_norm": 0.08874688997641272,
"learning_rate": 2.874160358524931e-05,
"loss": 0.328,
"step": 175
},
{
"epoch": 2.3311258278145695,
"grad_norm": 0.08979245895359222,
"learning_rate": 2.7659126685275027e-05,
"loss": 0.3288,
"step": 176
},
{
"epoch": 2.3443708609271523,
"grad_norm": 0.09322170086883196,
"learning_rate": 2.659414712405398e-05,
"loss": 0.3264,
"step": 177
},
{
"epoch": 2.357615894039735,
"grad_norm": 0.0873785964065595,
"learning_rate": 2.5546922491898495e-05,
"loss": 0.3283,
"step": 178
},
{
"epoch": 2.370860927152318,
"grad_norm": 0.09137697607964013,
"learning_rate": 2.451770608467432e-05,
"loss": 0.3265,
"step": 179
},
{
"epoch": 2.384105960264901,
"grad_norm": 0.08934971281847022,
"learning_rate": 2.3506746842535242e-05,
"loss": 0.3197,
"step": 180
},
{
"epoch": 2.3973509933774833,
"grad_norm": 0.09226380851297578,
"learning_rate": 2.251428928971102e-05,
"loss": 0.3303,
"step": 181
},
{
"epoch": 2.410596026490066,
"grad_norm": 0.08813038828978075,
"learning_rate": 2.1540573475363402e-05,
"loss": 0.3147,
"step": 182
},
{
"epoch": 2.423841059602649,
"grad_norm": 0.09148478249319783,
"learning_rate": 2.058583491552465e-05,
"loss": 0.3304,
"step": 183
},
{
"epoch": 2.437086092715232,
"grad_norm": 0.08970976007155415,
"learning_rate": 1.9650304536132426e-05,
"loss": 0.3142,
"step": 184
},
{
"epoch": 2.4503311258278146,
"grad_norm": 0.0914061480835884,
"learning_rate": 1.8734208617174988e-05,
"loss": 0.3332,
"step": 185
},
{
"epoch": 2.4635761589403975,
"grad_norm": 0.09223482668849642,
"learning_rate": 1.783776873795994e-05,
"loss": 0.3235,
"step": 186
},
{
"epoch": 2.47682119205298,
"grad_norm": 0.09218058790384615,
"learning_rate": 1.696120172352025e-05,
"loss": 0.3281,
"step": 187
},
{
"epoch": 2.4900662251655628,
"grad_norm": 0.09120288324314661,
"learning_rate": 1.6104719592169902e-05,
"loss": 0.323,
"step": 188
},
{
"epoch": 2.5033112582781456,
"grad_norm": 0.09425838170079778,
"learning_rate": 1.526852950422226e-05,
"loss": 0.3214,
"step": 189
},
{
"epoch": 2.5165562913907285,
"grad_norm": 0.09259911612664488,
"learning_rate": 1.4452833711883628e-05,
"loss": 0.3172,
"step": 190
},
{
"epoch": 2.5298013245033113,
"grad_norm": 0.08967866399346999,
"learning_rate": 1.3657829510333654e-05,
"loss": 0.314,
"step": 191
},
{
"epoch": 2.543046357615894,
"grad_norm": 0.09263981141490185,
"learning_rate": 1.2883709190004955e-05,
"loss": 0.3306,
"step": 192
},
{
"epoch": 2.556291390728477,
"grad_norm": 0.0924041757651034,
"learning_rate": 1.2130659990073146e-05,
"loss": 0.3238,
"step": 193
},
{
"epoch": 2.5695364238410594,
"grad_norm": 0.08680414784000516,
"learning_rate": 1.1398864053168534e-05,
"loss": 0.3172,
"step": 194
},
{
"epoch": 2.5827814569536423,
"grad_norm": 0.08927214818010673,
"learning_rate": 1.0688498381320855e-05,
"loss": 0.3148,
"step": 195
},
{
"epoch": 2.596026490066225,
"grad_norm": 0.09039528377033235,
"learning_rate": 9.999734793146998e-06,
"loss": 0.3212,
"step": 196
},
{
"epoch": 2.609271523178808,
"grad_norm": 0.08907654916187858,
"learning_rate": 9.332739882292752e-06,
"loss": 0.3124,
"step": 197
},
{
"epoch": 2.622516556291391,
"grad_norm": 0.09035973348094353,
"learning_rate": 8.687674977138116e-06,
"loss": 0.3246,
"step": 198
},
{
"epoch": 2.6357615894039736,
"grad_norm": 0.08737713823497803,
"learning_rate": 8.064696101776358e-06,
"loss": 0.3143,
"step": 199
},
{
"epoch": 2.6490066225165565,
"grad_norm": 0.08814135175802748,
"learning_rate": 7.463953938275858e-06,
"loss": 0.3094,
"step": 200
},
{
"epoch": 2.662251655629139,
"grad_norm": 0.08889240634697596,
"learning_rate": 6.8855937902340576e-06,
"loss": 0.3214,
"step": 201
},
{
"epoch": 2.6754966887417218,
"grad_norm": 0.09012485234682949,
"learning_rate": 6.329755547632499e-06,
"loss": 0.3169,
"step": 202
},
{
"epoch": 2.6887417218543046,
"grad_norm": 0.09076602960863962,
"learning_rate": 5.7965736530010916e-06,
"loss": 0.3218,
"step": 203
},
{
"epoch": 2.7019867549668874,
"grad_norm": 0.09128692637997875,
"learning_rate": 5.286177068899989e-06,
"loss": 0.3224,
"step": 204
},
{
"epoch": 2.7152317880794703,
"grad_norm": 0.08980696390068593,
"learning_rate": 4.798689246727006e-06,
"loss": 0.3255,
"step": 205
},
{
"epoch": 2.7284768211920527,
"grad_norm": 0.08721555286082,
"learning_rate": 4.3342280968580285e-06,
"loss": 0.3056,
"step": 206
},
{
"epoch": 2.741721854304636,
"grad_norm": 0.09013962844918878,
"learning_rate": 3.892905960127546e-06,
"loss": 0.3198,
"step": 207
},
{
"epoch": 2.7549668874172184,
"grad_norm": 0.09102568370124482,
"learning_rate": 3.4748295806564356e-06,
"loss": 0.3192,
"step": 208
},
{
"epoch": 2.7682119205298013,
"grad_norm": 0.09384836363080047,
"learning_rate": 3.0801000800333877e-06,
"loss": 0.3269,
"step": 209
},
{
"epoch": 2.781456953642384,
"grad_norm": 0.09126268422899254,
"learning_rate": 2.708812932856253e-06,
"loss": 0.3302,
"step": 210
},
{
"epoch": 2.794701986754967,
"grad_norm": 0.08781813338797502,
"learning_rate": 2.3610579436393e-06,
"loss": 0.3272,
"step": 211
},
{
"epoch": 2.80794701986755,
"grad_norm": 0.09110065248669541,
"learning_rate": 2.036919225091827e-06,
"loss": 0.3206,
"step": 212
},
{
"epoch": 2.821192052980132,
"grad_norm": 0.09086421544518553,
"learning_rate": 1.7364751777736332e-06,
"loss": 0.3245,
"step": 213
},
{
"epoch": 2.8344370860927155,
"grad_norm": 0.08855581117736014,
"learning_rate": 1.459798471131868e-06,
"loss": 0.3118,
"step": 214
},
{
"epoch": 2.847682119205298,
"grad_norm": 0.08936995804191887,
"learning_rate": 1.2069560259243328e-06,
"loss": 0.3215,
"step": 215
},
{
"epoch": 2.8609271523178808,
"grad_norm": 0.0921595910113618,
"learning_rate": 9.780089980330642e-07,
"loss": 0.3174,
"step": 216
},
{
"epoch": 2.8741721854304636,
"grad_norm": 0.08711718437070236,
"learning_rate": 7.730127636723539e-07,
"loss": 0.3177,
"step": 217
},
{
"epoch": 2.8874172185430464,
"grad_norm": 0.09131775721484407,
"learning_rate": 5.920169059947411e-07,
"loss": 0.3232,
"step": 218
},
{
"epoch": 2.9006622516556293,
"grad_norm": 0.08947994407470564,
"learning_rate": 4.3506520309813947e-07,
"loss": 0.3204,
"step": 219
},
{
"epoch": 2.9139072847682117,
"grad_norm": 0.08743216843583222,
"learning_rate": 3.0219561743707326e-07,
"loss": 0.3231,
"step": 220
},
{
"epoch": 2.9271523178807946,
"grad_norm": 0.09204563273581286,
"learning_rate": 1.9344028664056713e-07,
"loss": 0.3206,
"step": 221
},
{
"epoch": 2.9403973509933774,
"grad_norm": 0.08928755161531188,
"learning_rate": 1.0882551573891953e-07,
"loss": 0.3258,
"step": 222
},
{
"epoch": 2.9536423841059603,
"grad_norm": 0.09055680073868443,
"learning_rate": 4.837177080119215e-08,
"loss": 0.3207,
"step": 223
},
{
"epoch": 2.966887417218543,
"grad_norm": 0.0882029082304654,
"learning_rate": 1.209367398504746e-08,
"loss": 0.314,
"step": 224
},
{
"epoch": 2.980132450331126,
"grad_norm": 0.09307741342290024,
"learning_rate": 0.0,
"loss": 0.3346,
"step": 225
},
{
"epoch": 2.980132450331126,
"eval_loss": 0.3478808104991913,
"eval_runtime": 37.4367,
"eval_samples_per_second": 27.032,
"eval_steps_per_second": 0.855,
"step": 225
},
{
"epoch": 2.980132450331126,
"step": 225,
"total_flos": 1.002324572158034e+17,
"train_loss": 0.3962253777186076,
"train_runtime": 3220.2895,
"train_samples_per_second": 8.951,
"train_steps_per_second": 0.07
}
],
"logging_steps": 1,
"max_steps": 225,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.002324572158034e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}