{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.980132450331126, |
|
"eval_steps": 500, |
|
"global_step": 225, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.013245033112582781, |
|
"grad_norm": 0.8096176088552035, |
|
"learning_rate": 8.695652173913044e-06, |
|
"loss": 1.2541, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.026490066225165563, |
|
"grad_norm": 0.8050822017472643, |
|
"learning_rate": 1.739130434782609e-05, |
|
"loss": 1.227, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.039735099337748346, |
|
"grad_norm": 0.7944772711887119, |
|
"learning_rate": 2.608695652173913e-05, |
|
"loss": 1.2415, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.052980132450331126, |
|
"grad_norm": 0.7598134845438774, |
|
"learning_rate": 3.478260869565218e-05, |
|
"loss": 1.1949, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.06622516556291391, |
|
"grad_norm": 0.7683127560022982, |
|
"learning_rate": 4.347826086956522e-05, |
|
"loss": 1.2093, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.07947019867549669, |
|
"grad_norm": 0.5344525931760804, |
|
"learning_rate": 5.217391304347826e-05, |
|
"loss": 1.1036, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.09271523178807947, |
|
"grad_norm": 0.4587044664340658, |
|
"learning_rate": 6.086956521739131e-05, |
|
"loss": 1.0166, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.10596026490066225, |
|
"grad_norm": 0.4868625164917359, |
|
"learning_rate": 6.956521739130436e-05, |
|
"loss": 0.955, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.11920529801324503, |
|
"grad_norm": 0.5418471125188639, |
|
"learning_rate": 7.82608695652174e-05, |
|
"loss": 0.8997, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.13245033112582782, |
|
"grad_norm": 0.5223349521251892, |
|
"learning_rate": 8.695652173913044e-05, |
|
"loss": 0.8113, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.1456953642384106, |
|
"grad_norm": 0.4786982568033246, |
|
"learning_rate": 9.565217391304348e-05, |
|
"loss": 0.7325, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.15894039735099338, |
|
"grad_norm": 0.46957216029807536, |
|
"learning_rate": 0.00010434782608695653, |
|
"loss": 0.6606, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.17218543046357615, |
|
"grad_norm": 0.38029367288689914, |
|
"learning_rate": 0.00011304347826086956, |
|
"loss": 0.5808, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.18543046357615894, |
|
"grad_norm": 0.24720582418095602, |
|
"learning_rate": 0.00012173913043478263, |
|
"loss": 0.5613, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.1986754966887417, |
|
"grad_norm": 0.23099067802861695, |
|
"learning_rate": 0.00013043478260869567, |
|
"loss": 0.5391, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.2119205298013245, |
|
"grad_norm": 0.20957820248410008, |
|
"learning_rate": 0.0001391304347826087, |
|
"loss": 0.539, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.2251655629139073, |
|
"grad_norm": 0.21711931182463448, |
|
"learning_rate": 0.00014782608695652173, |
|
"loss": 0.5268, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.23841059602649006, |
|
"grad_norm": 0.1951790595421549, |
|
"learning_rate": 0.0001565217391304348, |
|
"loss": 0.4963, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.25165562913907286, |
|
"grad_norm": 0.1826409685431601, |
|
"learning_rate": 0.00016521739130434784, |
|
"loss": 0.4952, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.26490066225165565, |
|
"grad_norm": 0.14373385619543355, |
|
"learning_rate": 0.00017391304347826088, |
|
"loss": 0.4837, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.2781456953642384, |
|
"grad_norm": 0.12173908533781636, |
|
"learning_rate": 0.00018260869565217392, |
|
"loss": 0.4634, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.2913907284768212, |
|
"grad_norm": 0.12297735060498352, |
|
"learning_rate": 0.00019130434782608697, |
|
"loss": 0.4573, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.304635761589404, |
|
"grad_norm": 0.10994270746188307, |
|
"learning_rate": 0.0002, |
|
"loss": 0.4683, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.31788079470198677, |
|
"grad_norm": 0.11351044281096902, |
|
"learning_rate": 0.00019998790632601496, |
|
"loss": 0.4322, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.33112582781456956, |
|
"grad_norm": 0.11243087776192183, |
|
"learning_rate": 0.00019995162822919883, |
|
"loss": 0.4516, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.3443708609271523, |
|
"grad_norm": 0.11510175208476785, |
|
"learning_rate": 0.00019989117448426108, |
|
"loss": 0.4499, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.3576158940397351, |
|
"grad_norm": 0.11693433753737806, |
|
"learning_rate": 0.00019980655971335945, |
|
"loss": 0.4542, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.3708609271523179, |
|
"grad_norm": 0.11467246423231502, |
|
"learning_rate": 0.00019969780438256293, |
|
"loss": 0.4337, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.3841059602649007, |
|
"grad_norm": 0.11115653137915112, |
|
"learning_rate": 0.0001995649347969019, |
|
"loss": 0.4263, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.3973509933774834, |
|
"grad_norm": 0.11024786542483019, |
|
"learning_rate": 0.00019940798309400526, |
|
"loss": 0.4342, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.4105960264900662, |
|
"grad_norm": 0.10312580553142063, |
|
"learning_rate": 0.00019922698723632767, |
|
"loss": 0.4267, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.423841059602649, |
|
"grad_norm": 0.11074151337400631, |
|
"learning_rate": 0.00019902199100196697, |
|
"loss": 0.4286, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.4370860927152318, |
|
"grad_norm": 0.09029943151079976, |
|
"learning_rate": 0.0001987930439740757, |
|
"loss": 0.4152, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.4503311258278146, |
|
"grad_norm": 0.09101826700354056, |
|
"learning_rate": 0.00019854020152886814, |
|
"loss": 0.4313, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.46357615894039733, |
|
"grad_norm": 0.0914630983642065, |
|
"learning_rate": 0.00019826352482222638, |
|
"loss": 0.4117, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.4768211920529801, |
|
"grad_norm": 0.09219697877770537, |
|
"learning_rate": 0.00019796308077490817, |
|
"loss": 0.4175, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.4900662251655629, |
|
"grad_norm": 0.08852002864296264, |
|
"learning_rate": 0.00019763894205636072, |
|
"loss": 0.4041, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.5033112582781457, |
|
"grad_norm": 0.08580676378486166, |
|
"learning_rate": 0.00019729118706714375, |
|
"loss": 0.404, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.5165562913907285, |
|
"grad_norm": 0.08598698501328113, |
|
"learning_rate": 0.00019691989991996663, |
|
"loss": 0.4087, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.5298013245033113, |
|
"grad_norm": 0.08961053716539952, |
|
"learning_rate": 0.00019652517041934356, |
|
"loss": 0.4014, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.543046357615894, |
|
"grad_norm": 0.08443482401797175, |
|
"learning_rate": 0.00019610709403987246, |
|
"loss": 0.4137, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.5562913907284768, |
|
"grad_norm": 0.08466021640310874, |
|
"learning_rate": 0.00019566577190314197, |
|
"loss": 0.4071, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.5695364238410596, |
|
"grad_norm": 0.08784527020927076, |
|
"learning_rate": 0.00019520131075327298, |
|
"loss": 0.4061, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.5827814569536424, |
|
"grad_norm": 0.08325332082087357, |
|
"learning_rate": 0.00019471382293110003, |
|
"loss": 0.3957, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.5960264900662252, |
|
"grad_norm": 0.08614805595781429, |
|
"learning_rate": 0.0001942034263469989, |
|
"loss": 0.4053, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.609271523178808, |
|
"grad_norm": 0.07902174863469037, |
|
"learning_rate": 0.00019367024445236754, |
|
"loss": 0.3987, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.6225165562913907, |
|
"grad_norm": 0.08133695710941313, |
|
"learning_rate": 0.00019311440620976597, |
|
"loss": 0.3942, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.6357615894039735, |
|
"grad_norm": 0.08276360028919133, |
|
"learning_rate": 0.00019253604606172417, |
|
"loss": 0.3951, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.6490066225165563, |
|
"grad_norm": 0.08194802489692825, |
|
"learning_rate": 0.00019193530389822363, |
|
"loss": 0.3917, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.6622516556291391, |
|
"grad_norm": 0.08159974959706186, |
|
"learning_rate": 0.00019131232502286188, |
|
"loss": 0.3934, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.6754966887417219, |
|
"grad_norm": 0.08170998905157066, |
|
"learning_rate": 0.00019066726011770726, |
|
"loss": 0.3851, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.6887417218543046, |
|
"grad_norm": 0.08020907094953274, |
|
"learning_rate": 0.00019000026520685302, |
|
"loss": 0.3893, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.7019867549668874, |
|
"grad_norm": 0.08034981466771474, |
|
"learning_rate": 0.00018931150161867916, |
|
"loss": 0.381, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.7152317880794702, |
|
"grad_norm": 0.08444845993593682, |
|
"learning_rate": 0.00018860113594683148, |
|
"loss": 0.3915, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.7284768211920529, |
|
"grad_norm": 0.08015215412606266, |
|
"learning_rate": 0.00018786934000992688, |
|
"loss": 0.3833, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.7417218543046358, |
|
"grad_norm": 0.08464858931007045, |
|
"learning_rate": 0.00018711629080999504, |
|
"loss": 0.3826, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.7549668874172185, |
|
"grad_norm": 0.08291520407405459, |
|
"learning_rate": 0.00018634217048966637, |
|
"loss": 0.3738, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.7682119205298014, |
|
"grad_norm": 0.08660040487398858, |
|
"learning_rate": 0.0001855471662881164, |
|
"loss": 0.3856, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.7814569536423841, |
|
"grad_norm": 0.0857196214995308, |
|
"learning_rate": 0.00018473147049577774, |
|
"loss": 0.3779, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.7947019867549668, |
|
"grad_norm": 0.07987880371713715, |
|
"learning_rate": 0.00018389528040783012, |
|
"loss": 0.3766, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.8079470198675497, |
|
"grad_norm": 0.08369440099668185, |
|
"learning_rate": 0.00018303879827647975, |
|
"loss": 0.3835, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.8211920529801324, |
|
"grad_norm": 0.08373532556639413, |
|
"learning_rate": 0.00018216223126204007, |
|
"loss": 0.3745, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.8344370860927153, |
|
"grad_norm": 0.08073536197157054, |
|
"learning_rate": 0.00018126579138282503, |
|
"loss": 0.3687, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.847682119205298, |
|
"grad_norm": 0.08284465509601228, |
|
"learning_rate": 0.00018034969546386757, |
|
"loss": 0.3787, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.8609271523178808, |
|
"grad_norm": 0.0842934427371451, |
|
"learning_rate": 0.00017941416508447536, |
|
"loss": 0.3873, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.8741721854304636, |
|
"grad_norm": 0.08355593713327628, |
|
"learning_rate": 0.0001784594265246366, |
|
"loss": 0.3778, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.8874172185430463, |
|
"grad_norm": 0.08950539941436171, |
|
"learning_rate": 0.000177485710710289, |
|
"loss": 0.3727, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.9006622516556292, |
|
"grad_norm": 0.08710263548451828, |
|
"learning_rate": 0.00017649325315746478, |
|
"loss": 0.3808, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.9139072847682119, |
|
"grad_norm": 0.0887614198652171, |
|
"learning_rate": 0.00017548229391532572, |
|
"loss": 0.3789, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.9271523178807947, |
|
"grad_norm": 0.08666661250569707, |
|
"learning_rate": 0.0001744530775081015, |
|
"loss": 0.3732, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.9403973509933775, |
|
"grad_norm": 0.0849525268450149, |
|
"learning_rate": 0.00017340585287594604, |
|
"loss": 0.3712, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.9536423841059603, |
|
"grad_norm": 0.08625788315304235, |
|
"learning_rate": 0.00017234087331472497, |
|
"loss": 0.3597, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.9668874172185431, |
|
"grad_norm": 0.07851130512605926, |
|
"learning_rate": 0.00017125839641475072, |
|
"loss": 0.3639, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.9801324503311258, |
|
"grad_norm": 0.08964240238751611, |
|
"learning_rate": 0.00017015868399847768, |
|
"loss": 0.3844, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.9933774834437086, |
|
"grad_norm": 0.08516340365396252, |
|
"learning_rate": 0.0001690420020571747, |
|
"loss": 0.372, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.9933774834437086, |
|
"eval_loss": 0.3703567683696747, |
|
"eval_runtime": 46.123, |
|
"eval_samples_per_second": 21.941, |
|
"eval_steps_per_second": 0.694, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.0066225165562914, |
|
"grad_norm": 0.07944382362889917, |
|
"learning_rate": 0.0001679086206865886, |
|
"loss": 0.3697, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 1.0198675496688743, |
|
"grad_norm": 0.08265930361903498, |
|
"learning_rate": 0.00016675881402161536, |
|
"loss": 0.3551, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 1.033112582781457, |
|
"grad_norm": 0.08703614399996357, |
|
"learning_rate": 0.000165592860169994, |
|
"loss": 0.3442, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 1.0463576158940397, |
|
"grad_norm": 0.08916319509375828, |
|
"learning_rate": 0.0001644110411450398, |
|
"loss": 0.365, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 1.0596026490066226, |
|
"grad_norm": 0.08703848127871557, |
|
"learning_rate": 0.00016321364279743266, |
|
"loss": 0.3611, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.0728476821192052, |
|
"grad_norm": 0.09052558000694078, |
|
"learning_rate": 0.00016200095474607753, |
|
"loss": 0.3615, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 1.086092715231788, |
|
"grad_norm": 0.08918100371610707, |
|
"learning_rate": 0.0001607732703080532, |
|
"loss": 0.342, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 1.099337748344371, |
|
"grad_norm": 0.08576575268439565, |
|
"learning_rate": 0.0001595308864276666, |
|
"loss": 0.3598, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 1.1125827814569536, |
|
"grad_norm": 0.08585017464402006, |
|
"learning_rate": 0.0001582741036046301, |
|
"loss": 0.3504, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.1258278145695364, |
|
"grad_norm": 0.08593452414859805, |
|
"learning_rate": 0.00015700322582137827, |
|
"loss": 0.3432, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.1390728476821192, |
|
"grad_norm": 0.08731970510720415, |
|
"learning_rate": 0.00015571856046954285, |
|
"loss": 0.3457, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 1.152317880794702, |
|
"grad_norm": 0.0921843418842424, |
|
"learning_rate": 0.00015442041827560274, |
|
"loss": 0.3507, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 1.1655629139072847, |
|
"grad_norm": 0.09651961400159455, |
|
"learning_rate": 0.00015310911322572753, |
|
"loss": 0.3596, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 1.1788079470198676, |
|
"grad_norm": 0.08524005048376013, |
|
"learning_rate": 0.00015178496248983254, |
|
"loss": 0.3554, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 1.1920529801324504, |
|
"grad_norm": 0.08859594152270273, |
|
"learning_rate": 0.000150448286344864, |
|
"loss": 0.3551, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.205298013245033, |
|
"grad_norm": 0.0924808469627539, |
|
"learning_rate": 0.00014909940809733222, |
|
"loss": 0.3525, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 1.218543046357616, |
|
"grad_norm": 0.08644059805052462, |
|
"learning_rate": 0.00014773865400511272, |
|
"loss": 0.3503, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 1.2317880794701987, |
|
"grad_norm": 0.09131894341880005, |
|
"learning_rate": 0.00014636635319853275, |
|
"loss": 0.3571, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 1.2450331125827814, |
|
"grad_norm": 0.08393682045402433, |
|
"learning_rate": 0.0001449828376007636, |
|
"loss": 0.3476, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 1.2582781456953642, |
|
"grad_norm": 0.08696313045637266, |
|
"learning_rate": 0.00014358844184753712, |
|
"loss": 0.3594, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.271523178807947, |
|
"grad_norm": 0.09458041630505085, |
|
"learning_rate": 0.00014218350320620624, |
|
"loss": 0.3626, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.2847682119205297, |
|
"grad_norm": 0.08823303635376296, |
|
"learning_rate": 0.00014076836149416887, |
|
"loss": 0.3499, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 1.2980132450331126, |
|
"grad_norm": 0.09294675372857181, |
|
"learning_rate": 0.00013934335899667527, |
|
"loss": 0.3539, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.3112582781456954, |
|
"grad_norm": 0.08824268036877034, |
|
"learning_rate": 0.00013790884038403795, |
|
"loss": 0.3514, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.3245033112582782, |
|
"grad_norm": 0.08535480262896947, |
|
"learning_rate": 0.00013646515262826552, |
|
"loss": 0.345, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.3377483443708609, |
|
"grad_norm": 0.08847562725166169, |
|
"learning_rate": 0.00013501264491913906, |
|
"loss": 0.3616, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 1.3509933774834437, |
|
"grad_norm": 0.08859058434854095, |
|
"learning_rate": 0.0001335516685797525, |
|
"loss": 0.3562, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.3642384105960264, |
|
"grad_norm": 0.08715025975746184, |
|
"learning_rate": 0.00013208257698153677, |
|
"loss": 0.3455, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 1.3774834437086092, |
|
"grad_norm": 0.0853594568437305, |
|
"learning_rate": 0.00013060572545878875, |
|
"loss": 0.346, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.390728476821192, |
|
"grad_norm": 0.08722491192064814, |
|
"learning_rate": 0.00012912147122272523, |
|
"loss": 0.3555, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.403973509933775, |
|
"grad_norm": 0.0871433664730764, |
|
"learning_rate": 0.00012763017327508305, |
|
"loss": 0.3556, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.4172185430463577, |
|
"grad_norm": 0.08803547541904783, |
|
"learning_rate": 0.00012613219232128608, |
|
"loss": 0.3534, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.4304635761589404, |
|
"grad_norm": 0.09122226233927531, |
|
"learning_rate": 0.00012462789068320017, |
|
"loss": 0.3569, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.4437086092715232, |
|
"grad_norm": 0.09822341257641279, |
|
"learning_rate": 0.000123117632211497, |
|
"loss": 0.3633, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.4569536423841059, |
|
"grad_norm": 0.09270090775666746, |
|
"learning_rate": 0.00012160178219764837, |
|
"loss": 0.3453, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.4701986754966887, |
|
"grad_norm": 0.08925565696630358, |
|
"learning_rate": 0.00012008070728557186, |
|
"loss": 0.3508, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.4834437086092715, |
|
"grad_norm": 0.09170653617303556, |
|
"learning_rate": 0.00011855477538294935, |
|
"loss": 0.3534, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.4966887417218544, |
|
"grad_norm": 0.08583635619816832, |
|
"learning_rate": 0.00011702435557223987, |
|
"loss": 0.3463, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.5099337748344372, |
|
"grad_norm": 0.08058809711878263, |
|
"learning_rate": 0.00011548981802140848, |
|
"loss": 0.3477, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.5231788079470199, |
|
"grad_norm": 0.09093533643868798, |
|
"learning_rate": 0.00011395153389439233, |
|
"loss": 0.3512, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.5364238410596025, |
|
"grad_norm": 0.09171376470501859, |
|
"learning_rate": 0.00011240987526132594, |
|
"loss": 0.3544, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.5496688741721854, |
|
"grad_norm": 0.08586078909940174, |
|
"learning_rate": 0.00011086521500854745, |
|
"loss": 0.3694, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.5629139072847682, |
|
"grad_norm": 0.08632019045566638, |
|
"learning_rate": 0.00010931792674840718, |
|
"loss": 0.3453, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.576158940397351, |
|
"grad_norm": 0.09269094674353331, |
|
"learning_rate": 0.00010776838472890065, |
|
"loss": 0.3587, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.589403973509934, |
|
"grad_norm": 0.08779002368050795, |
|
"learning_rate": 0.00010621696374314807, |
|
"loss": 0.3478, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.6026490066225165, |
|
"grad_norm": 0.08586261022719192, |
|
"learning_rate": 0.00010466403903874176, |
|
"loss": 0.341, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.6158940397350994, |
|
"grad_norm": 0.08611577193250892, |
|
"learning_rate": 0.0001031099862269837, |
|
"loss": 0.3558, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.629139072847682, |
|
"grad_norm": 0.09316621499512412, |
|
"learning_rate": 0.0001015551811920351, |
|
"loss": 0.3541, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.6423841059602649, |
|
"grad_norm": 0.08404147766450029, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3489, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.6556291390728477, |
|
"grad_norm": 0.08524287111150772, |
|
"learning_rate": 9.844481880796491e-05, |
|
"loss": 0.3541, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.6688741721854305, |
|
"grad_norm": 0.08369196863657465, |
|
"learning_rate": 9.689001377301633e-05, |
|
"loss": 0.3421, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.6821192052980134, |
|
"grad_norm": 0.08831018354579961, |
|
"learning_rate": 9.533596096125825e-05, |
|
"loss": 0.3484, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.695364238410596, |
|
"grad_norm": 0.08931583825994703, |
|
"learning_rate": 9.378303625685195e-05, |
|
"loss": 0.3418, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.7086092715231787, |
|
"grad_norm": 0.0920976409870365, |
|
"learning_rate": 9.223161527109937e-05, |
|
"loss": 0.3477, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.7218543046357615, |
|
"grad_norm": 0.0866166191323527, |
|
"learning_rate": 9.068207325159284e-05, |
|
"loss": 0.3422, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.7350993377483444, |
|
"grad_norm": 0.08394672431065998, |
|
"learning_rate": 8.913478499145254e-05, |
|
"loss": 0.337, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.7483443708609272, |
|
"grad_norm": 0.08368403453651165, |
|
"learning_rate": 8.759012473867407e-05, |
|
"loss": 0.3487, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.76158940397351, |
|
"grad_norm": 0.08503534775674756, |
|
"learning_rate": 8.604846610560771e-05, |
|
"loss": 0.3463, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.7748344370860927, |
|
"grad_norm": 0.08495442186575057, |
|
"learning_rate": 8.451018197859153e-05, |
|
"loss": 0.3506, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.7880794701986755, |
|
"grad_norm": 0.08766338307723749, |
|
"learning_rate": 8.297564442776014e-05, |
|
"loss": 0.3423, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.8013245033112582, |
|
"grad_norm": 0.08162961612606438, |
|
"learning_rate": 8.144522461705067e-05, |
|
"loss": 0.3316, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.814569536423841, |
|
"grad_norm": 0.08852249330426205, |
|
"learning_rate": 7.991929271442817e-05, |
|
"loss": 0.3483, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.8278145695364238, |
|
"grad_norm": 0.08788889130608463, |
|
"learning_rate": 7.839821780235168e-05, |
|
"loss": 0.3554, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.8410596026490067, |
|
"grad_norm": 0.08567621661342421, |
|
"learning_rate": 7.688236778850306e-05, |
|
"loss": 0.3333, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.8543046357615895, |
|
"grad_norm": 0.09025227183243908, |
|
"learning_rate": 7.537210931679987e-05, |
|
"loss": 0.3461, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.8675496688741722, |
|
"grad_norm": 0.0887176743957205, |
|
"learning_rate": 7.386780767871397e-05, |
|
"loss": 0.3459, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.8807947019867548, |
|
"grad_norm": 0.08665996940712498, |
|
"learning_rate": 7.236982672491698e-05, |
|
"loss": 0.3539, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.8940397350993377, |
|
"grad_norm": 0.08608862013105582, |
|
"learning_rate": 7.087852877727481e-05, |
|
"loss": 0.3418, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.9072847682119205, |
|
"grad_norm": 0.08420947731369693, |
|
"learning_rate": 6.939427454121128e-05, |
|
"loss": 0.3385, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.9205298013245033, |
|
"grad_norm": 0.08687771570570416, |
|
"learning_rate": 6.791742301846326e-05, |
|
"loss": 0.3484, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.9337748344370862, |
|
"grad_norm": 0.09001811775951214, |
|
"learning_rate": 6.644833142024751e-05, |
|
"loss": 0.3482, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.9470198675496688, |
|
"grad_norm": 0.08461468347282106, |
|
"learning_rate": 6.498735508086093e-05, |
|
"loss": 0.3384, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.9602649006622517, |
|
"grad_norm": 0.08353611993941902, |
|
"learning_rate": 6.35348473717345e-05, |
|
"loss": 0.343, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.9735099337748343, |
|
"grad_norm": 0.0834738694275141, |
|
"learning_rate": 6.209115961596208e-05, |
|
"loss": 0.3431, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.9867549668874172, |
|
"grad_norm": 0.08599845820919347, |
|
"learning_rate": 6.065664100332478e-05, |
|
"loss": 0.3381, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.08781968968497832, |
|
"learning_rate": 5.923163850583113e-05, |
|
"loss": 0.3361, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.35156726837158203, |
|
"eval_runtime": 38.8035, |
|
"eval_samples_per_second": 26.08, |
|
"eval_steps_per_second": 0.825, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 2.013245033112583, |
|
"grad_norm": 0.08189131042429836, |
|
"learning_rate": 5.781649679379378e-05, |
|
"loss": 0.3168, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 2.0264900662251657, |
|
"grad_norm": 0.08590965338671859, |
|
"learning_rate": 5.6411558152462894e-05, |
|
"loss": 0.3327, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 2.0397350993377485, |
|
"grad_norm": 0.08632653329140866, |
|
"learning_rate": 5.501716239923642e-05, |
|
"loss": 0.331, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 2.052980132450331, |
|
"grad_norm": 0.08516842826462703, |
|
"learning_rate": 5.363364680146725e-05, |
|
"loss": 0.3306, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 2.066225165562914, |
|
"grad_norm": 0.08496401039658237, |
|
"learning_rate": 5.226134599488728e-05, |
|
"loss": 0.3248, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 2.0794701986754967, |
|
"grad_norm": 0.08826525390483432, |
|
"learning_rate": 5.090059190266779e-05, |
|
"loss": 0.3308, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 2.0927152317880795, |
|
"grad_norm": 0.08487280637626197, |
|
"learning_rate": 4.955171365513603e-05, |
|
"loss": 0.3211, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 2.1059602649006623, |
|
"grad_norm": 0.09382764910639449, |
|
"learning_rate": 4.821503751016746e-05, |
|
"loss": 0.3354, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 2.119205298013245, |
|
"grad_norm": 0.08732672940741114, |
|
"learning_rate": 4.689088677427249e-05, |
|
"loss": 0.3315, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.1324503311258276, |
|
"grad_norm": 0.09541697755263766, |
|
"learning_rate": 4.5579581724397255e-05, |
|
"loss": 0.3373, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 2.1456953642384105, |
|
"grad_norm": 0.08867554361971618, |
|
"learning_rate": 4.428143953045717e-05, |
|
"loss": 0.3383, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 2.1589403973509933, |
|
"grad_norm": 0.09288456090060858, |
|
"learning_rate": 4.2996774178621736e-05, |
|
"loss": 0.331, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 2.172185430463576, |
|
"grad_norm": 0.08808813047917079, |
|
"learning_rate": 4.172589639536991e-05, |
|
"loss": 0.3223, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 2.185430463576159, |
|
"grad_norm": 0.09275105554751231, |
|
"learning_rate": 4.046911357233343e-05, |
|
"loss": 0.3301, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 2.198675496688742, |
|
"grad_norm": 0.09353735027294084, |
|
"learning_rate": 3.922672969194686e-05, |
|
"loss": 0.3295, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 2.2119205298013247, |
|
"grad_norm": 0.09234588799290942, |
|
"learning_rate": 3.79990452539225e-05, |
|
"loss": 0.3214, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 2.225165562913907, |
|
"grad_norm": 0.09179773375765557, |
|
"learning_rate": 3.678635720256737e-05, |
|
"loss": 0.3241, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 2.23841059602649, |
|
"grad_norm": 0.08971692725792768, |
|
"learning_rate": 3.558895885496023e-05, |
|
"loss": 0.3175, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 2.251655629139073, |
|
"grad_norm": 0.08939100980866099, |
|
"learning_rate": 3.440713983000601e-05, |
|
"loss": 0.3252, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.2649006622516556, |
|
"grad_norm": 0.09306831321980909, |
|
"learning_rate": 3.324118597838464e-05, |
|
"loss": 0.3225, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 2.2781456953642385, |
|
"grad_norm": 0.09091774211009096, |
|
"learning_rate": 3.209137931341143e-05, |
|
"loss": 0.3215, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 2.2913907284768213, |
|
"grad_norm": 0.08998835153295978, |
|
"learning_rate": 3.0957997942825336e-05, |
|
"loss": 0.3332, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 2.304635761589404, |
|
"grad_norm": 0.08999871518726542, |
|
"learning_rate": 2.9841316001522347e-05, |
|
"loss": 0.3265, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 2.3178807947019866, |
|
"grad_norm": 0.08874688997641272, |
|
"learning_rate": 2.874160358524931e-05, |
|
"loss": 0.328, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 2.3311258278145695, |
|
"grad_norm": 0.08979245895359222, |
|
"learning_rate": 2.7659126685275027e-05, |
|
"loss": 0.3288, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 2.3443708609271523, |
|
"grad_norm": 0.09322170086883196, |
|
"learning_rate": 2.659414712405398e-05, |
|
"loss": 0.3264, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 2.357615894039735, |
|
"grad_norm": 0.0873785964065595, |
|
"learning_rate": 2.5546922491898495e-05, |
|
"loss": 0.3283, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 2.370860927152318, |
|
"grad_norm": 0.09137697607964013, |
|
"learning_rate": 2.451770608467432e-05, |
|
"loss": 0.3265, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 2.384105960264901, |
|
"grad_norm": 0.08934971281847022, |
|
"learning_rate": 2.3506746842535242e-05, |
|
"loss": 0.3197, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.3973509933774833, |
|
"grad_norm": 0.09226380851297578, |
|
"learning_rate": 2.251428928971102e-05, |
|
"loss": 0.3303, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 2.410596026490066, |
|
"grad_norm": 0.08813038828978075, |
|
"learning_rate": 2.1540573475363402e-05, |
|
"loss": 0.3147, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 2.423841059602649, |
|
"grad_norm": 0.09148478249319783, |
|
"learning_rate": 2.058583491552465e-05, |
|
"loss": 0.3304, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 2.437086092715232, |
|
"grad_norm": 0.08970976007155415, |
|
"learning_rate": 1.9650304536132426e-05, |
|
"loss": 0.3142, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 2.4503311258278146, |
|
"grad_norm": 0.0914061480835884, |
|
"learning_rate": 1.8734208617174988e-05, |
|
"loss": 0.3332, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 2.4635761589403975, |
|
"grad_norm": 0.09223482668849642, |
|
"learning_rate": 1.783776873795994e-05, |
|
"loss": 0.3235, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 2.47682119205298, |
|
"grad_norm": 0.09218058790384615, |
|
"learning_rate": 1.696120172352025e-05, |
|
"loss": 0.3281, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 2.4900662251655628, |
|
"grad_norm": 0.09120288324314661, |
|
"learning_rate": 1.6104719592169902e-05, |
|
"loss": 0.323, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 2.5033112582781456, |
|
"grad_norm": 0.09425838170079778, |
|
"learning_rate": 1.526852950422226e-05, |
|
"loss": 0.3214, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 2.5165562913907285, |
|
"grad_norm": 0.09259911612664488, |
|
"learning_rate": 1.4452833711883628e-05, |
|
"loss": 0.3172, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.5298013245033113, |
|
"grad_norm": 0.08967866399346999, |
|
"learning_rate": 1.3657829510333654e-05, |
|
"loss": 0.314, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 2.543046357615894, |
|
"grad_norm": 0.09263981141490185, |
|
"learning_rate": 1.2883709190004955e-05, |
|
"loss": 0.3306, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 2.556291390728477, |
|
"grad_norm": 0.0924041757651034, |
|
"learning_rate": 1.2130659990073146e-05, |
|
"loss": 0.3238, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 2.5695364238410594, |
|
"grad_norm": 0.08680414784000516, |
|
"learning_rate": 1.1398864053168534e-05, |
|
"loss": 0.3172, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 2.5827814569536423, |
|
"grad_norm": 0.08927214818010673, |
|
"learning_rate": 1.0688498381320855e-05, |
|
"loss": 0.3148, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 2.596026490066225, |
|
"grad_norm": 0.09039528377033235, |
|
"learning_rate": 9.999734793146998e-06, |
|
"loss": 0.3212, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 2.609271523178808, |
|
"grad_norm": 0.08907654916187858, |
|
"learning_rate": 9.332739882292752e-06, |
|
"loss": 0.3124, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 2.622516556291391, |
|
"grad_norm": 0.09035973348094353, |
|
"learning_rate": 8.687674977138116e-06, |
|
"loss": 0.3246, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 2.6357615894039736, |
|
"grad_norm": 0.08737713823497803, |
|
"learning_rate": 8.064696101776358e-06, |
|
"loss": 0.3143, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 2.6490066225165565, |
|
"grad_norm": 0.08814135175802748, |
|
"learning_rate": 7.463953938275858e-06, |
|
"loss": 0.3094, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.662251655629139, |
|
"grad_norm": 0.08889240634697596, |
|
"learning_rate": 6.8855937902340576e-06, |
|
"loss": 0.3214, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 2.6754966887417218, |
|
"grad_norm": 0.09012485234682949, |
|
"learning_rate": 6.329755547632499e-06, |
|
"loss": 0.3169, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 2.6887417218543046, |
|
"grad_norm": 0.09076602960863962, |
|
"learning_rate": 5.7965736530010916e-06, |
|
"loss": 0.3218, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 2.7019867549668874, |
|
"grad_norm": 0.09128692637997875, |
|
"learning_rate": 5.286177068899989e-06, |
|
"loss": 0.3224, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 2.7152317880794703, |
|
"grad_norm": 0.08980696390068593, |
|
"learning_rate": 4.798689246727006e-06, |
|
"loss": 0.3255, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 2.7284768211920527, |
|
"grad_norm": 0.08721555286082, |
|
"learning_rate": 4.3342280968580285e-06, |
|
"loss": 0.3056, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 2.741721854304636, |
|
"grad_norm": 0.09013962844918878, |
|
"learning_rate": 3.892905960127546e-06, |
|
"loss": 0.3198, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 2.7549668874172184, |
|
"grad_norm": 0.09102568370124482, |
|
"learning_rate": 3.4748295806564356e-06, |
|
"loss": 0.3192, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 2.7682119205298013, |
|
"grad_norm": 0.09384836363080047, |
|
"learning_rate": 3.0801000800333877e-06, |
|
"loss": 0.3269, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 2.781456953642384, |
|
"grad_norm": 0.09126268422899254, |
|
"learning_rate": 2.708812932856253e-06, |
|
"loss": 0.3302, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.794701986754967, |
|
"grad_norm": 0.08781813338797502, |
|
"learning_rate": 2.3610579436393e-06, |
|
"loss": 0.3272, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 2.80794701986755, |
|
"grad_norm": 0.09110065248669541, |
|
"learning_rate": 2.036919225091827e-06, |
|
"loss": 0.3206, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 2.821192052980132, |
|
"grad_norm": 0.09086421544518553, |
|
"learning_rate": 1.7364751777736332e-06, |
|
"loss": 0.3245, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 2.8344370860927155, |
|
"grad_norm": 0.08855581117736014, |
|
"learning_rate": 1.459798471131868e-06, |
|
"loss": 0.3118, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 2.847682119205298, |
|
"grad_norm": 0.08936995804191887, |
|
"learning_rate": 1.2069560259243328e-06, |
|
"loss": 0.3215, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 2.8609271523178808, |
|
"grad_norm": 0.0921595910113618, |
|
"learning_rate": 9.780089980330642e-07, |
|
"loss": 0.3174, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 2.8741721854304636, |
|
"grad_norm": 0.08711718437070236, |
|
"learning_rate": 7.730127636723539e-07, |
|
"loss": 0.3177, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 2.8874172185430464, |
|
"grad_norm": 0.09131775721484407, |
|
"learning_rate": 5.920169059947411e-07, |
|
"loss": 0.3232, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 2.9006622516556293, |
|
"grad_norm": 0.08947994407470564, |
|
"learning_rate": 4.3506520309813947e-07, |
|
"loss": 0.3204, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 2.9139072847682117, |
|
"grad_norm": 0.08743216843583222, |
|
"learning_rate": 3.0219561743707326e-07, |
|
"loss": 0.3231, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.9271523178807946, |
|
"grad_norm": 0.09204563273581286, |
|
"learning_rate": 1.9344028664056713e-07, |
|
"loss": 0.3206, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 2.9403973509933774, |
|
"grad_norm": 0.08928755161531188, |
|
"learning_rate": 1.0882551573891953e-07, |
|
"loss": 0.3258, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 2.9536423841059603, |
|
"grad_norm": 0.09055680073868443, |
|
"learning_rate": 4.837177080119215e-08, |
|
"loss": 0.3207, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 2.966887417218543, |
|
"grad_norm": 0.0882029082304654, |
|
"learning_rate": 1.209367398504746e-08, |
|
"loss": 0.314, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 2.980132450331126, |
|
"grad_norm": 0.09307741342290024, |
|
"learning_rate": 0.0, |
|
"loss": 0.3346, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.980132450331126, |
|
"eval_loss": 0.3478808104991913, |
|
"eval_runtime": 37.4367, |
|
"eval_samples_per_second": 27.032, |
|
"eval_steps_per_second": 0.855, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.980132450331126, |
|
"step": 225, |
|
"total_flos": 1.002324572158034e+17, |
|
"train_loss": 0.3962253777186076, |
|
"train_runtime": 3220.2895, |
|
"train_samples_per_second": 8.951, |
|
"train_steps_per_second": 0.07 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 225, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.002324572158034e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |