|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9926144756277697, |
|
"eval_steps": 57, |
|
"global_step": 338, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.005908419497784343, |
|
"grad_norm": 4.501461029052734, |
|
"learning_rate": 6.666666666666667e-07, |
|
"loss": 1.062, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.005908419497784343, |
|
"eval_loss": 1.0835397243499756, |
|
"eval_runtime": 4.3539, |
|
"eval_samples_per_second": 12.632, |
|
"eval_steps_per_second": 1.608, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.011816838995568686, |
|
"grad_norm": 4.469114303588867, |
|
"learning_rate": 1.3333333333333334e-06, |
|
"loss": 1.0268, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.01772525849335303, |
|
"grad_norm": 4.554893970489502, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 1.0401, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.023633677991137372, |
|
"grad_norm": 4.374792575836182, |
|
"learning_rate": 2.666666666666667e-06, |
|
"loss": 1.0423, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.029542097488921712, |
|
"grad_norm": 3.4377498626708984, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 0.9965, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.03545051698670606, |
|
"grad_norm": 3.1242499351501465, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.9479, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0413589364844904, |
|
"grad_norm": 1.8368685245513916, |
|
"learning_rate": 4.666666666666667e-06, |
|
"loss": 0.8296, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.047267355982274745, |
|
"grad_norm": 1.7457680702209473, |
|
"learning_rate": 5.333333333333334e-06, |
|
"loss": 0.8159, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.053175775480059084, |
|
"grad_norm": 1.2953853607177734, |
|
"learning_rate": 6e-06, |
|
"loss": 0.664, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.059084194977843424, |
|
"grad_norm": 1.1054794788360596, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.6486, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.06499261447562776, |
|
"grad_norm": 0.8712942004203796, |
|
"learning_rate": 7.333333333333333e-06, |
|
"loss": 0.6415, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.07090103397341212, |
|
"grad_norm": 1.4441039562225342, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.6255, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.07680945347119646, |
|
"grad_norm": 1.4984484910964966, |
|
"learning_rate": 8.666666666666668e-06, |
|
"loss": 0.5561, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0827178729689808, |
|
"grad_norm": 0.8376960754394531, |
|
"learning_rate": 9.333333333333334e-06, |
|
"loss": 0.5534, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.08862629246676514, |
|
"grad_norm": 0.7184750437736511, |
|
"learning_rate": 1e-05, |
|
"loss": 0.5062, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.09453471196454949, |
|
"grad_norm": 0.8381787538528442, |
|
"learning_rate": 1.0666666666666667e-05, |
|
"loss": 0.5531, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.10044313146233383, |
|
"grad_norm": 0.7621350288391113, |
|
"learning_rate": 1.1333333333333334e-05, |
|
"loss": 0.4876, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.10635155096011817, |
|
"grad_norm": 0.6955872178077698, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.5019, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.11225997045790251, |
|
"grad_norm": 0.5844917297363281, |
|
"learning_rate": 1.2666666666666667e-05, |
|
"loss": 0.4368, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.11816838995568685, |
|
"grad_norm": 0.5807573795318604, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 0.4965, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.1240768094534712, |
|
"grad_norm": 0.5376399755477905, |
|
"learning_rate": 1.4e-05, |
|
"loss": 0.4841, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.12998522895125553, |
|
"grad_norm": 0.5053263902664185, |
|
"learning_rate": 1.4666666666666666e-05, |
|
"loss": 0.4573, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.1358936484490399, |
|
"grad_norm": 0.5155225396156311, |
|
"learning_rate": 1.5333333333333334e-05, |
|
"loss": 0.451, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.14180206794682423, |
|
"grad_norm": 0.52030348777771, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.4199, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.14771048744460857, |
|
"grad_norm": 0.5321907997131348, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.4532, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.1536189069423929, |
|
"grad_norm": 0.5318155288696289, |
|
"learning_rate": 1.7333333333333336e-05, |
|
"loss": 0.4813, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.15952732644017725, |
|
"grad_norm": 0.5176340937614441, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.4288, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.1654357459379616, |
|
"grad_norm": 0.43893975019454956, |
|
"learning_rate": 1.866666666666667e-05, |
|
"loss": 0.3766, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.17134416543574593, |
|
"grad_norm": 0.43830162286758423, |
|
"learning_rate": 1.9333333333333333e-05, |
|
"loss": 0.4159, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.17725258493353027, |
|
"grad_norm": 0.45950719714164734, |
|
"learning_rate": 2e-05, |
|
"loss": 0.4505, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.1831610044313146, |
|
"grad_norm": 0.40500667691230774, |
|
"learning_rate": 1.9999783114048658e-05, |
|
"loss": 0.3726, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.18906942392909898, |
|
"grad_norm": 0.43435147404670715, |
|
"learning_rate": 1.9999132465602526e-05, |
|
"loss": 0.442, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.19497784342688332, |
|
"grad_norm": 0.44813328981399536, |
|
"learning_rate": 1.999804808288491e-05, |
|
"loss": 0.437, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.20088626292466766, |
|
"grad_norm": 0.48166996240615845, |
|
"learning_rate": 1.9996530012933285e-05, |
|
"loss": 0.4107, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.206794682422452, |
|
"grad_norm": 0.398764044046402, |
|
"learning_rate": 1.9994578321597258e-05, |
|
"loss": 0.3882, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.21270310192023634, |
|
"grad_norm": 0.44229164719581604, |
|
"learning_rate": 1.999219309353572e-05, |
|
"loss": 0.4154, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.21861152141802068, |
|
"grad_norm": 0.44369620084762573, |
|
"learning_rate": 1.998937443221316e-05, |
|
"loss": 0.3863, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.22451994091580502, |
|
"grad_norm": 0.44270017743110657, |
|
"learning_rate": 1.9986122459895182e-05, |
|
"loss": 0.3945, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.23042836041358936, |
|
"grad_norm": 0.42152372002601624, |
|
"learning_rate": 1.9982437317643218e-05, |
|
"loss": 0.4094, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.2363367799113737, |
|
"grad_norm": 0.4120837450027466, |
|
"learning_rate": 1.9978319165308373e-05, |
|
"loss": 0.4411, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.24224519940915806, |
|
"grad_norm": 0.4064903259277344, |
|
"learning_rate": 1.997376818152453e-05, |
|
"loss": 0.3818, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.2481536189069424, |
|
"grad_norm": 0.3692624270915985, |
|
"learning_rate": 1.9968784563700586e-05, |
|
"loss": 0.3874, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.25406203840472674, |
|
"grad_norm": 0.4399218261241913, |
|
"learning_rate": 1.9963368528011867e-05, |
|
"loss": 0.3749, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.25997045790251105, |
|
"grad_norm": 0.3779003620147705, |
|
"learning_rate": 1.9957520309390786e-05, |
|
"loss": 0.3656, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.2658788774002954, |
|
"grad_norm": 0.3946981132030487, |
|
"learning_rate": 1.9951240161516643e-05, |
|
"loss": 0.3612, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.2717872968980798, |
|
"grad_norm": 0.3969726264476776, |
|
"learning_rate": 1.99445283568046e-05, |
|
"loss": 0.3932, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.2776957163958641, |
|
"grad_norm": 0.4239075183868408, |
|
"learning_rate": 1.9937385186393888e-05, |
|
"loss": 0.387, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.28360413589364847, |
|
"grad_norm": 0.3688453733921051, |
|
"learning_rate": 1.992981096013517e-05, |
|
"loss": 0.3524, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.2895125553914328, |
|
"grad_norm": 0.4294806718826294, |
|
"learning_rate": 1.9921806006577102e-05, |
|
"loss": 0.3787, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.29542097488921715, |
|
"grad_norm": 0.3867166042327881, |
|
"learning_rate": 1.9913370672952074e-05, |
|
"loss": 0.3756, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.30132939438700146, |
|
"grad_norm": 0.43365901708602905, |
|
"learning_rate": 1.990450532516116e-05, |
|
"loss": 0.3896, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.3072378138847858, |
|
"grad_norm": 0.38658151030540466, |
|
"learning_rate": 1.9895210347758233e-05, |
|
"loss": 0.3703, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.31314623338257014, |
|
"grad_norm": 0.37093815207481384, |
|
"learning_rate": 1.98854861439333e-05, |
|
"loss": 0.3763, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.3190546528803545, |
|
"grad_norm": 0.40044137835502625, |
|
"learning_rate": 1.9875333135495e-05, |
|
"loss": 0.3752, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.3249630723781389, |
|
"grad_norm": 0.39133360981941223, |
|
"learning_rate": 1.986475176285232e-05, |
|
"loss": 0.3589, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.3308714918759232, |
|
"grad_norm": 0.38397374749183655, |
|
"learning_rate": 1.985374248499546e-05, |
|
"loss": 0.3701, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.33677991137370755, |
|
"grad_norm": 0.3795414865016937, |
|
"learning_rate": 1.984230577947597e-05, |
|
"loss": 0.3584, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.33677991137370755, |
|
"eval_loss": 0.3953791558742523, |
|
"eval_runtime": 4.6385, |
|
"eval_samples_per_second": 11.857, |
|
"eval_steps_per_second": 1.509, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.34268833087149186, |
|
"grad_norm": 0.3709493577480316, |
|
"learning_rate": 1.9830442142386e-05, |
|
"loss": 0.3647, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.34859675036927623, |
|
"grad_norm": 0.35005033016204834, |
|
"learning_rate": 1.9818152088336786e-05, |
|
"loss": 0.3317, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.35450516986706054, |
|
"grad_norm": 0.3652004599571228, |
|
"learning_rate": 1.9805436150436352e-05, |
|
"loss": 0.3394, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3604135893648449, |
|
"grad_norm": 0.3940984904766083, |
|
"learning_rate": 1.9792294880266346e-05, |
|
"loss": 0.3711, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.3663220088626292, |
|
"grad_norm": 0.35634928941726685, |
|
"learning_rate": 1.977872884785815e-05, |
|
"loss": 0.3455, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.3722304283604136, |
|
"grad_norm": 0.3972924053668976, |
|
"learning_rate": 1.9764738641668137e-05, |
|
"loss": 0.3652, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.37813884785819796, |
|
"grad_norm": 0.40372708439826965, |
|
"learning_rate": 1.9750324868552133e-05, |
|
"loss": 0.3662, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.38404726735598227, |
|
"grad_norm": 0.396133691072464, |
|
"learning_rate": 1.9735488153739128e-05, |
|
"loss": 0.3726, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.38995568685376664, |
|
"grad_norm": 0.398989737033844, |
|
"learning_rate": 1.972022914080411e-05, |
|
"loss": 0.3595, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.39586410635155095, |
|
"grad_norm": 0.4102807939052582, |
|
"learning_rate": 1.9704548491640195e-05, |
|
"loss": 0.3308, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.4017725258493353, |
|
"grad_norm": 0.344397634267807, |
|
"learning_rate": 1.9688446886429885e-05, |
|
"loss": 0.3653, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.4076809453471196, |
|
"grad_norm": 0.3550814390182495, |
|
"learning_rate": 1.9671925023615572e-05, |
|
"loss": 0.3412, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.413589364844904, |
|
"grad_norm": 0.4047009348869324, |
|
"learning_rate": 1.9654983619869242e-05, |
|
"loss": 0.3578, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4194977843426883, |
|
"grad_norm": 0.41112563014030457, |
|
"learning_rate": 1.9637623410061392e-05, |
|
"loss": 0.3694, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.4254062038404727, |
|
"grad_norm": 0.3775319755077362, |
|
"learning_rate": 1.961984514722914e-05, |
|
"loss": 0.3571, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.43131462333825704, |
|
"grad_norm": 0.3610381782054901, |
|
"learning_rate": 1.960164960254358e-05, |
|
"loss": 0.3713, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.43722304283604135, |
|
"grad_norm": 0.38662371039390564, |
|
"learning_rate": 1.9583037565276314e-05, |
|
"loss": 0.311, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.4431314623338257, |
|
"grad_norm": 0.3574771285057068, |
|
"learning_rate": 1.9564009842765225e-05, |
|
"loss": 0.3353, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.44903988183161003, |
|
"grad_norm": 0.3932562470436096, |
|
"learning_rate": 1.9544567260379455e-05, |
|
"loss": 0.3536, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.4549483013293944, |
|
"grad_norm": 0.3974682092666626, |
|
"learning_rate": 1.9524710661483594e-05, |
|
"loss": 0.3556, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.4608567208271787, |
|
"grad_norm": 0.37172290682792664, |
|
"learning_rate": 1.9504440907401113e-05, |
|
"loss": 0.3568, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.4667651403249631, |
|
"grad_norm": 0.37170422077178955, |
|
"learning_rate": 1.948375887737699e-05, |
|
"loss": 0.3556, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.4726735598227474, |
|
"grad_norm": 0.3596966862678528, |
|
"learning_rate": 1.9462665468539582e-05, |
|
"loss": 0.332, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.47858197932053176, |
|
"grad_norm": 0.35934680700302124, |
|
"learning_rate": 1.944116159586169e-05, |
|
"loss": 0.3276, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.4844903988183161, |
|
"grad_norm": 0.40984946489334106, |
|
"learning_rate": 1.94192481921209e-05, |
|
"loss": 0.3685, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.49039881831610044, |
|
"grad_norm": 0.3622114658355713, |
|
"learning_rate": 1.9396926207859085e-05, |
|
"loss": 0.3336, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.4963072378138848, |
|
"grad_norm": 0.34888842701911926, |
|
"learning_rate": 1.9374196611341212e-05, |
|
"loss": 0.3625, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.5022156573116692, |
|
"grad_norm": 0.37125518918037415, |
|
"learning_rate": 1.9351060388513304e-05, |
|
"loss": 0.3304, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.5081240768094535, |
|
"grad_norm": 0.4107120931148529, |
|
"learning_rate": 1.9327518542959717e-05, |
|
"loss": 0.3755, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.5140324963072378, |
|
"grad_norm": 0.3420109748840332, |
|
"learning_rate": 1.9303572095859545e-05, |
|
"loss": 0.3457, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.5199409158050221, |
|
"grad_norm": 0.35079535841941833, |
|
"learning_rate": 1.9279222085942396e-05, |
|
"loss": 0.3454, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.5258493353028065, |
|
"grad_norm": 0.3775666058063507, |
|
"learning_rate": 1.9254469569443274e-05, |
|
"loss": 0.3501, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.5317577548005908, |
|
"grad_norm": 0.3327409625053406, |
|
"learning_rate": 1.9229315620056805e-05, |
|
"loss": 0.3507, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5376661742983752, |
|
"grad_norm": 0.37142789363861084, |
|
"learning_rate": 1.9203761328890626e-05, |
|
"loss": 0.3453, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.5435745937961596, |
|
"grad_norm": 0.36256077885627747, |
|
"learning_rate": 1.91778078044181e-05, |
|
"loss": 0.3588, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.5494830132939439, |
|
"grad_norm": 0.3861102759838104, |
|
"learning_rate": 1.9151456172430186e-05, |
|
"loss": 0.3479, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.5553914327917282, |
|
"grad_norm": 0.3359353542327881, |
|
"learning_rate": 1.9124707575986642e-05, |
|
"loss": 0.318, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.5612998522895125, |
|
"grad_norm": 0.33662593364715576, |
|
"learning_rate": 1.909756317536643e-05, |
|
"loss": 0.3421, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.5672082717872969, |
|
"grad_norm": 0.35831600427627563, |
|
"learning_rate": 1.9070024148017375e-05, |
|
"loss": 0.3409, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.5731166912850812, |
|
"grad_norm": 0.39858701825141907, |
|
"learning_rate": 1.9042091688505104e-05, |
|
"loss": 0.3319, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.5790251107828656, |
|
"grad_norm": 0.3343643546104431, |
|
"learning_rate": 1.9013767008461236e-05, |
|
"loss": 0.3352, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.5849335302806499, |
|
"grad_norm": 0.3519919216632843, |
|
"learning_rate": 1.89850513365308e-05, |
|
"loss": 0.3634, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.5908419497784343, |
|
"grad_norm": 0.32900717854499817, |
|
"learning_rate": 1.895594591831896e-05, |
|
"loss": 0.3415, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5967503692762186, |
|
"grad_norm": 0.34432175755500793, |
|
"learning_rate": 1.8926452016336987e-05, |
|
"loss": 0.3169, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.6026587887740029, |
|
"grad_norm": 0.33144107460975647, |
|
"learning_rate": 1.8896570909947477e-05, |
|
"loss": 0.3431, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.6085672082717873, |
|
"grad_norm": 0.3299802839756012, |
|
"learning_rate": 1.8866303895308856e-05, |
|
"loss": 0.3411, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.6144756277695717, |
|
"grad_norm": 0.30740225315093994, |
|
"learning_rate": 1.883565228531919e-05, |
|
"loss": 0.3355, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.620384047267356, |
|
"grad_norm": 0.34325993061065674, |
|
"learning_rate": 1.88046174095592e-05, |
|
"loss": 0.3188, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.6262924667651403, |
|
"grad_norm": 0.3394065797328949, |
|
"learning_rate": 1.8773200614234587e-05, |
|
"loss": 0.3153, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.6322008862629247, |
|
"grad_norm": 0.35468512773513794, |
|
"learning_rate": 1.874140326211766e-05, |
|
"loss": 0.3387, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.638109305760709, |
|
"grad_norm": 0.36726799607276917, |
|
"learning_rate": 1.8709226732488216e-05, |
|
"loss": 0.3457, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.6440177252584933, |
|
"grad_norm": 0.3223711848258972, |
|
"learning_rate": 1.86766724210737e-05, |
|
"loss": 0.3588, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.6499261447562777, |
|
"grad_norm": 0.3537541925907135, |
|
"learning_rate": 1.8643741739988672e-05, |
|
"loss": 0.3506, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.6558345642540621, |
|
"grad_norm": 0.3755073845386505, |
|
"learning_rate": 1.8610436117673557e-05, |
|
"loss": 0.3221, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.6617429837518464, |
|
"grad_norm": 0.31778833270072937, |
|
"learning_rate": 1.8576756998832667e-05, |
|
"loss": 0.3161, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.6676514032496307, |
|
"grad_norm": 0.3517738878726959, |
|
"learning_rate": 1.8542705844371544e-05, |
|
"loss": 0.3442, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.6735598227474151, |
|
"grad_norm": 0.3254755139350891, |
|
"learning_rate": 1.8508284131333604e-05, |
|
"loss": 0.3372, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.6735598227474151, |
|
"eval_loss": 0.363791823387146, |
|
"eval_runtime": 4.0908, |
|
"eval_samples_per_second": 13.445, |
|
"eval_steps_per_second": 1.711, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.6794682422451994, |
|
"grad_norm": 0.3458060622215271, |
|
"learning_rate": 1.8473493352836032e-05, |
|
"loss": 0.3329, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.6853766617429837, |
|
"grad_norm": 0.33962881565093994, |
|
"learning_rate": 1.8438335018005052e-05, |
|
"loss": 0.3478, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.691285081240768, |
|
"grad_norm": 0.33980926871299744, |
|
"learning_rate": 1.8402810651910444e-05, |
|
"loss": 0.3484, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.6971935007385525, |
|
"grad_norm": 0.355694979429245, |
|
"learning_rate": 1.8366921795499394e-05, |
|
"loss": 0.3686, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.7031019202363368, |
|
"grad_norm": 0.3415476083755493, |
|
"learning_rate": 1.8330670005529657e-05, |
|
"loss": 0.3204, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.7090103397341211, |
|
"grad_norm": 0.3336890935897827, |
|
"learning_rate": 1.829405685450202e-05, |
|
"loss": 0.3323, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7149187592319055, |
|
"grad_norm": 0.34337785840034485, |
|
"learning_rate": 1.8257083930592102e-05, |
|
"loss": 0.3283, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.7208271787296898, |
|
"grad_norm": 0.3578524887561798, |
|
"learning_rate": 1.8219752837581466e-05, |
|
"loss": 0.3326, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.7267355982274741, |
|
"grad_norm": 0.32392922043800354, |
|
"learning_rate": 1.8182065194788024e-05, |
|
"loss": 0.3141, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.7326440177252584, |
|
"grad_norm": 0.36127492785453796, |
|
"learning_rate": 1.814402263699584e-05, |
|
"loss": 0.3461, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.7385524372230429, |
|
"grad_norm": 0.33812931180000305, |
|
"learning_rate": 1.8105626814384173e-05, |
|
"loss": 0.3404, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.7444608567208272, |
|
"grad_norm": 0.3138431906700134, |
|
"learning_rate": 1.8066879392455932e-05, |
|
"loss": 0.3237, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.7503692762186115, |
|
"grad_norm": 0.33033978939056396, |
|
"learning_rate": 1.8027782051965408e-05, |
|
"loss": 0.3416, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.7562776957163959, |
|
"grad_norm": 0.3907163143157959, |
|
"learning_rate": 1.7988336488845374e-05, |
|
"loss": 0.3352, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.7621861152141802, |
|
"grad_norm": 0.315248042345047, |
|
"learning_rate": 1.7948544414133534e-05, |
|
"loss": 0.3225, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.7680945347119645, |
|
"grad_norm": 0.3284492790699005, |
|
"learning_rate": 1.7908407553898282e-05, |
|
"loss": 0.3217, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.7740029542097489, |
|
"grad_norm": 0.3439176082611084, |
|
"learning_rate": 1.7867927649163838e-05, |
|
"loss": 0.3367, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.7799113737075333, |
|
"grad_norm": 0.31954073905944824, |
|
"learning_rate": 1.782710645583473e-05, |
|
"loss": 0.3133, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.7858197932053176, |
|
"grad_norm": 0.38416293263435364, |
|
"learning_rate": 1.7785945744619642e-05, |
|
"loss": 0.3484, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.7917282127031019, |
|
"grad_norm": 0.34139737486839294, |
|
"learning_rate": 1.774444730095456e-05, |
|
"loss": 0.3042, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.7976366322008862, |
|
"grad_norm": 0.3623535931110382, |
|
"learning_rate": 1.7702612924925377e-05, |
|
"loss": 0.3318, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.8035450516986706, |
|
"grad_norm": 0.32973209023475647, |
|
"learning_rate": 1.766044443118978e-05, |
|
"loss": 0.3092, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.8094534711964549, |
|
"grad_norm": 0.30704402923583984, |
|
"learning_rate": 1.761794364889855e-05, |
|
"loss": 0.321, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.8153618906942393, |
|
"grad_norm": 0.34877485036849976, |
|
"learning_rate": 1.7575112421616203e-05, |
|
"loss": 0.3266, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.8212703101920237, |
|
"grad_norm": 0.3538282811641693, |
|
"learning_rate": 1.7531952607241033e-05, |
|
"loss": 0.3703, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.827178729689808, |
|
"grad_norm": 0.35590365529060364, |
|
"learning_rate": 1.7488466077924525e-05, |
|
"loss": 0.3506, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.8330871491875923, |
|
"grad_norm": 0.33215418457984924, |
|
"learning_rate": 1.7444654719990128e-05, |
|
"loss": 0.3207, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.8389955686853766, |
|
"grad_norm": 0.3381923735141754, |
|
"learning_rate": 1.7400520433851457e-05, |
|
"loss": 0.3237, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.844903988183161, |
|
"grad_norm": 0.3371356129646301, |
|
"learning_rate": 1.735606513392984e-05, |
|
"loss": 0.3394, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.8508124076809453, |
|
"grad_norm": 0.344291627407074, |
|
"learning_rate": 1.7311290748571273e-05, |
|
"loss": 0.3604, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.8567208271787297, |
|
"grad_norm": 0.3567575216293335, |
|
"learning_rate": 1.72661992199628e-05, |
|
"loss": 0.3518, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.8626292466765141, |
|
"grad_norm": 0.33762165904045105, |
|
"learning_rate": 1.7220792504048227e-05, |
|
"loss": 0.3146, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.8685376661742984, |
|
"grad_norm": 0.3404117822647095, |
|
"learning_rate": 1.717507257044331e-05, |
|
"loss": 0.3192, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.8744460856720827, |
|
"grad_norm": 0.3535095751285553, |
|
"learning_rate": 1.7129041402350317e-05, |
|
"loss": 0.3364, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.880354505169867, |
|
"grad_norm": 0.3418992757797241, |
|
"learning_rate": 1.708270099647198e-05, |
|
"loss": 0.3327, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.8862629246676514, |
|
"grad_norm": 0.3172495663166046, |
|
"learning_rate": 1.7036053362924896e-05, |
|
"loss": 0.3404, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.8921713441654358, |
|
"grad_norm": 0.3307952284812927, |
|
"learning_rate": 1.6989100525152346e-05, |
|
"loss": 0.3279, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.8980797636632201, |
|
"grad_norm": 0.29014381766319275, |
|
"learning_rate": 1.694184451983651e-05, |
|
"loss": 0.3027, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.9039881831610044, |
|
"grad_norm": 0.3290538191795349, |
|
"learning_rate": 1.689428739681012e-05, |
|
"loss": 0.3297, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.9098966026587888, |
|
"grad_norm": 0.3165034353733063, |
|
"learning_rate": 1.684643121896755e-05, |
|
"loss": 0.3225, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.9158050221565731, |
|
"grad_norm": 0.3677435517311096, |
|
"learning_rate": 1.679827806217533e-05, |
|
"loss": 0.328, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.9217134416543574, |
|
"grad_norm": 0.3617594242095947, |
|
"learning_rate": 1.6749830015182106e-05, |
|
"loss": 0.3299, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.9276218611521418, |
|
"grad_norm": 0.31069889664649963, |
|
"learning_rate": 1.6701089179528032e-05, |
|
"loss": 0.3146, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.9335302806499262, |
|
"grad_norm": 0.3610530197620392, |
|
"learning_rate": 1.6652057669453606e-05, |
|
"loss": 0.3223, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.9394387001477105, |
|
"grad_norm": 0.3169001638889313, |
|
"learning_rate": 1.6602737611807975e-05, |
|
"loss": 0.3194, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.9453471196454948, |
|
"grad_norm": 0.33033737540245056, |
|
"learning_rate": 1.655313114595666e-05, |
|
"loss": 0.3317, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.9512555391432792, |
|
"grad_norm": 0.35510334372520447, |
|
"learning_rate": 1.6503240423688768e-05, |
|
"loss": 0.3249, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.9571639586410635, |
|
"grad_norm": 0.356079638004303, |
|
"learning_rate": 1.6453067609123656e-05, |
|
"loss": 0.3274, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.9630723781388478, |
|
"grad_norm": 0.36350899934768677, |
|
"learning_rate": 1.6402614878617037e-05, |
|
"loss": 0.3553, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.9689807976366323, |
|
"grad_norm": 0.3371831476688385, |
|
"learning_rate": 1.6351884420666616e-05, |
|
"loss": 0.3245, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.9748892171344166, |
|
"grad_norm": 0.3398657739162445, |
|
"learning_rate": 1.6300878435817115e-05, |
|
"loss": 0.3043, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.9807976366322009, |
|
"grad_norm": 0.34537115693092346, |
|
"learning_rate": 1.6249599136564837e-05, |
|
"loss": 0.349, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.9867060561299852, |
|
"grad_norm": 0.31506776809692383, |
|
"learning_rate": 1.619804874726171e-05, |
|
"loss": 0.315, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.9926144756277696, |
|
"grad_norm": 0.32844215631484985, |
|
"learning_rate": 1.6146229504018777e-05, |
|
"loss": 0.3247, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.9985228951255539, |
|
"grad_norm": 0.3447742760181427, |
|
"learning_rate": 1.609414365460921e-05, |
|
"loss": 0.3193, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.3447742760181427, |
|
"learning_rate": 1.6041793458370812e-05, |
|
"loss": 0.3359, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.0059084194977843, |
|
"grad_norm": 0.27635836601257324, |
|
"learning_rate": 1.5989181186108003e-05, |
|
"loss": 0.2579, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.0059084194977843, |
|
"eval_loss": 0.3496532440185547, |
|
"eval_runtime": 4.0258, |
|
"eval_samples_per_second": 13.662, |
|
"eval_steps_per_second": 1.739, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.0118168389955686, |
|
"grad_norm": 0.27547529339790344, |
|
"learning_rate": 1.5936309119993333e-05, |
|
"loss": 0.2532, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.017725258493353, |
|
"grad_norm": 0.2674752473831177, |
|
"learning_rate": 1.5883179553468465e-05, |
|
"loss": 0.2413, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.0236336779911375, |
|
"grad_norm": 0.3056715428829193, |
|
"learning_rate": 1.5829794791144723e-05, |
|
"loss": 0.2418, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.0295420974889218, |
|
"grad_norm": 0.27895164489746094, |
|
"learning_rate": 1.5776157148703094e-05, |
|
"loss": 0.2516, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.035450516986706, |
|
"grad_norm": 0.2935872972011566, |
|
"learning_rate": 1.5722268952793806e-05, |
|
"loss": 0.254, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.0413589364844904, |
|
"grad_norm": 0.28329288959503174, |
|
"learning_rate": 1.566813254093538e-05, |
|
"loss": 0.2356, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.0472673559822747, |
|
"grad_norm": 0.29026728868484497, |
|
"learning_rate": 1.5613750261413256e-05, |
|
"loss": 0.2404, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.053175775480059, |
|
"grad_norm": 0.3126751780509949, |
|
"learning_rate": 1.555912447317792e-05, |
|
"loss": 0.2303, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.0590841949778433, |
|
"grad_norm": 0.26517724990844727, |
|
"learning_rate": 1.5504257545742585e-05, |
|
"loss": 0.2175, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.0649926144756279, |
|
"grad_norm": 0.26433265209198, |
|
"learning_rate": 1.5449151859080395e-05, |
|
"loss": 0.2169, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.0709010339734122, |
|
"grad_norm": 0.2908313274383545, |
|
"learning_rate": 1.5393809803521213e-05, |
|
"loss": 0.2236, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.0768094534711965, |
|
"grad_norm": 0.2951337397098541, |
|
"learning_rate": 1.533823377964791e-05, |
|
"loss": 0.2305, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.0827178729689808, |
|
"grad_norm": 0.29755067825317383, |
|
"learning_rate": 1.528242619819224e-05, |
|
"loss": 0.2385, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.0886262924667651, |
|
"grad_norm": 0.2879098355770111, |
|
"learning_rate": 1.5226389479930296e-05, |
|
"loss": 0.2377, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.0945347119645494, |
|
"grad_norm": 0.2590835392475128, |
|
"learning_rate": 1.517012605557746e-05, |
|
"loss": 0.2312, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.1004431314623337, |
|
"grad_norm": 0.2694130837917328, |
|
"learning_rate": 1.5113638365682996e-05, |
|
"loss": 0.2347, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.106351550960118, |
|
"grad_norm": 0.29442402720451355, |
|
"learning_rate": 1.5056928860524181e-05, |
|
"loss": 0.2428, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.1122599704579026, |
|
"grad_norm": 0.29042768478393555, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 0.2501, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.118168389955687, |
|
"grad_norm": 0.2620311975479126, |
|
"learning_rate": 1.4942854253524479e-05, |
|
"loss": 0.2395, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.1240768094534712, |
|
"grad_norm": 0.26113441586494446, |
|
"learning_rate": 1.488549409991953e-05, |
|
"loss": 0.2532, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.1299852289512555, |
|
"grad_norm": 0.2995262145996094, |
|
"learning_rate": 1.482792202730745e-05, |
|
"loss": 0.2319, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.1358936484490398, |
|
"grad_norm": 0.27327674627304077, |
|
"learning_rate": 1.477014053300299e-05, |
|
"loss": 0.2348, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.1418020679468242, |
|
"grad_norm": 0.26245003938674927, |
|
"learning_rate": 1.4712152123405018e-05, |
|
"loss": 0.228, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.1477104874446087, |
|
"grad_norm": 0.28888335824012756, |
|
"learning_rate": 1.4653959313887813e-05, |
|
"loss": 0.2436, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.153618906942393, |
|
"grad_norm": 0.2724781632423401, |
|
"learning_rate": 1.4595564628691944e-05, |
|
"loss": 0.2442, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.1595273264401773, |
|
"grad_norm": 0.2921780049800873, |
|
"learning_rate": 1.4536970600814789e-05, |
|
"loss": 0.2412, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.1654357459379616, |
|
"grad_norm": 0.27938568592071533, |
|
"learning_rate": 1.4478179771900634e-05, |
|
"loss": 0.2465, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.171344165435746, |
|
"grad_norm": 0.29516273736953735, |
|
"learning_rate": 1.4419194692130453e-05, |
|
"loss": 0.2415, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.1772525849335302, |
|
"grad_norm": 0.27947136759757996, |
|
"learning_rate": 1.436001792011128e-05, |
|
"loss": 0.2295, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.1831610044313146, |
|
"grad_norm": 0.26482367515563965, |
|
"learning_rate": 1.4300652022765207e-05, |
|
"loss": 0.2273, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.1890694239290989, |
|
"grad_norm": 0.2728091776371002, |
|
"learning_rate": 1.424109957521806e-05, |
|
"loss": 0.2227, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.1949778434268834, |
|
"grad_norm": 0.28748828172683716, |
|
"learning_rate": 1.4181363160687693e-05, |
|
"loss": 0.2402, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.2008862629246677, |
|
"grad_norm": 0.2891993820667267, |
|
"learning_rate": 1.4121445370371922e-05, |
|
"loss": 0.224, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.206794682422452, |
|
"grad_norm": 0.24767152965068817, |
|
"learning_rate": 1.4061348803336135e-05, |
|
"loss": 0.221, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.2127031019202363, |
|
"grad_norm": 0.2819165885448456, |
|
"learning_rate": 1.400107606640056e-05, |
|
"loss": 0.2231, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.2186115214180206, |
|
"grad_norm": 0.27328819036483765, |
|
"learning_rate": 1.394062977402717e-05, |
|
"loss": 0.229, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.224519940915805, |
|
"grad_norm": 0.2674582302570343, |
|
"learning_rate": 1.3880012548206292e-05, |
|
"loss": 0.2155, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.2304283604135893, |
|
"grad_norm": 0.2989075481891632, |
|
"learning_rate": 1.3819227018342865e-05, |
|
"loss": 0.2184, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.2363367799113738, |
|
"grad_norm": 0.30796098709106445, |
|
"learning_rate": 1.3758275821142382e-05, |
|
"loss": 0.2288, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.2422451994091581, |
|
"grad_norm": 0.29833805561065674, |
|
"learning_rate": 1.3697161600496525e-05, |
|
"loss": 0.2368, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.2481536189069424, |
|
"grad_norm": 0.26458829641342163, |
|
"learning_rate": 1.3635887007368467e-05, |
|
"loss": 0.2376, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.2540620384047267, |
|
"grad_norm": 0.2781698703765869, |
|
"learning_rate": 1.3574454699677893e-05, |
|
"loss": 0.2167, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.259970457902511, |
|
"grad_norm": 0.268433153629303, |
|
"learning_rate": 1.3512867342185705e-05, |
|
"loss": 0.2229, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.2658788774002954, |
|
"grad_norm": 0.2726047933101654, |
|
"learning_rate": 1.3451127606378425e-05, |
|
"loss": 0.223, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.2717872968980797, |
|
"grad_norm": 0.29567429423332214, |
|
"learning_rate": 1.3389238170352318e-05, |
|
"loss": 0.2105, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.277695716395864, |
|
"grad_norm": 0.30303359031677246, |
|
"learning_rate": 1.3327201718697232e-05, |
|
"loss": 0.2602, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.2836041358936485, |
|
"grad_norm": 0.27332380414009094, |
|
"learning_rate": 1.326502094238013e-05, |
|
"loss": 0.2288, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.2895125553914328, |
|
"grad_norm": 0.2703614830970764, |
|
"learning_rate": 1.3202698538628376e-05, |
|
"loss": 0.2308, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.2954209748892171, |
|
"grad_norm": 0.2788908779621124, |
|
"learning_rate": 1.3140237210812741e-05, |
|
"loss": 0.2254, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.3013293943870015, |
|
"grad_norm": 0.27442580461502075, |
|
"learning_rate": 1.3077639668330124e-05, |
|
"loss": 0.2158, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.3072378138847858, |
|
"grad_norm": 0.28895896673202515, |
|
"learning_rate": 1.3014908626486032e-05, |
|
"loss": 0.2404, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.31314623338257, |
|
"grad_norm": 0.24982582032680511, |
|
"learning_rate": 1.2952046806376806e-05, |
|
"loss": 0.2201, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.3190546528803546, |
|
"grad_norm": 0.28909650444984436, |
|
"learning_rate": 1.2889056934771577e-05, |
|
"loss": 0.2384, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.324963072378139, |
|
"grad_norm": 0.28018954396247864, |
|
"learning_rate": 1.282594174399399e-05, |
|
"loss": 0.2324, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.3308714918759232, |
|
"grad_norm": 0.29922735691070557, |
|
"learning_rate": 1.2762703971803684e-05, |
|
"loss": 0.2457, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.3367799113737076, |
|
"grad_norm": 0.289288729429245, |
|
"learning_rate": 1.2699346361277538e-05, |
|
"loss": 0.2366, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.3426883308714919, |
|
"grad_norm": 0.2790012061595917, |
|
"learning_rate": 1.2635871660690677e-05, |
|
"loss": 0.2359, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.3426883308714919, |
|
"eval_loss": 0.35204342007637024, |
|
"eval_runtime": 4.4578, |
|
"eval_samples_per_second": 12.338, |
|
"eval_steps_per_second": 1.57, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.3485967503692762, |
|
"grad_norm": 0.36030444502830505, |
|
"learning_rate": 1.2572282623397268e-05, |
|
"loss": 0.2405, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.3545051698670605, |
|
"grad_norm": 0.24079382419586182, |
|
"learning_rate": 1.2508582007711074e-05, |
|
"loss": 0.2148, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.3604135893648448, |
|
"grad_norm": 0.26674559712409973, |
|
"learning_rate": 1.2444772576785828e-05, |
|
"loss": 0.2457, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.3663220088626291, |
|
"grad_norm": 0.25345727801322937, |
|
"learning_rate": 1.2380857098495355e-05, |
|
"loss": 0.2229, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.3722304283604136, |
|
"grad_norm": 0.2623337507247925, |
|
"learning_rate": 1.2316838345313517e-05, |
|
"loss": 0.231, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.378138847858198, |
|
"grad_norm": 0.27783095836639404, |
|
"learning_rate": 1.225271909419395e-05, |
|
"loss": 0.2251, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.3840472673559823, |
|
"grad_norm": 0.25021976232528687, |
|
"learning_rate": 1.2188502126449616e-05, |
|
"loss": 0.226, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.3899556868537666, |
|
"grad_norm": 0.2695038318634033, |
|
"learning_rate": 1.2124190227632138e-05, |
|
"loss": 0.2438, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.395864106351551, |
|
"grad_norm": 0.24312005937099457, |
|
"learning_rate": 1.2059786187410984e-05, |
|
"loss": 0.2138, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.4017725258493354, |
|
"grad_norm": 0.2761548161506653, |
|
"learning_rate": 1.1995292799452472e-05, |
|
"loss": 0.244, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.4076809453471197, |
|
"grad_norm": 0.2740529477596283, |
|
"learning_rate": 1.1930712861298553e-05, |
|
"loss": 0.2416, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.413589364844904, |
|
"grad_norm": 0.2605426013469696, |
|
"learning_rate": 1.186604917424549e-05, |
|
"loss": 0.2515, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.4194977843426884, |
|
"grad_norm": 0.27557292580604553, |
|
"learning_rate": 1.1801304543222349e-05, |
|
"loss": 0.232, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.4254062038404727, |
|
"grad_norm": 0.2512328624725342, |
|
"learning_rate": 1.1736481776669307e-05, |
|
"loss": 0.2311, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.431314623338257, |
|
"grad_norm": 0.2634104788303375, |
|
"learning_rate": 1.1671583686415833e-05, |
|
"loss": 0.2207, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.4372230428360413, |
|
"grad_norm": 0.2541881203651428, |
|
"learning_rate": 1.1606613087558748e-05, |
|
"loss": 0.2207, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.4431314623338256, |
|
"grad_norm": 0.24408863484859467, |
|
"learning_rate": 1.1541572798340076e-05, |
|
"loss": 0.2155, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.44903988183161, |
|
"grad_norm": 0.25305289030075073, |
|
"learning_rate": 1.1476465640024814e-05, |
|
"loss": 0.2245, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.4549483013293945, |
|
"grad_norm": 0.26579606533050537, |
|
"learning_rate": 1.1411294436778562e-05, |
|
"loss": 0.2295, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.4608567208271788, |
|
"grad_norm": 0.26332345604896545, |
|
"learning_rate": 1.1346062015544997e-05, |
|
"loss": 0.2363, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.466765140324963, |
|
"grad_norm": 0.2519514262676239, |
|
"learning_rate": 1.1280771205923269e-05, |
|
"loss": 0.2215, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.4726735598227474, |
|
"grad_norm": 0.2569345533847809, |
|
"learning_rate": 1.1215424840045254e-05, |
|
"loss": 0.223, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.4785819793205317, |
|
"grad_norm": 0.25557035207748413, |
|
"learning_rate": 1.1150025752452693e-05, |
|
"loss": 0.2511, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.4844903988183162, |
|
"grad_norm": 0.26646342873573303, |
|
"learning_rate": 1.1084576779974257e-05, |
|
"loss": 0.2476, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.4903988183161005, |
|
"grad_norm": 0.27917614579200745, |
|
"learning_rate": 1.1019080761602473e-05, |
|
"loss": 0.2284, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.4963072378138849, |
|
"grad_norm": 0.2594425082206726, |
|
"learning_rate": 1.0953540538370591e-05, |
|
"loss": 0.2319, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.5022156573116692, |
|
"grad_norm": 0.23648317158222198, |
|
"learning_rate": 1.0887958953229349e-05, |
|
"loss": 0.225, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.5081240768094535, |
|
"grad_norm": 0.24810343980789185, |
|
"learning_rate": 1.0822338850923644e-05, |
|
"loss": 0.2222, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.5140324963072378, |
|
"grad_norm": 0.25305667519569397, |
|
"learning_rate": 1.0756683077869133e-05, |
|
"loss": 0.2178, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.519940915805022, |
|
"grad_norm": 0.23994190990924835, |
|
"learning_rate": 1.069099448202878e-05, |
|
"loss": 0.2274, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.5258493353028064, |
|
"grad_norm": 0.28112536668777466, |
|
"learning_rate": 1.0625275912789307e-05, |
|
"loss": 0.2157, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.5317577548005907, |
|
"grad_norm": 0.2910768687725067, |
|
"learning_rate": 1.0559530220837593e-05, |
|
"loss": 0.2337, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.537666174298375, |
|
"grad_norm": 0.26320862770080566, |
|
"learning_rate": 1.049376025803703e-05, |
|
"loss": 0.2156, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.5435745937961596, |
|
"grad_norm": 0.2653874456882477, |
|
"learning_rate": 1.0427968877303809e-05, |
|
"loss": 0.2269, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.549483013293944, |
|
"grad_norm": 0.24998469650745392, |
|
"learning_rate": 1.0362158932483165e-05, |
|
"loss": 0.2252, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.5553914327917282, |
|
"grad_norm": 0.25920990109443665, |
|
"learning_rate": 1.0296333278225599e-05, |
|
"loss": 0.2274, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.5612998522895125, |
|
"grad_norm": 0.2827723026275635, |
|
"learning_rate": 1.023049476986304e-05, |
|
"loss": 0.248, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.567208271787297, |
|
"grad_norm": 0.27848076820373535, |
|
"learning_rate": 1.0164646263284993e-05, |
|
"loss": 0.2372, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.5731166912850814, |
|
"grad_norm": 0.2601296305656433, |
|
"learning_rate": 1.0098790614814658e-05, |
|
"loss": 0.212, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.5790251107828657, |
|
"grad_norm": 0.24360589683055878, |
|
"learning_rate": 1.0032930681085028e-05, |
|
"loss": 0.2152, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.58493353028065, |
|
"grad_norm": 0.3080978989601135, |
|
"learning_rate": 9.967069318914977e-06, |
|
"loss": 0.2218, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.5908419497784343, |
|
"grad_norm": 0.26208099722862244, |
|
"learning_rate": 9.901209385185345e-06, |
|
"loss": 0.2184, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.5967503692762186, |
|
"grad_norm": 0.2984671890735626, |
|
"learning_rate": 9.835353736715007e-06, |
|
"loss": 0.2432, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.602658788774003, |
|
"grad_norm": 0.26782581210136414, |
|
"learning_rate": 9.769505230136962e-06, |
|
"loss": 0.2126, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.6085672082717872, |
|
"grad_norm": 0.28440967202186584, |
|
"learning_rate": 9.703666721774403e-06, |
|
"loss": 0.2214, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 1.6144756277695715, |
|
"grad_norm": 0.2926226854324341, |
|
"learning_rate": 9.637841067516837e-06, |
|
"loss": 0.2256, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.6203840472673559, |
|
"grad_norm": 0.25548121333122253, |
|
"learning_rate": 9.572031122696196e-06, |
|
"loss": 0.2304, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.6262924667651402, |
|
"grad_norm": 0.28455373644828796, |
|
"learning_rate": 9.506239741962971e-06, |
|
"loss": 0.2299, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.6322008862629247, |
|
"grad_norm": 0.262614369392395, |
|
"learning_rate": 9.440469779162407e-06, |
|
"loss": 0.2251, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 1.638109305760709, |
|
"grad_norm": 0.27394819259643555, |
|
"learning_rate": 9.374724087210698e-06, |
|
"loss": 0.2117, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 1.6440177252584933, |
|
"grad_norm": 0.2843812108039856, |
|
"learning_rate": 9.309005517971222e-06, |
|
"loss": 0.2268, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.6499261447562779, |
|
"grad_norm": 0.25647154450416565, |
|
"learning_rate": 9.24331692213087e-06, |
|
"loss": 0.2187, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.6558345642540622, |
|
"grad_norm": 0.27861371636390686, |
|
"learning_rate": 9.17766114907636e-06, |
|
"loss": 0.2311, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 1.6617429837518465, |
|
"grad_norm": 0.270049512386322, |
|
"learning_rate": 9.112041046770653e-06, |
|
"loss": 0.2265, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.6676514032496308, |
|
"grad_norm": 0.2750328779220581, |
|
"learning_rate": 9.04645946162941e-06, |
|
"loss": 0.2253, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 1.673559822747415, |
|
"grad_norm": 0.2412230521440506, |
|
"learning_rate": 8.980919238397532e-06, |
|
"loss": 0.2394, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.6794682422451994, |
|
"grad_norm": 0.2524693012237549, |
|
"learning_rate": 8.915423220025747e-06, |
|
"loss": 0.2258, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.6794682422451994, |
|
"eval_loss": 0.3460842967033386, |
|
"eval_runtime": 4.0784, |
|
"eval_samples_per_second": 13.486, |
|
"eval_steps_per_second": 1.716, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.6853766617429837, |
|
"grad_norm": 0.25439098477363586, |
|
"learning_rate": 8.849974247547307e-06, |
|
"loss": 0.2266, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 1.691285081240768, |
|
"grad_norm": 0.257929265499115, |
|
"learning_rate": 8.784575159954748e-06, |
|
"loss": 0.2133, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 1.6971935007385524, |
|
"grad_norm": 0.24912972748279572, |
|
"learning_rate": 8.719228794076733e-06, |
|
"loss": 0.2129, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.7031019202363367, |
|
"grad_norm": 0.27103564143180847, |
|
"learning_rate": 8.653937984455007e-06, |
|
"loss": 0.2276, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 1.709010339734121, |
|
"grad_norm": 0.2718878984451294, |
|
"learning_rate": 8.588705563221444e-06, |
|
"loss": 0.2276, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.7149187592319055, |
|
"grad_norm": 0.26431816816329956, |
|
"learning_rate": 8.52353435997519e-06, |
|
"loss": 0.2328, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 1.7208271787296898, |
|
"grad_norm": 0.2725984752178192, |
|
"learning_rate": 8.458427201659926e-06, |
|
"loss": 0.2292, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 1.7267355982274741, |
|
"grad_norm": 0.2515108585357666, |
|
"learning_rate": 8.393386912441257e-06, |
|
"loss": 0.226, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 1.7326440177252584, |
|
"grad_norm": 0.2476361244916916, |
|
"learning_rate": 8.328416313584169e-06, |
|
"loss": 0.2277, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 1.738552437223043, |
|
"grad_norm": 0.25414201617240906, |
|
"learning_rate": 8.263518223330698e-06, |
|
"loss": 0.2268, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.7444608567208273, |
|
"grad_norm": 0.26264503598213196, |
|
"learning_rate": 8.198695456777653e-06, |
|
"loss": 0.2193, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.7503692762186116, |
|
"grad_norm": 0.26917147636413574, |
|
"learning_rate": 8.133950825754511e-06, |
|
"loss": 0.2251, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.756277695716396, |
|
"grad_norm": 0.2692192792892456, |
|
"learning_rate": 8.069287138701452e-06, |
|
"loss": 0.232, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 1.7621861152141802, |
|
"grad_norm": 0.27494263648986816, |
|
"learning_rate": 8.004707200547534e-06, |
|
"loss": 0.2461, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 1.7680945347119645, |
|
"grad_norm": 0.28247448801994324, |
|
"learning_rate": 7.940213812589018e-06, |
|
"loss": 0.2226, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.7740029542097489, |
|
"grad_norm": 0.2632560133934021, |
|
"learning_rate": 7.875809772367867e-06, |
|
"loss": 0.216, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 1.7799113737075332, |
|
"grad_norm": 0.26561063528060913, |
|
"learning_rate": 7.81149787355039e-06, |
|
"loss": 0.2286, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 1.7858197932053175, |
|
"grad_norm": 0.24065916240215302, |
|
"learning_rate": 7.747280905806051e-06, |
|
"loss": 0.2201, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 1.7917282127031018, |
|
"grad_norm": 0.288473904132843, |
|
"learning_rate": 7.683161654686486e-06, |
|
"loss": 0.2179, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 1.797636632200886, |
|
"grad_norm": 0.27798035740852356, |
|
"learning_rate": 7.619142901504649e-06, |
|
"loss": 0.2341, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.8035450516986706, |
|
"grad_norm": 0.28387168049812317, |
|
"learning_rate": 7.555227423214174e-06, |
|
"loss": 0.226, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.809453471196455, |
|
"grad_norm": 0.28974682092666626, |
|
"learning_rate": 7.491417992288927e-06, |
|
"loss": 0.2296, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 1.8153618906942393, |
|
"grad_norm": 0.26052042841911316, |
|
"learning_rate": 7.427717376602739e-06, |
|
"loss": 0.2002, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 1.8212703101920238, |
|
"grad_norm": 0.29558730125427246, |
|
"learning_rate": 7.364128339309326e-06, |
|
"loss": 0.263, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 1.827178729689808, |
|
"grad_norm": 0.24457122385501862, |
|
"learning_rate": 7.300653638722463e-06, |
|
"loss": 0.224, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.8330871491875924, |
|
"grad_norm": 0.2517196834087372, |
|
"learning_rate": 7.2372960281963165e-06, |
|
"loss": 0.2134, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 1.8389955686853767, |
|
"grad_norm": 0.27632561326026917, |
|
"learning_rate": 7.174058256006012e-06, |
|
"loss": 0.2229, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 1.844903988183161, |
|
"grad_norm": 0.2603515684604645, |
|
"learning_rate": 7.110943065228425e-06, |
|
"loss": 0.2299, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 1.8508124076809453, |
|
"grad_norm": 0.24517123401165009, |
|
"learning_rate": 7.047953193623195e-06, |
|
"loss": 0.2096, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 1.8567208271787297, |
|
"grad_norm": 0.24135427176952362, |
|
"learning_rate": 6.985091373513972e-06, |
|
"loss": 0.2072, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.862629246676514, |
|
"grad_norm": 0.2676647901535034, |
|
"learning_rate": 6.92236033166988e-06, |
|
"loss": 0.2173, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 1.8685376661742983, |
|
"grad_norm": 0.2504200041294098, |
|
"learning_rate": 6.859762789187259e-06, |
|
"loss": 0.2192, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 1.8744460856720826, |
|
"grad_norm": 0.26364269852638245, |
|
"learning_rate": 6.797301461371626e-06, |
|
"loss": 0.2193, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 1.880354505169867, |
|
"grad_norm": 0.24448218941688538, |
|
"learning_rate": 6.734979057619873e-06, |
|
"loss": 0.2208, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 1.8862629246676514, |
|
"grad_norm": 0.24706940352916718, |
|
"learning_rate": 6.67279828130277e-06, |
|
"loss": 0.2211, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.8921713441654358, |
|
"grad_norm": 0.24761930108070374, |
|
"learning_rate": 6.610761829647685e-06, |
|
"loss": 0.2222, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 1.89807976366322, |
|
"grad_norm": 0.2566414475440979, |
|
"learning_rate": 6.548872393621578e-06, |
|
"loss": 0.2136, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 1.9039881831610044, |
|
"grad_norm": 0.2611066401004791, |
|
"learning_rate": 6.487132657814297e-06, |
|
"loss": 0.2146, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 1.909896602658789, |
|
"grad_norm": 0.27130842208862305, |
|
"learning_rate": 6.4255453003221115e-06, |
|
"loss": 0.2184, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 1.9158050221565732, |
|
"grad_norm": 0.2548243999481201, |
|
"learning_rate": 6.364112992631537e-06, |
|
"loss": 0.2299, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.9217134416543575, |
|
"grad_norm": 0.2533697187900543, |
|
"learning_rate": 6.302838399503477e-06, |
|
"loss": 0.2043, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 1.9276218611521418, |
|
"grad_norm": 0.2540424168109894, |
|
"learning_rate": 6.241724178857621e-06, |
|
"loss": 0.2039, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 1.9335302806499262, |
|
"grad_norm": 0.2535569965839386, |
|
"learning_rate": 6.180772981657139e-06, |
|
"loss": 0.2019, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 1.9394387001477105, |
|
"grad_norm": 0.29982754588127136, |
|
"learning_rate": 6.119987451793711e-06, |
|
"loss": 0.2228, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 1.9453471196454948, |
|
"grad_norm": 0.23110415041446686, |
|
"learning_rate": 6.059370225972834e-06, |
|
"loss": 0.2188, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.951255539143279, |
|
"grad_norm": 0.2608148753643036, |
|
"learning_rate": 5.998923933599443e-06, |
|
"loss": 0.2236, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 1.9571639586410634, |
|
"grad_norm": 0.26010897755622864, |
|
"learning_rate": 5.938651196663865e-06, |
|
"loss": 0.2032, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 1.9630723781388477, |
|
"grad_norm": 0.26297712326049805, |
|
"learning_rate": 5.878554629628081e-06, |
|
"loss": 0.2224, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 1.9689807976366323, |
|
"grad_norm": 0.2658803164958954, |
|
"learning_rate": 5.818636839312309e-06, |
|
"loss": 0.2153, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 1.9748892171344166, |
|
"grad_norm": 0.23885361850261688, |
|
"learning_rate": 5.758900424781939e-06, |
|
"loss": 0.2029, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.9807976366322009, |
|
"grad_norm": 0.2604767978191376, |
|
"learning_rate": 5.699347977234799e-06, |
|
"loss": 0.2059, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 1.9867060561299852, |
|
"grad_norm": 0.2535778284072876, |
|
"learning_rate": 5.6399820798887266e-06, |
|
"loss": 0.2204, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 1.9926144756277697, |
|
"grad_norm": 0.2699243128299713, |
|
"learning_rate": 5.580805307869549e-06, |
|
"loss": 0.2158, |
|
"step": 338 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 507, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 169, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.797158580880671e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|