amphora's picture
Upload folder using huggingface_hub
55d3247 verified
raw
history blame
60.6 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9926144756277697,
"eval_steps": 57,
"global_step": 338,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005908419497784343,
"grad_norm": 4.501461029052734,
"learning_rate": 6.666666666666667e-07,
"loss": 1.062,
"step": 1
},
{
"epoch": 0.005908419497784343,
"eval_loss": 1.0835397243499756,
"eval_runtime": 4.3539,
"eval_samples_per_second": 12.632,
"eval_steps_per_second": 1.608,
"step": 1
},
{
"epoch": 0.011816838995568686,
"grad_norm": 4.469114303588867,
"learning_rate": 1.3333333333333334e-06,
"loss": 1.0268,
"step": 2
},
{
"epoch": 0.01772525849335303,
"grad_norm": 4.554893970489502,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.0401,
"step": 3
},
{
"epoch": 0.023633677991137372,
"grad_norm": 4.374792575836182,
"learning_rate": 2.666666666666667e-06,
"loss": 1.0423,
"step": 4
},
{
"epoch": 0.029542097488921712,
"grad_norm": 3.4377498626708984,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.9965,
"step": 5
},
{
"epoch": 0.03545051698670606,
"grad_norm": 3.1242499351501465,
"learning_rate": 4.000000000000001e-06,
"loss": 0.9479,
"step": 6
},
{
"epoch": 0.0413589364844904,
"grad_norm": 1.8368685245513916,
"learning_rate": 4.666666666666667e-06,
"loss": 0.8296,
"step": 7
},
{
"epoch": 0.047267355982274745,
"grad_norm": 1.7457680702209473,
"learning_rate": 5.333333333333334e-06,
"loss": 0.8159,
"step": 8
},
{
"epoch": 0.053175775480059084,
"grad_norm": 1.2953853607177734,
"learning_rate": 6e-06,
"loss": 0.664,
"step": 9
},
{
"epoch": 0.059084194977843424,
"grad_norm": 1.1054794788360596,
"learning_rate": 6.666666666666667e-06,
"loss": 0.6486,
"step": 10
},
{
"epoch": 0.06499261447562776,
"grad_norm": 0.8712942004203796,
"learning_rate": 7.333333333333333e-06,
"loss": 0.6415,
"step": 11
},
{
"epoch": 0.07090103397341212,
"grad_norm": 1.4441039562225342,
"learning_rate": 8.000000000000001e-06,
"loss": 0.6255,
"step": 12
},
{
"epoch": 0.07680945347119646,
"grad_norm": 1.4984484910964966,
"learning_rate": 8.666666666666668e-06,
"loss": 0.5561,
"step": 13
},
{
"epoch": 0.0827178729689808,
"grad_norm": 0.8376960754394531,
"learning_rate": 9.333333333333334e-06,
"loss": 0.5534,
"step": 14
},
{
"epoch": 0.08862629246676514,
"grad_norm": 0.7184750437736511,
"learning_rate": 1e-05,
"loss": 0.5062,
"step": 15
},
{
"epoch": 0.09453471196454949,
"grad_norm": 0.8381787538528442,
"learning_rate": 1.0666666666666667e-05,
"loss": 0.5531,
"step": 16
},
{
"epoch": 0.10044313146233383,
"grad_norm": 0.7621350288391113,
"learning_rate": 1.1333333333333334e-05,
"loss": 0.4876,
"step": 17
},
{
"epoch": 0.10635155096011817,
"grad_norm": 0.6955872178077698,
"learning_rate": 1.2e-05,
"loss": 0.5019,
"step": 18
},
{
"epoch": 0.11225997045790251,
"grad_norm": 0.5844917297363281,
"learning_rate": 1.2666666666666667e-05,
"loss": 0.4368,
"step": 19
},
{
"epoch": 0.11816838995568685,
"grad_norm": 0.5807573795318604,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.4965,
"step": 20
},
{
"epoch": 0.1240768094534712,
"grad_norm": 0.5376399755477905,
"learning_rate": 1.4e-05,
"loss": 0.4841,
"step": 21
},
{
"epoch": 0.12998522895125553,
"grad_norm": 0.5053263902664185,
"learning_rate": 1.4666666666666666e-05,
"loss": 0.4573,
"step": 22
},
{
"epoch": 0.1358936484490399,
"grad_norm": 0.5155225396156311,
"learning_rate": 1.5333333333333334e-05,
"loss": 0.451,
"step": 23
},
{
"epoch": 0.14180206794682423,
"grad_norm": 0.52030348777771,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.4199,
"step": 24
},
{
"epoch": 0.14771048744460857,
"grad_norm": 0.5321907997131348,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.4532,
"step": 25
},
{
"epoch": 0.1536189069423929,
"grad_norm": 0.5318155288696289,
"learning_rate": 1.7333333333333336e-05,
"loss": 0.4813,
"step": 26
},
{
"epoch": 0.15952732644017725,
"grad_norm": 0.5176340937614441,
"learning_rate": 1.8e-05,
"loss": 0.4288,
"step": 27
},
{
"epoch": 0.1654357459379616,
"grad_norm": 0.43893975019454956,
"learning_rate": 1.866666666666667e-05,
"loss": 0.3766,
"step": 28
},
{
"epoch": 0.17134416543574593,
"grad_norm": 0.43830162286758423,
"learning_rate": 1.9333333333333333e-05,
"loss": 0.4159,
"step": 29
},
{
"epoch": 0.17725258493353027,
"grad_norm": 0.45950719714164734,
"learning_rate": 2e-05,
"loss": 0.4505,
"step": 30
},
{
"epoch": 0.1831610044313146,
"grad_norm": 0.40500667691230774,
"learning_rate": 1.9999783114048658e-05,
"loss": 0.3726,
"step": 31
},
{
"epoch": 0.18906942392909898,
"grad_norm": 0.43435147404670715,
"learning_rate": 1.9999132465602526e-05,
"loss": 0.442,
"step": 32
},
{
"epoch": 0.19497784342688332,
"grad_norm": 0.44813328981399536,
"learning_rate": 1.999804808288491e-05,
"loss": 0.437,
"step": 33
},
{
"epoch": 0.20088626292466766,
"grad_norm": 0.48166996240615845,
"learning_rate": 1.9996530012933285e-05,
"loss": 0.4107,
"step": 34
},
{
"epoch": 0.206794682422452,
"grad_norm": 0.398764044046402,
"learning_rate": 1.9994578321597258e-05,
"loss": 0.3882,
"step": 35
},
{
"epoch": 0.21270310192023634,
"grad_norm": 0.44229164719581604,
"learning_rate": 1.999219309353572e-05,
"loss": 0.4154,
"step": 36
},
{
"epoch": 0.21861152141802068,
"grad_norm": 0.44369620084762573,
"learning_rate": 1.998937443221316e-05,
"loss": 0.3863,
"step": 37
},
{
"epoch": 0.22451994091580502,
"grad_norm": 0.44270017743110657,
"learning_rate": 1.9986122459895182e-05,
"loss": 0.3945,
"step": 38
},
{
"epoch": 0.23042836041358936,
"grad_norm": 0.42152372002601624,
"learning_rate": 1.9982437317643218e-05,
"loss": 0.4094,
"step": 39
},
{
"epoch": 0.2363367799113737,
"grad_norm": 0.4120837450027466,
"learning_rate": 1.9978319165308373e-05,
"loss": 0.4411,
"step": 40
},
{
"epoch": 0.24224519940915806,
"grad_norm": 0.4064903259277344,
"learning_rate": 1.997376818152453e-05,
"loss": 0.3818,
"step": 41
},
{
"epoch": 0.2481536189069424,
"grad_norm": 0.3692624270915985,
"learning_rate": 1.9968784563700586e-05,
"loss": 0.3874,
"step": 42
},
{
"epoch": 0.25406203840472674,
"grad_norm": 0.4399218261241913,
"learning_rate": 1.9963368528011867e-05,
"loss": 0.3749,
"step": 43
},
{
"epoch": 0.25997045790251105,
"grad_norm": 0.3779003620147705,
"learning_rate": 1.9957520309390786e-05,
"loss": 0.3656,
"step": 44
},
{
"epoch": 0.2658788774002954,
"grad_norm": 0.3946981132030487,
"learning_rate": 1.9951240161516643e-05,
"loss": 0.3612,
"step": 45
},
{
"epoch": 0.2717872968980798,
"grad_norm": 0.3969726264476776,
"learning_rate": 1.99445283568046e-05,
"loss": 0.3932,
"step": 46
},
{
"epoch": 0.2776957163958641,
"grad_norm": 0.4239075183868408,
"learning_rate": 1.9937385186393888e-05,
"loss": 0.387,
"step": 47
},
{
"epoch": 0.28360413589364847,
"grad_norm": 0.3688453733921051,
"learning_rate": 1.992981096013517e-05,
"loss": 0.3524,
"step": 48
},
{
"epoch": 0.2895125553914328,
"grad_norm": 0.4294806718826294,
"learning_rate": 1.9921806006577102e-05,
"loss": 0.3787,
"step": 49
},
{
"epoch": 0.29542097488921715,
"grad_norm": 0.3867166042327881,
"learning_rate": 1.9913370672952074e-05,
"loss": 0.3756,
"step": 50
},
{
"epoch": 0.30132939438700146,
"grad_norm": 0.43365901708602905,
"learning_rate": 1.990450532516116e-05,
"loss": 0.3896,
"step": 51
},
{
"epoch": 0.3072378138847858,
"grad_norm": 0.38658151030540466,
"learning_rate": 1.9895210347758233e-05,
"loss": 0.3703,
"step": 52
},
{
"epoch": 0.31314623338257014,
"grad_norm": 0.37093815207481384,
"learning_rate": 1.98854861439333e-05,
"loss": 0.3763,
"step": 53
},
{
"epoch": 0.3190546528803545,
"grad_norm": 0.40044137835502625,
"learning_rate": 1.9875333135495e-05,
"loss": 0.3752,
"step": 54
},
{
"epoch": 0.3249630723781389,
"grad_norm": 0.39133360981941223,
"learning_rate": 1.986475176285232e-05,
"loss": 0.3589,
"step": 55
},
{
"epoch": 0.3308714918759232,
"grad_norm": 0.38397374749183655,
"learning_rate": 1.985374248499546e-05,
"loss": 0.3701,
"step": 56
},
{
"epoch": 0.33677991137370755,
"grad_norm": 0.3795414865016937,
"learning_rate": 1.984230577947597e-05,
"loss": 0.3584,
"step": 57
},
{
"epoch": 0.33677991137370755,
"eval_loss": 0.3953791558742523,
"eval_runtime": 4.6385,
"eval_samples_per_second": 11.857,
"eval_steps_per_second": 1.509,
"step": 57
},
{
"epoch": 0.34268833087149186,
"grad_norm": 0.3709493577480316,
"learning_rate": 1.9830442142386e-05,
"loss": 0.3647,
"step": 58
},
{
"epoch": 0.34859675036927623,
"grad_norm": 0.35005033016204834,
"learning_rate": 1.9818152088336786e-05,
"loss": 0.3317,
"step": 59
},
{
"epoch": 0.35450516986706054,
"grad_norm": 0.3652004599571228,
"learning_rate": 1.9805436150436352e-05,
"loss": 0.3394,
"step": 60
},
{
"epoch": 0.3604135893648449,
"grad_norm": 0.3940984904766083,
"learning_rate": 1.9792294880266346e-05,
"loss": 0.3711,
"step": 61
},
{
"epoch": 0.3663220088626292,
"grad_norm": 0.35634928941726685,
"learning_rate": 1.977872884785815e-05,
"loss": 0.3455,
"step": 62
},
{
"epoch": 0.3722304283604136,
"grad_norm": 0.3972924053668976,
"learning_rate": 1.9764738641668137e-05,
"loss": 0.3652,
"step": 63
},
{
"epoch": 0.37813884785819796,
"grad_norm": 0.40372708439826965,
"learning_rate": 1.9750324868552133e-05,
"loss": 0.3662,
"step": 64
},
{
"epoch": 0.38404726735598227,
"grad_norm": 0.396133691072464,
"learning_rate": 1.9735488153739128e-05,
"loss": 0.3726,
"step": 65
},
{
"epoch": 0.38995568685376664,
"grad_norm": 0.398989737033844,
"learning_rate": 1.972022914080411e-05,
"loss": 0.3595,
"step": 66
},
{
"epoch": 0.39586410635155095,
"grad_norm": 0.4102807939052582,
"learning_rate": 1.9704548491640195e-05,
"loss": 0.3308,
"step": 67
},
{
"epoch": 0.4017725258493353,
"grad_norm": 0.344397634267807,
"learning_rate": 1.9688446886429885e-05,
"loss": 0.3653,
"step": 68
},
{
"epoch": 0.4076809453471196,
"grad_norm": 0.3550814390182495,
"learning_rate": 1.9671925023615572e-05,
"loss": 0.3412,
"step": 69
},
{
"epoch": 0.413589364844904,
"grad_norm": 0.4047009348869324,
"learning_rate": 1.9654983619869242e-05,
"loss": 0.3578,
"step": 70
},
{
"epoch": 0.4194977843426883,
"grad_norm": 0.41112563014030457,
"learning_rate": 1.9637623410061392e-05,
"loss": 0.3694,
"step": 71
},
{
"epoch": 0.4254062038404727,
"grad_norm": 0.3775319755077362,
"learning_rate": 1.961984514722914e-05,
"loss": 0.3571,
"step": 72
},
{
"epoch": 0.43131462333825704,
"grad_norm": 0.3610381782054901,
"learning_rate": 1.960164960254358e-05,
"loss": 0.3713,
"step": 73
},
{
"epoch": 0.43722304283604135,
"grad_norm": 0.38662371039390564,
"learning_rate": 1.9583037565276314e-05,
"loss": 0.311,
"step": 74
},
{
"epoch": 0.4431314623338257,
"grad_norm": 0.3574771285057068,
"learning_rate": 1.9564009842765225e-05,
"loss": 0.3353,
"step": 75
},
{
"epoch": 0.44903988183161003,
"grad_norm": 0.3932562470436096,
"learning_rate": 1.9544567260379455e-05,
"loss": 0.3536,
"step": 76
},
{
"epoch": 0.4549483013293944,
"grad_norm": 0.3974682092666626,
"learning_rate": 1.9524710661483594e-05,
"loss": 0.3556,
"step": 77
},
{
"epoch": 0.4608567208271787,
"grad_norm": 0.37172290682792664,
"learning_rate": 1.9504440907401113e-05,
"loss": 0.3568,
"step": 78
},
{
"epoch": 0.4667651403249631,
"grad_norm": 0.37170422077178955,
"learning_rate": 1.948375887737699e-05,
"loss": 0.3556,
"step": 79
},
{
"epoch": 0.4726735598227474,
"grad_norm": 0.3596966862678528,
"learning_rate": 1.9462665468539582e-05,
"loss": 0.332,
"step": 80
},
{
"epoch": 0.47858197932053176,
"grad_norm": 0.35934680700302124,
"learning_rate": 1.944116159586169e-05,
"loss": 0.3276,
"step": 81
},
{
"epoch": 0.4844903988183161,
"grad_norm": 0.40984946489334106,
"learning_rate": 1.94192481921209e-05,
"loss": 0.3685,
"step": 82
},
{
"epoch": 0.49039881831610044,
"grad_norm": 0.3622114658355713,
"learning_rate": 1.9396926207859085e-05,
"loss": 0.3336,
"step": 83
},
{
"epoch": 0.4963072378138848,
"grad_norm": 0.34888842701911926,
"learning_rate": 1.9374196611341212e-05,
"loss": 0.3625,
"step": 84
},
{
"epoch": 0.5022156573116692,
"grad_norm": 0.37125518918037415,
"learning_rate": 1.9351060388513304e-05,
"loss": 0.3304,
"step": 85
},
{
"epoch": 0.5081240768094535,
"grad_norm": 0.4107120931148529,
"learning_rate": 1.9327518542959717e-05,
"loss": 0.3755,
"step": 86
},
{
"epoch": 0.5140324963072378,
"grad_norm": 0.3420109748840332,
"learning_rate": 1.9303572095859545e-05,
"loss": 0.3457,
"step": 87
},
{
"epoch": 0.5199409158050221,
"grad_norm": 0.35079535841941833,
"learning_rate": 1.9279222085942396e-05,
"loss": 0.3454,
"step": 88
},
{
"epoch": 0.5258493353028065,
"grad_norm": 0.3775666058063507,
"learning_rate": 1.9254469569443274e-05,
"loss": 0.3501,
"step": 89
},
{
"epoch": 0.5317577548005908,
"grad_norm": 0.3327409625053406,
"learning_rate": 1.9229315620056805e-05,
"loss": 0.3507,
"step": 90
},
{
"epoch": 0.5376661742983752,
"grad_norm": 0.37142789363861084,
"learning_rate": 1.9203761328890626e-05,
"loss": 0.3453,
"step": 91
},
{
"epoch": 0.5435745937961596,
"grad_norm": 0.36256077885627747,
"learning_rate": 1.91778078044181e-05,
"loss": 0.3588,
"step": 92
},
{
"epoch": 0.5494830132939439,
"grad_norm": 0.3861102759838104,
"learning_rate": 1.9151456172430186e-05,
"loss": 0.3479,
"step": 93
},
{
"epoch": 0.5553914327917282,
"grad_norm": 0.3359353542327881,
"learning_rate": 1.9124707575986642e-05,
"loss": 0.318,
"step": 94
},
{
"epoch": 0.5612998522895125,
"grad_norm": 0.33662593364715576,
"learning_rate": 1.909756317536643e-05,
"loss": 0.3421,
"step": 95
},
{
"epoch": 0.5672082717872969,
"grad_norm": 0.35831600427627563,
"learning_rate": 1.9070024148017375e-05,
"loss": 0.3409,
"step": 96
},
{
"epoch": 0.5731166912850812,
"grad_norm": 0.39858701825141907,
"learning_rate": 1.9042091688505104e-05,
"loss": 0.3319,
"step": 97
},
{
"epoch": 0.5790251107828656,
"grad_norm": 0.3343643546104431,
"learning_rate": 1.9013767008461236e-05,
"loss": 0.3352,
"step": 98
},
{
"epoch": 0.5849335302806499,
"grad_norm": 0.3519919216632843,
"learning_rate": 1.89850513365308e-05,
"loss": 0.3634,
"step": 99
},
{
"epoch": 0.5908419497784343,
"grad_norm": 0.32900717854499817,
"learning_rate": 1.895594591831896e-05,
"loss": 0.3415,
"step": 100
},
{
"epoch": 0.5967503692762186,
"grad_norm": 0.34432175755500793,
"learning_rate": 1.8926452016336987e-05,
"loss": 0.3169,
"step": 101
},
{
"epoch": 0.6026587887740029,
"grad_norm": 0.33144107460975647,
"learning_rate": 1.8896570909947477e-05,
"loss": 0.3431,
"step": 102
},
{
"epoch": 0.6085672082717873,
"grad_norm": 0.3299802839756012,
"learning_rate": 1.8866303895308856e-05,
"loss": 0.3411,
"step": 103
},
{
"epoch": 0.6144756277695717,
"grad_norm": 0.30740225315093994,
"learning_rate": 1.883565228531919e-05,
"loss": 0.3355,
"step": 104
},
{
"epoch": 0.620384047267356,
"grad_norm": 0.34325993061065674,
"learning_rate": 1.88046174095592e-05,
"loss": 0.3188,
"step": 105
},
{
"epoch": 0.6262924667651403,
"grad_norm": 0.3394065797328949,
"learning_rate": 1.8773200614234587e-05,
"loss": 0.3153,
"step": 106
},
{
"epoch": 0.6322008862629247,
"grad_norm": 0.35468512773513794,
"learning_rate": 1.874140326211766e-05,
"loss": 0.3387,
"step": 107
},
{
"epoch": 0.638109305760709,
"grad_norm": 0.36726799607276917,
"learning_rate": 1.8709226732488216e-05,
"loss": 0.3457,
"step": 108
},
{
"epoch": 0.6440177252584933,
"grad_norm": 0.3223711848258972,
"learning_rate": 1.86766724210737e-05,
"loss": 0.3588,
"step": 109
},
{
"epoch": 0.6499261447562777,
"grad_norm": 0.3537541925907135,
"learning_rate": 1.8643741739988672e-05,
"loss": 0.3506,
"step": 110
},
{
"epoch": 0.6558345642540621,
"grad_norm": 0.3755073845386505,
"learning_rate": 1.8610436117673557e-05,
"loss": 0.3221,
"step": 111
},
{
"epoch": 0.6617429837518464,
"grad_norm": 0.31778833270072937,
"learning_rate": 1.8576756998832667e-05,
"loss": 0.3161,
"step": 112
},
{
"epoch": 0.6676514032496307,
"grad_norm": 0.3517738878726959,
"learning_rate": 1.8542705844371544e-05,
"loss": 0.3442,
"step": 113
},
{
"epoch": 0.6735598227474151,
"grad_norm": 0.3254755139350891,
"learning_rate": 1.8508284131333604e-05,
"loss": 0.3372,
"step": 114
},
{
"epoch": 0.6735598227474151,
"eval_loss": 0.363791823387146,
"eval_runtime": 4.0908,
"eval_samples_per_second": 13.445,
"eval_steps_per_second": 1.711,
"step": 114
},
{
"epoch": 0.6794682422451994,
"grad_norm": 0.3458060622215271,
"learning_rate": 1.8473493352836032e-05,
"loss": 0.3329,
"step": 115
},
{
"epoch": 0.6853766617429837,
"grad_norm": 0.33962881565093994,
"learning_rate": 1.8438335018005052e-05,
"loss": 0.3478,
"step": 116
},
{
"epoch": 0.691285081240768,
"grad_norm": 0.33980926871299744,
"learning_rate": 1.8402810651910444e-05,
"loss": 0.3484,
"step": 117
},
{
"epoch": 0.6971935007385525,
"grad_norm": 0.355694979429245,
"learning_rate": 1.8366921795499394e-05,
"loss": 0.3686,
"step": 118
},
{
"epoch": 0.7031019202363368,
"grad_norm": 0.3415476083755493,
"learning_rate": 1.8330670005529657e-05,
"loss": 0.3204,
"step": 119
},
{
"epoch": 0.7090103397341211,
"grad_norm": 0.3336890935897827,
"learning_rate": 1.829405685450202e-05,
"loss": 0.3323,
"step": 120
},
{
"epoch": 0.7149187592319055,
"grad_norm": 0.34337785840034485,
"learning_rate": 1.8257083930592102e-05,
"loss": 0.3283,
"step": 121
},
{
"epoch": 0.7208271787296898,
"grad_norm": 0.3578524887561798,
"learning_rate": 1.8219752837581466e-05,
"loss": 0.3326,
"step": 122
},
{
"epoch": 0.7267355982274741,
"grad_norm": 0.32392922043800354,
"learning_rate": 1.8182065194788024e-05,
"loss": 0.3141,
"step": 123
},
{
"epoch": 0.7326440177252584,
"grad_norm": 0.36127492785453796,
"learning_rate": 1.814402263699584e-05,
"loss": 0.3461,
"step": 124
},
{
"epoch": 0.7385524372230429,
"grad_norm": 0.33812931180000305,
"learning_rate": 1.8105626814384173e-05,
"loss": 0.3404,
"step": 125
},
{
"epoch": 0.7444608567208272,
"grad_norm": 0.3138431906700134,
"learning_rate": 1.8066879392455932e-05,
"loss": 0.3237,
"step": 126
},
{
"epoch": 0.7503692762186115,
"grad_norm": 0.33033978939056396,
"learning_rate": 1.8027782051965408e-05,
"loss": 0.3416,
"step": 127
},
{
"epoch": 0.7562776957163959,
"grad_norm": 0.3907163143157959,
"learning_rate": 1.7988336488845374e-05,
"loss": 0.3352,
"step": 128
},
{
"epoch": 0.7621861152141802,
"grad_norm": 0.315248042345047,
"learning_rate": 1.7948544414133534e-05,
"loss": 0.3225,
"step": 129
},
{
"epoch": 0.7680945347119645,
"grad_norm": 0.3284492790699005,
"learning_rate": 1.7908407553898282e-05,
"loss": 0.3217,
"step": 130
},
{
"epoch": 0.7740029542097489,
"grad_norm": 0.3439176082611084,
"learning_rate": 1.7867927649163838e-05,
"loss": 0.3367,
"step": 131
},
{
"epoch": 0.7799113737075333,
"grad_norm": 0.31954073905944824,
"learning_rate": 1.782710645583473e-05,
"loss": 0.3133,
"step": 132
},
{
"epoch": 0.7858197932053176,
"grad_norm": 0.38416293263435364,
"learning_rate": 1.7785945744619642e-05,
"loss": 0.3484,
"step": 133
},
{
"epoch": 0.7917282127031019,
"grad_norm": 0.34139737486839294,
"learning_rate": 1.774444730095456e-05,
"loss": 0.3042,
"step": 134
},
{
"epoch": 0.7976366322008862,
"grad_norm": 0.3623535931110382,
"learning_rate": 1.7702612924925377e-05,
"loss": 0.3318,
"step": 135
},
{
"epoch": 0.8035450516986706,
"grad_norm": 0.32973209023475647,
"learning_rate": 1.766044443118978e-05,
"loss": 0.3092,
"step": 136
},
{
"epoch": 0.8094534711964549,
"grad_norm": 0.30704402923583984,
"learning_rate": 1.761794364889855e-05,
"loss": 0.321,
"step": 137
},
{
"epoch": 0.8153618906942393,
"grad_norm": 0.34877485036849976,
"learning_rate": 1.7575112421616203e-05,
"loss": 0.3266,
"step": 138
},
{
"epoch": 0.8212703101920237,
"grad_norm": 0.3538282811641693,
"learning_rate": 1.7531952607241033e-05,
"loss": 0.3703,
"step": 139
},
{
"epoch": 0.827178729689808,
"grad_norm": 0.35590365529060364,
"learning_rate": 1.7488466077924525e-05,
"loss": 0.3506,
"step": 140
},
{
"epoch": 0.8330871491875923,
"grad_norm": 0.33215418457984924,
"learning_rate": 1.7444654719990128e-05,
"loss": 0.3207,
"step": 141
},
{
"epoch": 0.8389955686853766,
"grad_norm": 0.3381923735141754,
"learning_rate": 1.7400520433851457e-05,
"loss": 0.3237,
"step": 142
},
{
"epoch": 0.844903988183161,
"grad_norm": 0.3371356129646301,
"learning_rate": 1.735606513392984e-05,
"loss": 0.3394,
"step": 143
},
{
"epoch": 0.8508124076809453,
"grad_norm": 0.344291627407074,
"learning_rate": 1.7311290748571273e-05,
"loss": 0.3604,
"step": 144
},
{
"epoch": 0.8567208271787297,
"grad_norm": 0.3567575216293335,
"learning_rate": 1.72661992199628e-05,
"loss": 0.3518,
"step": 145
},
{
"epoch": 0.8626292466765141,
"grad_norm": 0.33762165904045105,
"learning_rate": 1.7220792504048227e-05,
"loss": 0.3146,
"step": 146
},
{
"epoch": 0.8685376661742984,
"grad_norm": 0.3404117822647095,
"learning_rate": 1.717507257044331e-05,
"loss": 0.3192,
"step": 147
},
{
"epoch": 0.8744460856720827,
"grad_norm": 0.3535095751285553,
"learning_rate": 1.7129041402350317e-05,
"loss": 0.3364,
"step": 148
},
{
"epoch": 0.880354505169867,
"grad_norm": 0.3418992757797241,
"learning_rate": 1.708270099647198e-05,
"loss": 0.3327,
"step": 149
},
{
"epoch": 0.8862629246676514,
"grad_norm": 0.3172495663166046,
"learning_rate": 1.7036053362924896e-05,
"loss": 0.3404,
"step": 150
},
{
"epoch": 0.8921713441654358,
"grad_norm": 0.3307952284812927,
"learning_rate": 1.6989100525152346e-05,
"loss": 0.3279,
"step": 151
},
{
"epoch": 0.8980797636632201,
"grad_norm": 0.29014381766319275,
"learning_rate": 1.694184451983651e-05,
"loss": 0.3027,
"step": 152
},
{
"epoch": 0.9039881831610044,
"grad_norm": 0.3290538191795349,
"learning_rate": 1.689428739681012e-05,
"loss": 0.3297,
"step": 153
},
{
"epoch": 0.9098966026587888,
"grad_norm": 0.3165034353733063,
"learning_rate": 1.684643121896755e-05,
"loss": 0.3225,
"step": 154
},
{
"epoch": 0.9158050221565731,
"grad_norm": 0.3677435517311096,
"learning_rate": 1.679827806217533e-05,
"loss": 0.328,
"step": 155
},
{
"epoch": 0.9217134416543574,
"grad_norm": 0.3617594242095947,
"learning_rate": 1.6749830015182106e-05,
"loss": 0.3299,
"step": 156
},
{
"epoch": 0.9276218611521418,
"grad_norm": 0.31069889664649963,
"learning_rate": 1.6701089179528032e-05,
"loss": 0.3146,
"step": 157
},
{
"epoch": 0.9335302806499262,
"grad_norm": 0.3610530197620392,
"learning_rate": 1.6652057669453606e-05,
"loss": 0.3223,
"step": 158
},
{
"epoch": 0.9394387001477105,
"grad_norm": 0.3169001638889313,
"learning_rate": 1.6602737611807975e-05,
"loss": 0.3194,
"step": 159
},
{
"epoch": 0.9453471196454948,
"grad_norm": 0.33033737540245056,
"learning_rate": 1.655313114595666e-05,
"loss": 0.3317,
"step": 160
},
{
"epoch": 0.9512555391432792,
"grad_norm": 0.35510334372520447,
"learning_rate": 1.6503240423688768e-05,
"loss": 0.3249,
"step": 161
},
{
"epoch": 0.9571639586410635,
"grad_norm": 0.356079638004303,
"learning_rate": 1.6453067609123656e-05,
"loss": 0.3274,
"step": 162
},
{
"epoch": 0.9630723781388478,
"grad_norm": 0.36350899934768677,
"learning_rate": 1.6402614878617037e-05,
"loss": 0.3553,
"step": 163
},
{
"epoch": 0.9689807976366323,
"grad_norm": 0.3371831476688385,
"learning_rate": 1.6351884420666616e-05,
"loss": 0.3245,
"step": 164
},
{
"epoch": 0.9748892171344166,
"grad_norm": 0.3398657739162445,
"learning_rate": 1.6300878435817115e-05,
"loss": 0.3043,
"step": 165
},
{
"epoch": 0.9807976366322009,
"grad_norm": 0.34537115693092346,
"learning_rate": 1.6249599136564837e-05,
"loss": 0.349,
"step": 166
},
{
"epoch": 0.9867060561299852,
"grad_norm": 0.31506776809692383,
"learning_rate": 1.619804874726171e-05,
"loss": 0.315,
"step": 167
},
{
"epoch": 0.9926144756277696,
"grad_norm": 0.32844215631484985,
"learning_rate": 1.6146229504018777e-05,
"loss": 0.3247,
"step": 168
},
{
"epoch": 0.9985228951255539,
"grad_norm": 0.3447742760181427,
"learning_rate": 1.609414365460921e-05,
"loss": 0.3193,
"step": 169
},
{
"epoch": 1.0,
"grad_norm": 0.3447742760181427,
"learning_rate": 1.6041793458370812e-05,
"loss": 0.3359,
"step": 170
},
{
"epoch": 1.0059084194977843,
"grad_norm": 0.27635836601257324,
"learning_rate": 1.5989181186108003e-05,
"loss": 0.2579,
"step": 171
},
{
"epoch": 1.0059084194977843,
"eval_loss": 0.3496532440185547,
"eval_runtime": 4.0258,
"eval_samples_per_second": 13.662,
"eval_steps_per_second": 1.739,
"step": 171
},
{
"epoch": 1.0118168389955686,
"grad_norm": 0.27547529339790344,
"learning_rate": 1.5936309119993333e-05,
"loss": 0.2532,
"step": 172
},
{
"epoch": 1.017725258493353,
"grad_norm": 0.2674752473831177,
"learning_rate": 1.5883179553468465e-05,
"loss": 0.2413,
"step": 173
},
{
"epoch": 1.0236336779911375,
"grad_norm": 0.3056715428829193,
"learning_rate": 1.5829794791144723e-05,
"loss": 0.2418,
"step": 174
},
{
"epoch": 1.0295420974889218,
"grad_norm": 0.27895164489746094,
"learning_rate": 1.5776157148703094e-05,
"loss": 0.2516,
"step": 175
},
{
"epoch": 1.035450516986706,
"grad_norm": 0.2935872972011566,
"learning_rate": 1.5722268952793806e-05,
"loss": 0.254,
"step": 176
},
{
"epoch": 1.0413589364844904,
"grad_norm": 0.28329288959503174,
"learning_rate": 1.566813254093538e-05,
"loss": 0.2356,
"step": 177
},
{
"epoch": 1.0472673559822747,
"grad_norm": 0.29026728868484497,
"learning_rate": 1.5613750261413256e-05,
"loss": 0.2404,
"step": 178
},
{
"epoch": 1.053175775480059,
"grad_norm": 0.3126751780509949,
"learning_rate": 1.555912447317792e-05,
"loss": 0.2303,
"step": 179
},
{
"epoch": 1.0590841949778433,
"grad_norm": 0.26517724990844727,
"learning_rate": 1.5504257545742585e-05,
"loss": 0.2175,
"step": 180
},
{
"epoch": 1.0649926144756279,
"grad_norm": 0.26433265209198,
"learning_rate": 1.5449151859080395e-05,
"loss": 0.2169,
"step": 181
},
{
"epoch": 1.0709010339734122,
"grad_norm": 0.2908313274383545,
"learning_rate": 1.5393809803521213e-05,
"loss": 0.2236,
"step": 182
},
{
"epoch": 1.0768094534711965,
"grad_norm": 0.2951337397098541,
"learning_rate": 1.533823377964791e-05,
"loss": 0.2305,
"step": 183
},
{
"epoch": 1.0827178729689808,
"grad_norm": 0.29755067825317383,
"learning_rate": 1.528242619819224e-05,
"loss": 0.2385,
"step": 184
},
{
"epoch": 1.0886262924667651,
"grad_norm": 0.2879098355770111,
"learning_rate": 1.5226389479930296e-05,
"loss": 0.2377,
"step": 185
},
{
"epoch": 1.0945347119645494,
"grad_norm": 0.2590835392475128,
"learning_rate": 1.517012605557746e-05,
"loss": 0.2312,
"step": 186
},
{
"epoch": 1.1004431314623337,
"grad_norm": 0.2694130837917328,
"learning_rate": 1.5113638365682996e-05,
"loss": 0.2347,
"step": 187
},
{
"epoch": 1.106351550960118,
"grad_norm": 0.29442402720451355,
"learning_rate": 1.5056928860524181e-05,
"loss": 0.2428,
"step": 188
},
{
"epoch": 1.1122599704579026,
"grad_norm": 0.29042768478393555,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.2501,
"step": 189
},
{
"epoch": 1.118168389955687,
"grad_norm": 0.2620311975479126,
"learning_rate": 1.4942854253524479e-05,
"loss": 0.2395,
"step": 190
},
{
"epoch": 1.1240768094534712,
"grad_norm": 0.26113441586494446,
"learning_rate": 1.488549409991953e-05,
"loss": 0.2532,
"step": 191
},
{
"epoch": 1.1299852289512555,
"grad_norm": 0.2995262145996094,
"learning_rate": 1.482792202730745e-05,
"loss": 0.2319,
"step": 192
},
{
"epoch": 1.1358936484490398,
"grad_norm": 0.27327674627304077,
"learning_rate": 1.477014053300299e-05,
"loss": 0.2348,
"step": 193
},
{
"epoch": 1.1418020679468242,
"grad_norm": 0.26245003938674927,
"learning_rate": 1.4712152123405018e-05,
"loss": 0.228,
"step": 194
},
{
"epoch": 1.1477104874446087,
"grad_norm": 0.28888335824012756,
"learning_rate": 1.4653959313887813e-05,
"loss": 0.2436,
"step": 195
},
{
"epoch": 1.153618906942393,
"grad_norm": 0.2724781632423401,
"learning_rate": 1.4595564628691944e-05,
"loss": 0.2442,
"step": 196
},
{
"epoch": 1.1595273264401773,
"grad_norm": 0.2921780049800873,
"learning_rate": 1.4536970600814789e-05,
"loss": 0.2412,
"step": 197
},
{
"epoch": 1.1654357459379616,
"grad_norm": 0.27938568592071533,
"learning_rate": 1.4478179771900634e-05,
"loss": 0.2465,
"step": 198
},
{
"epoch": 1.171344165435746,
"grad_norm": 0.29516273736953735,
"learning_rate": 1.4419194692130453e-05,
"loss": 0.2415,
"step": 199
},
{
"epoch": 1.1772525849335302,
"grad_norm": 0.27947136759757996,
"learning_rate": 1.436001792011128e-05,
"loss": 0.2295,
"step": 200
},
{
"epoch": 1.1831610044313146,
"grad_norm": 0.26482367515563965,
"learning_rate": 1.4300652022765207e-05,
"loss": 0.2273,
"step": 201
},
{
"epoch": 1.1890694239290989,
"grad_norm": 0.2728091776371002,
"learning_rate": 1.424109957521806e-05,
"loss": 0.2227,
"step": 202
},
{
"epoch": 1.1949778434268834,
"grad_norm": 0.28748828172683716,
"learning_rate": 1.4181363160687693e-05,
"loss": 0.2402,
"step": 203
},
{
"epoch": 1.2008862629246677,
"grad_norm": 0.2891993820667267,
"learning_rate": 1.4121445370371922e-05,
"loss": 0.224,
"step": 204
},
{
"epoch": 1.206794682422452,
"grad_norm": 0.24767152965068817,
"learning_rate": 1.4061348803336135e-05,
"loss": 0.221,
"step": 205
},
{
"epoch": 1.2127031019202363,
"grad_norm": 0.2819165885448456,
"learning_rate": 1.400107606640056e-05,
"loss": 0.2231,
"step": 206
},
{
"epoch": 1.2186115214180206,
"grad_norm": 0.27328819036483765,
"learning_rate": 1.394062977402717e-05,
"loss": 0.229,
"step": 207
},
{
"epoch": 1.224519940915805,
"grad_norm": 0.2674582302570343,
"learning_rate": 1.3880012548206292e-05,
"loss": 0.2155,
"step": 208
},
{
"epoch": 1.2304283604135893,
"grad_norm": 0.2989075481891632,
"learning_rate": 1.3819227018342865e-05,
"loss": 0.2184,
"step": 209
},
{
"epoch": 1.2363367799113738,
"grad_norm": 0.30796098709106445,
"learning_rate": 1.3758275821142382e-05,
"loss": 0.2288,
"step": 210
},
{
"epoch": 1.2422451994091581,
"grad_norm": 0.29833805561065674,
"learning_rate": 1.3697161600496525e-05,
"loss": 0.2368,
"step": 211
},
{
"epoch": 1.2481536189069424,
"grad_norm": 0.26458829641342163,
"learning_rate": 1.3635887007368467e-05,
"loss": 0.2376,
"step": 212
},
{
"epoch": 1.2540620384047267,
"grad_norm": 0.2781698703765869,
"learning_rate": 1.3574454699677893e-05,
"loss": 0.2167,
"step": 213
},
{
"epoch": 1.259970457902511,
"grad_norm": 0.268433153629303,
"learning_rate": 1.3512867342185705e-05,
"loss": 0.2229,
"step": 214
},
{
"epoch": 1.2658788774002954,
"grad_norm": 0.2726047933101654,
"learning_rate": 1.3451127606378425e-05,
"loss": 0.223,
"step": 215
},
{
"epoch": 1.2717872968980797,
"grad_norm": 0.29567429423332214,
"learning_rate": 1.3389238170352318e-05,
"loss": 0.2105,
"step": 216
},
{
"epoch": 1.277695716395864,
"grad_norm": 0.30303359031677246,
"learning_rate": 1.3327201718697232e-05,
"loss": 0.2602,
"step": 217
},
{
"epoch": 1.2836041358936485,
"grad_norm": 0.27332380414009094,
"learning_rate": 1.326502094238013e-05,
"loss": 0.2288,
"step": 218
},
{
"epoch": 1.2895125553914328,
"grad_norm": 0.2703614830970764,
"learning_rate": 1.3202698538628376e-05,
"loss": 0.2308,
"step": 219
},
{
"epoch": 1.2954209748892171,
"grad_norm": 0.2788908779621124,
"learning_rate": 1.3140237210812741e-05,
"loss": 0.2254,
"step": 220
},
{
"epoch": 1.3013293943870015,
"grad_norm": 0.27442580461502075,
"learning_rate": 1.3077639668330124e-05,
"loss": 0.2158,
"step": 221
},
{
"epoch": 1.3072378138847858,
"grad_norm": 0.28895896673202515,
"learning_rate": 1.3014908626486032e-05,
"loss": 0.2404,
"step": 222
},
{
"epoch": 1.31314623338257,
"grad_norm": 0.24982582032680511,
"learning_rate": 1.2952046806376806e-05,
"loss": 0.2201,
"step": 223
},
{
"epoch": 1.3190546528803546,
"grad_norm": 0.28909650444984436,
"learning_rate": 1.2889056934771577e-05,
"loss": 0.2384,
"step": 224
},
{
"epoch": 1.324963072378139,
"grad_norm": 0.28018954396247864,
"learning_rate": 1.282594174399399e-05,
"loss": 0.2324,
"step": 225
},
{
"epoch": 1.3308714918759232,
"grad_norm": 0.29922735691070557,
"learning_rate": 1.2762703971803684e-05,
"loss": 0.2457,
"step": 226
},
{
"epoch": 1.3367799113737076,
"grad_norm": 0.289288729429245,
"learning_rate": 1.2699346361277538e-05,
"loss": 0.2366,
"step": 227
},
{
"epoch": 1.3426883308714919,
"grad_norm": 0.2790012061595917,
"learning_rate": 1.2635871660690677e-05,
"loss": 0.2359,
"step": 228
},
{
"epoch": 1.3426883308714919,
"eval_loss": 0.35204342007637024,
"eval_runtime": 4.4578,
"eval_samples_per_second": 12.338,
"eval_steps_per_second": 1.57,
"step": 228
},
{
"epoch": 1.3485967503692762,
"grad_norm": 0.36030444502830505,
"learning_rate": 1.2572282623397268e-05,
"loss": 0.2405,
"step": 229
},
{
"epoch": 1.3545051698670605,
"grad_norm": 0.24079382419586182,
"learning_rate": 1.2508582007711074e-05,
"loss": 0.2148,
"step": 230
},
{
"epoch": 1.3604135893648448,
"grad_norm": 0.26674559712409973,
"learning_rate": 1.2444772576785828e-05,
"loss": 0.2457,
"step": 231
},
{
"epoch": 1.3663220088626291,
"grad_norm": 0.25345727801322937,
"learning_rate": 1.2380857098495355e-05,
"loss": 0.2229,
"step": 232
},
{
"epoch": 1.3722304283604136,
"grad_norm": 0.2623337507247925,
"learning_rate": 1.2316838345313517e-05,
"loss": 0.231,
"step": 233
},
{
"epoch": 1.378138847858198,
"grad_norm": 0.27783095836639404,
"learning_rate": 1.225271909419395e-05,
"loss": 0.2251,
"step": 234
},
{
"epoch": 1.3840472673559823,
"grad_norm": 0.25021976232528687,
"learning_rate": 1.2188502126449616e-05,
"loss": 0.226,
"step": 235
},
{
"epoch": 1.3899556868537666,
"grad_norm": 0.2695038318634033,
"learning_rate": 1.2124190227632138e-05,
"loss": 0.2438,
"step": 236
},
{
"epoch": 1.395864106351551,
"grad_norm": 0.24312005937099457,
"learning_rate": 1.2059786187410984e-05,
"loss": 0.2138,
"step": 237
},
{
"epoch": 1.4017725258493354,
"grad_norm": 0.2761548161506653,
"learning_rate": 1.1995292799452472e-05,
"loss": 0.244,
"step": 238
},
{
"epoch": 1.4076809453471197,
"grad_norm": 0.2740529477596283,
"learning_rate": 1.1930712861298553e-05,
"loss": 0.2416,
"step": 239
},
{
"epoch": 1.413589364844904,
"grad_norm": 0.2605426013469696,
"learning_rate": 1.186604917424549e-05,
"loss": 0.2515,
"step": 240
},
{
"epoch": 1.4194977843426884,
"grad_norm": 0.27557292580604553,
"learning_rate": 1.1801304543222349e-05,
"loss": 0.232,
"step": 241
},
{
"epoch": 1.4254062038404727,
"grad_norm": 0.2512328624725342,
"learning_rate": 1.1736481776669307e-05,
"loss": 0.2311,
"step": 242
},
{
"epoch": 1.431314623338257,
"grad_norm": 0.2634104788303375,
"learning_rate": 1.1671583686415833e-05,
"loss": 0.2207,
"step": 243
},
{
"epoch": 1.4372230428360413,
"grad_norm": 0.2541881203651428,
"learning_rate": 1.1606613087558748e-05,
"loss": 0.2207,
"step": 244
},
{
"epoch": 1.4431314623338256,
"grad_norm": 0.24408863484859467,
"learning_rate": 1.1541572798340076e-05,
"loss": 0.2155,
"step": 245
},
{
"epoch": 1.44903988183161,
"grad_norm": 0.25305289030075073,
"learning_rate": 1.1476465640024814e-05,
"loss": 0.2245,
"step": 246
},
{
"epoch": 1.4549483013293945,
"grad_norm": 0.26579606533050537,
"learning_rate": 1.1411294436778562e-05,
"loss": 0.2295,
"step": 247
},
{
"epoch": 1.4608567208271788,
"grad_norm": 0.26332345604896545,
"learning_rate": 1.1346062015544997e-05,
"loss": 0.2363,
"step": 248
},
{
"epoch": 1.466765140324963,
"grad_norm": 0.2519514262676239,
"learning_rate": 1.1280771205923269e-05,
"loss": 0.2215,
"step": 249
},
{
"epoch": 1.4726735598227474,
"grad_norm": 0.2569345533847809,
"learning_rate": 1.1215424840045254e-05,
"loss": 0.223,
"step": 250
},
{
"epoch": 1.4785819793205317,
"grad_norm": 0.25557035207748413,
"learning_rate": 1.1150025752452693e-05,
"loss": 0.2511,
"step": 251
},
{
"epoch": 1.4844903988183162,
"grad_norm": 0.26646342873573303,
"learning_rate": 1.1084576779974257e-05,
"loss": 0.2476,
"step": 252
},
{
"epoch": 1.4903988183161005,
"grad_norm": 0.27917614579200745,
"learning_rate": 1.1019080761602473e-05,
"loss": 0.2284,
"step": 253
},
{
"epoch": 1.4963072378138849,
"grad_norm": 0.2594425082206726,
"learning_rate": 1.0953540538370591e-05,
"loss": 0.2319,
"step": 254
},
{
"epoch": 1.5022156573116692,
"grad_norm": 0.23648317158222198,
"learning_rate": 1.0887958953229349e-05,
"loss": 0.225,
"step": 255
},
{
"epoch": 1.5081240768094535,
"grad_norm": 0.24810343980789185,
"learning_rate": 1.0822338850923644e-05,
"loss": 0.2222,
"step": 256
},
{
"epoch": 1.5140324963072378,
"grad_norm": 0.25305667519569397,
"learning_rate": 1.0756683077869133e-05,
"loss": 0.2178,
"step": 257
},
{
"epoch": 1.519940915805022,
"grad_norm": 0.23994190990924835,
"learning_rate": 1.069099448202878e-05,
"loss": 0.2274,
"step": 258
},
{
"epoch": 1.5258493353028064,
"grad_norm": 0.28112536668777466,
"learning_rate": 1.0625275912789307e-05,
"loss": 0.2157,
"step": 259
},
{
"epoch": 1.5317577548005907,
"grad_norm": 0.2910768687725067,
"learning_rate": 1.0559530220837593e-05,
"loss": 0.2337,
"step": 260
},
{
"epoch": 1.537666174298375,
"grad_norm": 0.26320862770080566,
"learning_rate": 1.049376025803703e-05,
"loss": 0.2156,
"step": 261
},
{
"epoch": 1.5435745937961596,
"grad_norm": 0.2653874456882477,
"learning_rate": 1.0427968877303809e-05,
"loss": 0.2269,
"step": 262
},
{
"epoch": 1.549483013293944,
"grad_norm": 0.24998469650745392,
"learning_rate": 1.0362158932483165e-05,
"loss": 0.2252,
"step": 263
},
{
"epoch": 1.5553914327917282,
"grad_norm": 0.25920990109443665,
"learning_rate": 1.0296333278225599e-05,
"loss": 0.2274,
"step": 264
},
{
"epoch": 1.5612998522895125,
"grad_norm": 0.2827723026275635,
"learning_rate": 1.023049476986304e-05,
"loss": 0.248,
"step": 265
},
{
"epoch": 1.567208271787297,
"grad_norm": 0.27848076820373535,
"learning_rate": 1.0164646263284993e-05,
"loss": 0.2372,
"step": 266
},
{
"epoch": 1.5731166912850814,
"grad_norm": 0.2601296305656433,
"learning_rate": 1.0098790614814658e-05,
"loss": 0.212,
"step": 267
},
{
"epoch": 1.5790251107828657,
"grad_norm": 0.24360589683055878,
"learning_rate": 1.0032930681085028e-05,
"loss": 0.2152,
"step": 268
},
{
"epoch": 1.58493353028065,
"grad_norm": 0.3080978989601135,
"learning_rate": 9.967069318914977e-06,
"loss": 0.2218,
"step": 269
},
{
"epoch": 1.5908419497784343,
"grad_norm": 0.26208099722862244,
"learning_rate": 9.901209385185345e-06,
"loss": 0.2184,
"step": 270
},
{
"epoch": 1.5967503692762186,
"grad_norm": 0.2984671890735626,
"learning_rate": 9.835353736715007e-06,
"loss": 0.2432,
"step": 271
},
{
"epoch": 1.602658788774003,
"grad_norm": 0.26782581210136414,
"learning_rate": 9.769505230136962e-06,
"loss": 0.2126,
"step": 272
},
{
"epoch": 1.6085672082717872,
"grad_norm": 0.28440967202186584,
"learning_rate": 9.703666721774403e-06,
"loss": 0.2214,
"step": 273
},
{
"epoch": 1.6144756277695715,
"grad_norm": 0.2926226854324341,
"learning_rate": 9.637841067516837e-06,
"loss": 0.2256,
"step": 274
},
{
"epoch": 1.6203840472673559,
"grad_norm": 0.25548121333122253,
"learning_rate": 9.572031122696196e-06,
"loss": 0.2304,
"step": 275
},
{
"epoch": 1.6262924667651402,
"grad_norm": 0.28455373644828796,
"learning_rate": 9.506239741962971e-06,
"loss": 0.2299,
"step": 276
},
{
"epoch": 1.6322008862629247,
"grad_norm": 0.262614369392395,
"learning_rate": 9.440469779162407e-06,
"loss": 0.2251,
"step": 277
},
{
"epoch": 1.638109305760709,
"grad_norm": 0.27394819259643555,
"learning_rate": 9.374724087210698e-06,
"loss": 0.2117,
"step": 278
},
{
"epoch": 1.6440177252584933,
"grad_norm": 0.2843812108039856,
"learning_rate": 9.309005517971222e-06,
"loss": 0.2268,
"step": 279
},
{
"epoch": 1.6499261447562779,
"grad_norm": 0.25647154450416565,
"learning_rate": 9.24331692213087e-06,
"loss": 0.2187,
"step": 280
},
{
"epoch": 1.6558345642540622,
"grad_norm": 0.27861371636390686,
"learning_rate": 9.17766114907636e-06,
"loss": 0.2311,
"step": 281
},
{
"epoch": 1.6617429837518465,
"grad_norm": 0.270049512386322,
"learning_rate": 9.112041046770653e-06,
"loss": 0.2265,
"step": 282
},
{
"epoch": 1.6676514032496308,
"grad_norm": 0.2750328779220581,
"learning_rate": 9.04645946162941e-06,
"loss": 0.2253,
"step": 283
},
{
"epoch": 1.673559822747415,
"grad_norm": 0.2412230521440506,
"learning_rate": 8.980919238397532e-06,
"loss": 0.2394,
"step": 284
},
{
"epoch": 1.6794682422451994,
"grad_norm": 0.2524693012237549,
"learning_rate": 8.915423220025747e-06,
"loss": 0.2258,
"step": 285
},
{
"epoch": 1.6794682422451994,
"eval_loss": 0.3460842967033386,
"eval_runtime": 4.0784,
"eval_samples_per_second": 13.486,
"eval_steps_per_second": 1.716,
"step": 285
},
{
"epoch": 1.6853766617429837,
"grad_norm": 0.25439098477363586,
"learning_rate": 8.849974247547307e-06,
"loss": 0.2266,
"step": 286
},
{
"epoch": 1.691285081240768,
"grad_norm": 0.257929265499115,
"learning_rate": 8.784575159954748e-06,
"loss": 0.2133,
"step": 287
},
{
"epoch": 1.6971935007385524,
"grad_norm": 0.24912972748279572,
"learning_rate": 8.719228794076733e-06,
"loss": 0.2129,
"step": 288
},
{
"epoch": 1.7031019202363367,
"grad_norm": 0.27103564143180847,
"learning_rate": 8.653937984455007e-06,
"loss": 0.2276,
"step": 289
},
{
"epoch": 1.709010339734121,
"grad_norm": 0.2718878984451294,
"learning_rate": 8.588705563221444e-06,
"loss": 0.2276,
"step": 290
},
{
"epoch": 1.7149187592319055,
"grad_norm": 0.26431816816329956,
"learning_rate": 8.52353435997519e-06,
"loss": 0.2328,
"step": 291
},
{
"epoch": 1.7208271787296898,
"grad_norm": 0.2725984752178192,
"learning_rate": 8.458427201659926e-06,
"loss": 0.2292,
"step": 292
},
{
"epoch": 1.7267355982274741,
"grad_norm": 0.2515108585357666,
"learning_rate": 8.393386912441257e-06,
"loss": 0.226,
"step": 293
},
{
"epoch": 1.7326440177252584,
"grad_norm": 0.2476361244916916,
"learning_rate": 8.328416313584169e-06,
"loss": 0.2277,
"step": 294
},
{
"epoch": 1.738552437223043,
"grad_norm": 0.25414201617240906,
"learning_rate": 8.263518223330698e-06,
"loss": 0.2268,
"step": 295
},
{
"epoch": 1.7444608567208273,
"grad_norm": 0.26264503598213196,
"learning_rate": 8.198695456777653e-06,
"loss": 0.2193,
"step": 296
},
{
"epoch": 1.7503692762186116,
"grad_norm": 0.26917147636413574,
"learning_rate": 8.133950825754511e-06,
"loss": 0.2251,
"step": 297
},
{
"epoch": 1.756277695716396,
"grad_norm": 0.2692192792892456,
"learning_rate": 8.069287138701452e-06,
"loss": 0.232,
"step": 298
},
{
"epoch": 1.7621861152141802,
"grad_norm": 0.27494263648986816,
"learning_rate": 8.004707200547534e-06,
"loss": 0.2461,
"step": 299
},
{
"epoch": 1.7680945347119645,
"grad_norm": 0.28247448801994324,
"learning_rate": 7.940213812589018e-06,
"loss": 0.2226,
"step": 300
},
{
"epoch": 1.7740029542097489,
"grad_norm": 0.2632560133934021,
"learning_rate": 7.875809772367867e-06,
"loss": 0.216,
"step": 301
},
{
"epoch": 1.7799113737075332,
"grad_norm": 0.26561063528060913,
"learning_rate": 7.81149787355039e-06,
"loss": 0.2286,
"step": 302
},
{
"epoch": 1.7858197932053175,
"grad_norm": 0.24065916240215302,
"learning_rate": 7.747280905806051e-06,
"loss": 0.2201,
"step": 303
},
{
"epoch": 1.7917282127031018,
"grad_norm": 0.288473904132843,
"learning_rate": 7.683161654686486e-06,
"loss": 0.2179,
"step": 304
},
{
"epoch": 1.797636632200886,
"grad_norm": 0.27798035740852356,
"learning_rate": 7.619142901504649e-06,
"loss": 0.2341,
"step": 305
},
{
"epoch": 1.8035450516986706,
"grad_norm": 0.28387168049812317,
"learning_rate": 7.555227423214174e-06,
"loss": 0.226,
"step": 306
},
{
"epoch": 1.809453471196455,
"grad_norm": 0.28974682092666626,
"learning_rate": 7.491417992288927e-06,
"loss": 0.2296,
"step": 307
},
{
"epoch": 1.8153618906942393,
"grad_norm": 0.26052042841911316,
"learning_rate": 7.427717376602739e-06,
"loss": 0.2002,
"step": 308
},
{
"epoch": 1.8212703101920238,
"grad_norm": 0.29558730125427246,
"learning_rate": 7.364128339309326e-06,
"loss": 0.263,
"step": 309
},
{
"epoch": 1.827178729689808,
"grad_norm": 0.24457122385501862,
"learning_rate": 7.300653638722463e-06,
"loss": 0.224,
"step": 310
},
{
"epoch": 1.8330871491875924,
"grad_norm": 0.2517196834087372,
"learning_rate": 7.2372960281963165e-06,
"loss": 0.2134,
"step": 311
},
{
"epoch": 1.8389955686853767,
"grad_norm": 0.27632561326026917,
"learning_rate": 7.174058256006012e-06,
"loss": 0.2229,
"step": 312
},
{
"epoch": 1.844903988183161,
"grad_norm": 0.2603515684604645,
"learning_rate": 7.110943065228425e-06,
"loss": 0.2299,
"step": 313
},
{
"epoch": 1.8508124076809453,
"grad_norm": 0.24517123401165009,
"learning_rate": 7.047953193623195e-06,
"loss": 0.2096,
"step": 314
},
{
"epoch": 1.8567208271787297,
"grad_norm": 0.24135427176952362,
"learning_rate": 6.985091373513972e-06,
"loss": 0.2072,
"step": 315
},
{
"epoch": 1.862629246676514,
"grad_norm": 0.2676647901535034,
"learning_rate": 6.92236033166988e-06,
"loss": 0.2173,
"step": 316
},
{
"epoch": 1.8685376661742983,
"grad_norm": 0.2504200041294098,
"learning_rate": 6.859762789187259e-06,
"loss": 0.2192,
"step": 317
},
{
"epoch": 1.8744460856720826,
"grad_norm": 0.26364269852638245,
"learning_rate": 6.797301461371626e-06,
"loss": 0.2193,
"step": 318
},
{
"epoch": 1.880354505169867,
"grad_norm": 0.24448218941688538,
"learning_rate": 6.734979057619873e-06,
"loss": 0.2208,
"step": 319
},
{
"epoch": 1.8862629246676514,
"grad_norm": 0.24706940352916718,
"learning_rate": 6.67279828130277e-06,
"loss": 0.2211,
"step": 320
},
{
"epoch": 1.8921713441654358,
"grad_norm": 0.24761930108070374,
"learning_rate": 6.610761829647685e-06,
"loss": 0.2222,
"step": 321
},
{
"epoch": 1.89807976366322,
"grad_norm": 0.2566414475440979,
"learning_rate": 6.548872393621578e-06,
"loss": 0.2136,
"step": 322
},
{
"epoch": 1.9039881831610044,
"grad_norm": 0.2611066401004791,
"learning_rate": 6.487132657814297e-06,
"loss": 0.2146,
"step": 323
},
{
"epoch": 1.909896602658789,
"grad_norm": 0.27130842208862305,
"learning_rate": 6.4255453003221115e-06,
"loss": 0.2184,
"step": 324
},
{
"epoch": 1.9158050221565732,
"grad_norm": 0.2548243999481201,
"learning_rate": 6.364112992631537e-06,
"loss": 0.2299,
"step": 325
},
{
"epoch": 1.9217134416543575,
"grad_norm": 0.2533697187900543,
"learning_rate": 6.302838399503477e-06,
"loss": 0.2043,
"step": 326
},
{
"epoch": 1.9276218611521418,
"grad_norm": 0.2540424168109894,
"learning_rate": 6.241724178857621e-06,
"loss": 0.2039,
"step": 327
},
{
"epoch": 1.9335302806499262,
"grad_norm": 0.2535569965839386,
"learning_rate": 6.180772981657139e-06,
"loss": 0.2019,
"step": 328
},
{
"epoch": 1.9394387001477105,
"grad_norm": 0.29982754588127136,
"learning_rate": 6.119987451793711e-06,
"loss": 0.2228,
"step": 329
},
{
"epoch": 1.9453471196454948,
"grad_norm": 0.23110415041446686,
"learning_rate": 6.059370225972834e-06,
"loss": 0.2188,
"step": 330
},
{
"epoch": 1.951255539143279,
"grad_norm": 0.2608148753643036,
"learning_rate": 5.998923933599443e-06,
"loss": 0.2236,
"step": 331
},
{
"epoch": 1.9571639586410634,
"grad_norm": 0.26010897755622864,
"learning_rate": 5.938651196663865e-06,
"loss": 0.2032,
"step": 332
},
{
"epoch": 1.9630723781388477,
"grad_norm": 0.26297712326049805,
"learning_rate": 5.878554629628081e-06,
"loss": 0.2224,
"step": 333
},
{
"epoch": 1.9689807976366323,
"grad_norm": 0.2658803164958954,
"learning_rate": 5.818636839312309e-06,
"loss": 0.2153,
"step": 334
},
{
"epoch": 1.9748892171344166,
"grad_norm": 0.23885361850261688,
"learning_rate": 5.758900424781939e-06,
"loss": 0.2029,
"step": 335
},
{
"epoch": 1.9807976366322009,
"grad_norm": 0.2604767978191376,
"learning_rate": 5.699347977234799e-06,
"loss": 0.2059,
"step": 336
},
{
"epoch": 1.9867060561299852,
"grad_norm": 0.2535778284072876,
"learning_rate": 5.6399820798887266e-06,
"loss": 0.2204,
"step": 337
},
{
"epoch": 1.9926144756277697,
"grad_norm": 0.2699243128299713,
"learning_rate": 5.580805307869549e-06,
"loss": 0.2158,
"step": 338
}
],
"logging_steps": 1,
"max_steps": 507,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 169,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.797158580880671e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}