{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 18025,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011095700416088765,
"grad_norm": 0.6721351742744446,
"learning_rate": 2.7739251040221912e-09,
"loss": 1.1791,
"step": 20
},
{
"epoch": 0.002219140083217753,
"grad_norm": 0.48403581976890564,
"learning_rate": 5.5478502080443824e-09,
"loss": 1.1797,
"step": 40
},
{
"epoch": 0.00332871012482663,
"grad_norm": 0.328329473733902,
"learning_rate": 8.321775312066573e-09,
"loss": 1.1276,
"step": 60
},
{
"epoch": 0.004438280166435506,
"grad_norm": 0.573009192943573,
"learning_rate": 1.1095700416088765e-08,
"loss": 1.1502,
"step": 80
},
{
"epoch": 0.005547850208044383,
"grad_norm": 0.7344629764556885,
"learning_rate": 1.3869625520110957e-08,
"loss": 1.1745,
"step": 100
},
{
"epoch": 0.00665742024965326,
"grad_norm": 0.4872736632823944,
"learning_rate": 1.6643550624133146e-08,
"loss": 1.1419,
"step": 120
},
{
"epoch": 0.007766990291262136,
"grad_norm": 0.42412033677101135,
"learning_rate": 1.9417475728155338e-08,
"loss": 1.1833,
"step": 140
},
{
"epoch": 0.008876560332871012,
"grad_norm": 0.41351547837257385,
"learning_rate": 2.219140083217753e-08,
"loss": 1.1966,
"step": 160
},
{
"epoch": 0.009986130374479889,
"grad_norm": 0.5638367533683777,
"learning_rate": 2.4965325936199722e-08,
"loss": 1.1935,
"step": 180
},
{
"epoch": 0.011095700416088766,
"grad_norm": 0.35621383786201477,
"learning_rate": 2.7739251040221914e-08,
"loss": 1.1403,
"step": 200
},
{
"epoch": 0.012205270457697643,
"grad_norm": 0.519483208656311,
"learning_rate": 3.0513176144244106e-08,
"loss": 1.1716,
"step": 220
},
{
"epoch": 0.01331484049930652,
"grad_norm": 0.655170202255249,
"learning_rate": 3.328710124826629e-08,
"loss": 1.187,
"step": 240
},
{
"epoch": 0.014424410540915394,
"grad_norm": 0.30613973736763,
"learning_rate": 3.606102635228848e-08,
"loss": 1.1639,
"step": 260
},
{
"epoch": 0.015533980582524271,
"grad_norm": 0.6322771310806274,
"learning_rate": 3.8834951456310675e-08,
"loss": 1.1547,
"step": 280
},
{
"epoch": 0.016643550624133148,
"grad_norm": 0.622188925743103,
"learning_rate": 4.1608876560332874e-08,
"loss": 1.1878,
"step": 300
},
{
"epoch": 0.017753120665742025,
"grad_norm": 0.636248767375946,
"learning_rate": 4.438280166435506e-08,
"loss": 1.1298,
"step": 320
},
{
"epoch": 0.0188626907073509,
"grad_norm": 0.5820329189300537,
"learning_rate": 4.715672676837725e-08,
"loss": 1.1284,
"step": 340
},
{
"epoch": 0.019972260748959778,
"grad_norm": 0.40758267045021057,
"learning_rate": 4.9930651872399443e-08,
"loss": 1.2361,
"step": 360
},
{
"epoch": 0.021081830790568655,
"grad_norm": 0.4012329578399658,
"learning_rate": 5.270457697642163e-08,
"loss": 1.1518,
"step": 380
},
{
"epoch": 0.022191400832177532,
"grad_norm": 0.5331623554229736,
"learning_rate": 5.547850208044383e-08,
"loss": 1.2203,
"step": 400
},
{
"epoch": 0.02330097087378641,
"grad_norm": 0.5145316123962402,
"learning_rate": 5.825242718446602e-08,
"loss": 1.1253,
"step": 420
},
{
"epoch": 0.024410540915395285,
"grad_norm": 0.36710885167121887,
"learning_rate": 6.102635228848821e-08,
"loss": 1.0685,
"step": 440
},
{
"epoch": 0.025520110957004162,
"grad_norm": 0.3183213472366333,
"learning_rate": 6.38002773925104e-08,
"loss": 1.2097,
"step": 460
},
{
"epoch": 0.02662968099861304,
"grad_norm": 0.3870026171207428,
"learning_rate": 6.657420249653258e-08,
"loss": 1.1605,
"step": 480
},
{
"epoch": 0.027739251040221916,
"grad_norm": 0.5289874076843262,
"learning_rate": 6.934812760055478e-08,
"loss": 1.1163,
"step": 500
},
{
"epoch": 0.02884882108183079,
"grad_norm": 0.6247478127479553,
"learning_rate": 7.212205270457697e-08,
"loss": 1.2463,
"step": 520
},
{
"epoch": 0.029958391123439666,
"grad_norm": 0.5780977010726929,
"learning_rate": 7.489597780859917e-08,
"loss": 1.195,
"step": 540
},
{
"epoch": 0.031067961165048542,
"grad_norm": 0.5350005626678467,
"learning_rate": 7.766990291262135e-08,
"loss": 1.1289,
"step": 560
},
{
"epoch": 0.03217753120665742,
"grad_norm": 0.30264171957969666,
"learning_rate": 8.044382801664355e-08,
"loss": 1.0648,
"step": 580
},
{
"epoch": 0.033287101248266296,
"grad_norm": 0.45016685128211975,
"learning_rate": 8.321775312066575e-08,
"loss": 1.153,
"step": 600
},
{
"epoch": 0.034396671289875176,
"grad_norm": 0.5521411895751953,
"learning_rate": 8.599167822468793e-08,
"loss": 1.1257,
"step": 620
},
{
"epoch": 0.03550624133148405,
"grad_norm": 0.4235968291759491,
"learning_rate": 8.876560332871012e-08,
"loss": 1.1198,
"step": 640
},
{
"epoch": 0.03661581137309293,
"grad_norm": 0.5764958262443542,
"learning_rate": 9.153952843273232e-08,
"loss": 1.1015,
"step": 660
},
{
"epoch": 0.0377253814147018,
"grad_norm": 0.5378324389457703,
"learning_rate": 9.43134535367545e-08,
"loss": 1.2342,
"step": 680
},
{
"epoch": 0.038834951456310676,
"grad_norm": 0.3303524851799011,
"learning_rate": 9.708737864077669e-08,
"loss": 1.1156,
"step": 700
},
{
"epoch": 0.039944521497919556,
"grad_norm": 0.5583963394165039,
"learning_rate": 9.986130374479889e-08,
"loss": 1.1579,
"step": 720
},
{
"epoch": 0.04105409153952843,
"grad_norm": 0.813584566116333,
"learning_rate": 1.0263522884882107e-07,
"loss": 1.1749,
"step": 740
},
{
"epoch": 0.04216366158113731,
"grad_norm": 0.6232322454452515,
"learning_rate": 1.0540915395284326e-07,
"loss": 1.195,
"step": 760
},
{
"epoch": 0.04327323162274618,
"grad_norm": 0.5598679780960083,
"learning_rate": 1.0818307905686546e-07,
"loss": 1.1183,
"step": 780
},
{
"epoch": 0.044382801664355064,
"grad_norm": 0.5374318361282349,
"learning_rate": 1.1095700416088766e-07,
"loss": 1.1944,
"step": 800
},
{
"epoch": 0.04549237170596394,
"grad_norm": 0.19993217289447784,
"learning_rate": 1.1373092926490985e-07,
"loss": 1.0835,
"step": 820
},
{
"epoch": 0.04660194174757282,
"grad_norm": 0.4393330514431,
"learning_rate": 1.1650485436893204e-07,
"loss": 1.1977,
"step": 840
},
{
"epoch": 0.04771151178918169,
"grad_norm": 0.6222187876701355,
"learning_rate": 1.1927877947295422e-07,
"loss": 1.0523,
"step": 860
},
{
"epoch": 0.04882108183079057,
"grad_norm": 0.319072425365448,
"learning_rate": 1.2205270457697642e-07,
"loss": 1.136,
"step": 880
},
{
"epoch": 0.049930651872399444,
"grad_norm": 0.5406301617622375,
"learning_rate": 1.248266296809986e-07,
"loss": 1.2155,
"step": 900
},
{
"epoch": 0.051040221914008324,
"grad_norm": 0.618211567401886,
"learning_rate": 1.276005547850208e-07,
"loss": 1.1562,
"step": 920
},
{
"epoch": 0.0521497919556172,
"grad_norm": 0.3475450277328491,
"learning_rate": 1.30374479889043e-07,
"loss": 1.1551,
"step": 940
},
{
"epoch": 0.05325936199722608,
"grad_norm": 0.4210108518600464,
"learning_rate": 1.3314840499306516e-07,
"loss": 1.1612,
"step": 960
},
{
"epoch": 0.05436893203883495,
"grad_norm": 0.386306494474411,
"learning_rate": 1.3592233009708736e-07,
"loss": 1.2195,
"step": 980
},
{
"epoch": 0.05547850208044383,
"grad_norm": 0.42582884430885315,
"learning_rate": 1.3869625520110956e-07,
"loss": 1.1139,
"step": 1000
},
{
"epoch": 0.056588072122052704,
"grad_norm": 0.3357681930065155,
"learning_rate": 1.4147018030513176e-07,
"loss": 1.1757,
"step": 1020
},
{
"epoch": 0.05769764216366158,
"grad_norm": 0.2897900938987732,
"learning_rate": 1.4424410540915393e-07,
"loss": 1.2098,
"step": 1040
},
{
"epoch": 0.05880721220527046,
"grad_norm": 0.38025063276290894,
"learning_rate": 1.4701803051317613e-07,
"loss": 1.1234,
"step": 1060
},
{
"epoch": 0.05991678224687933,
"grad_norm": 0.36913609504699707,
"learning_rate": 1.4979195561719833e-07,
"loss": 1.1552,
"step": 1080
},
{
"epoch": 0.06102635228848821,
"grad_norm": 0.5480925440788269,
"learning_rate": 1.525658807212205e-07,
"loss": 1.135,
"step": 1100
},
{
"epoch": 0.062135922330097085,
"grad_norm": 0.3799718916416168,
"learning_rate": 1.553398058252427e-07,
"loss": 1.1454,
"step": 1120
},
{
"epoch": 0.06324549237170596,
"grad_norm": 0.3841489255428314,
"learning_rate": 1.5811373092926493e-07,
"loss": 1.1351,
"step": 1140
},
{
"epoch": 0.06435506241331485,
"grad_norm": 0.29666101932525635,
"learning_rate": 1.608876560332871e-07,
"loss": 1.1616,
"step": 1160
},
{
"epoch": 0.06546463245492372,
"grad_norm": 0.2889375686645508,
"learning_rate": 1.636615811373093e-07,
"loss": 1.1802,
"step": 1180
},
{
"epoch": 0.06657420249653259,
"grad_norm": 0.5737839937210083,
"learning_rate": 1.664355062413315e-07,
"loss": 1.1667,
"step": 1200
},
{
"epoch": 0.06768377253814147,
"grad_norm": 0.5003082156181335,
"learning_rate": 1.6920943134535367e-07,
"loss": 1.075,
"step": 1220
},
{
"epoch": 0.06879334257975035,
"grad_norm": 0.46454185247421265,
"learning_rate": 1.7198335644937587e-07,
"loss": 1.1724,
"step": 1240
},
{
"epoch": 0.06990291262135923,
"grad_norm": 0.32240554690361023,
"learning_rate": 1.7475728155339807e-07,
"loss": 1.1437,
"step": 1260
},
{
"epoch": 0.0710124826629681,
"grad_norm": 0.42182767391204834,
"learning_rate": 1.7753120665742024e-07,
"loss": 0.9911,
"step": 1280
},
{
"epoch": 0.07212205270457697,
"grad_norm": 0.4385708272457123,
"learning_rate": 1.8030513176144244e-07,
"loss": 1.0787,
"step": 1300
},
{
"epoch": 0.07323162274618586,
"grad_norm": 0.3282943367958069,
"learning_rate": 1.8307905686546463e-07,
"loss": 1.1217,
"step": 1320
},
{
"epoch": 0.07434119278779473,
"grad_norm": 0.7223221063613892,
"learning_rate": 1.858529819694868e-07,
"loss": 1.1546,
"step": 1340
},
{
"epoch": 0.0754507628294036,
"grad_norm": 0.36656028032302856,
"learning_rate": 1.88626907073509e-07,
"loss": 1.163,
"step": 1360
},
{
"epoch": 0.07656033287101248,
"grad_norm": 0.5122601389884949,
"learning_rate": 1.914008321775312e-07,
"loss": 1.053,
"step": 1380
},
{
"epoch": 0.07766990291262135,
"grad_norm": 0.5633952021598816,
"learning_rate": 1.9417475728155338e-07,
"loss": 1.1306,
"step": 1400
},
{
"epoch": 0.07877947295423024,
"grad_norm": 0.5684695243835449,
"learning_rate": 1.9694868238557558e-07,
"loss": 1.1589,
"step": 1420
},
{
"epoch": 0.07988904299583911,
"grad_norm": 0.6536048054695129,
"learning_rate": 1.9972260748959777e-07,
"loss": 1.113,
"step": 1440
},
{
"epoch": 0.08099861303744799,
"grad_norm": 0.4558582901954651,
"learning_rate": 2.0249653259361995e-07,
"loss": 1.0757,
"step": 1460
},
{
"epoch": 0.08210818307905686,
"grad_norm": 0.6440749168395996,
"learning_rate": 2.0527045769764214e-07,
"loss": 1.1674,
"step": 1480
},
{
"epoch": 0.08321775312066575,
"grad_norm": 0.41806939244270325,
"learning_rate": 2.0804438280166434e-07,
"loss": 1.0539,
"step": 1500
},
{
"epoch": 0.08432732316227462,
"grad_norm": 0.5335156321525574,
"learning_rate": 2.1081830790568652e-07,
"loss": 1.1338,
"step": 1520
},
{
"epoch": 0.0854368932038835,
"grad_norm": 0.35202693939208984,
"learning_rate": 2.1359223300970871e-07,
"loss": 1.1638,
"step": 1540
},
{
"epoch": 0.08654646324549237,
"grad_norm": 0.45304250717163086,
"learning_rate": 2.163661581137309e-07,
"loss": 1.0912,
"step": 1560
},
{
"epoch": 0.08765603328710125,
"grad_norm": 0.37071916460990906,
"learning_rate": 2.191400832177531e-07,
"loss": 1.0883,
"step": 1580
},
{
"epoch": 0.08876560332871013,
"grad_norm": 0.5757469534873962,
"learning_rate": 2.219140083217753e-07,
"loss": 1.0081,
"step": 1600
},
{
"epoch": 0.089875173370319,
"grad_norm": 0.4922785460948944,
"learning_rate": 2.246879334257975e-07,
"loss": 1.0775,
"step": 1620
},
{
"epoch": 0.09098474341192787,
"grad_norm": 0.6358697414398193,
"learning_rate": 2.274618585298197e-07,
"loss": 1.1156,
"step": 1640
},
{
"epoch": 0.09209431345353676,
"grad_norm": 0.33512547612190247,
"learning_rate": 2.3023578363384188e-07,
"loss": 1.049,
"step": 1660
},
{
"epoch": 0.09320388349514563,
"grad_norm": 0.3588186502456665,
"learning_rate": 2.3300970873786408e-07,
"loss": 1.0028,
"step": 1680
},
{
"epoch": 0.09431345353675451,
"grad_norm": 0.4455154836177826,
"learning_rate": 2.3578363384188628e-07,
"loss": 0.9894,
"step": 1700
},
{
"epoch": 0.09542302357836338,
"grad_norm": 0.4005114734172821,
"learning_rate": 2.3855755894590845e-07,
"loss": 1.0441,
"step": 1720
},
{
"epoch": 0.09653259361997225,
"grad_norm": 0.3630480170249939,
"learning_rate": 2.413314840499306e-07,
"loss": 1.1274,
"step": 1740
},
{
"epoch": 0.09764216366158114,
"grad_norm": 0.38374799489974976,
"learning_rate": 2.4410540915395285e-07,
"loss": 1.0205,
"step": 1760
},
{
"epoch": 0.09875173370319001,
"grad_norm": 0.240658700466156,
"learning_rate": 2.46879334257975e-07,
"loss": 0.9917,
"step": 1780
},
{
"epoch": 0.09986130374479889,
"grad_norm": 0.32337549328804016,
"learning_rate": 2.496532593619972e-07,
"loss": 1.0758,
"step": 1800
},
{
"epoch": 0.10097087378640776,
"grad_norm": 0.47185397148132324,
"learning_rate": 2.524271844660194e-07,
"loss": 1.1119,
"step": 1820
},
{
"epoch": 0.10208044382801665,
"grad_norm": 0.5956501960754395,
"learning_rate": 2.552011095700416e-07,
"loss": 1.043,
"step": 1840
},
{
"epoch": 0.10319001386962552,
"grad_norm": 0.36230626702308655,
"learning_rate": 2.5797503467406376e-07,
"loss": 1.0284,
"step": 1860
},
{
"epoch": 0.1042995839112344,
"grad_norm": 0.4904063642024994,
"learning_rate": 2.60748959778086e-07,
"loss": 1.0458,
"step": 1880
},
{
"epoch": 0.10540915395284327,
"grad_norm": 0.3035784661769867,
"learning_rate": 2.6352288488210816e-07,
"loss": 1.0256,
"step": 1900
},
{
"epoch": 0.10651872399445216,
"grad_norm": 0.5605130791664124,
"learning_rate": 2.6629680998613033e-07,
"loss": 1.0233,
"step": 1920
},
{
"epoch": 0.10762829403606103,
"grad_norm": 0.3493014872074127,
"learning_rate": 2.6907073509015255e-07,
"loss": 1.0023,
"step": 1940
},
{
"epoch": 0.1087378640776699,
"grad_norm": 0.5957789421081543,
"learning_rate": 2.7184466019417473e-07,
"loss": 1.0315,
"step": 1960
},
{
"epoch": 0.10984743411927878,
"grad_norm": 0.6720208525657654,
"learning_rate": 2.746185852981969e-07,
"loss": 1.0138,
"step": 1980
},
{
"epoch": 0.11095700416088766,
"grad_norm": 0.43460479378700256,
"learning_rate": 2.773925104022191e-07,
"loss": 0.9863,
"step": 2000
},
{
"epoch": 0.11206657420249654,
"grad_norm": 0.5312954783439636,
"learning_rate": 2.801664355062413e-07,
"loss": 1.0486,
"step": 2020
},
{
"epoch": 0.11317614424410541,
"grad_norm": 0.6359843611717224,
"learning_rate": 2.829403606102635e-07,
"loss": 0.9457,
"step": 2040
},
{
"epoch": 0.11428571428571428,
"grad_norm": 0.6254010796546936,
"learning_rate": 2.857142857142857e-07,
"loss": 1.1109,
"step": 2060
},
{
"epoch": 0.11539528432732316,
"grad_norm": 0.36780887842178345,
"learning_rate": 2.8848821081830787e-07,
"loss": 1.0425,
"step": 2080
},
{
"epoch": 0.11650485436893204,
"grad_norm": 0.46995213627815247,
"learning_rate": 2.912621359223301e-07,
"loss": 0.9666,
"step": 2100
},
{
"epoch": 0.11761442441054092,
"grad_norm": 0.49816495180130005,
"learning_rate": 2.9403606102635226e-07,
"loss": 0.9935,
"step": 2120
},
{
"epoch": 0.11872399445214979,
"grad_norm": 0.9470138549804688,
"learning_rate": 2.9680998613037444e-07,
"loss": 1.0937,
"step": 2140
},
{
"epoch": 0.11983356449375866,
"grad_norm": 0.4583646357059479,
"learning_rate": 2.9958391123439666e-07,
"loss": 0.9972,
"step": 2160
},
{
"epoch": 0.12094313453536755,
"grad_norm": 0.351114958524704,
"learning_rate": 3.0235783633841883e-07,
"loss": 1.0061,
"step": 2180
},
{
"epoch": 0.12205270457697642,
"grad_norm": 0.4335211515426636,
"learning_rate": 3.05131761442441e-07,
"loss": 1.0091,
"step": 2200
},
{
"epoch": 0.1231622746185853,
"grad_norm": 0.3483923673629761,
"learning_rate": 3.0790568654646323e-07,
"loss": 1.0092,
"step": 2220
},
{
"epoch": 0.12427184466019417,
"grad_norm": 0.33233171701431274,
"learning_rate": 3.106796116504854e-07,
"loss": 1.0185,
"step": 2240
},
{
"epoch": 0.12538141470180306,
"grad_norm": 0.34460940957069397,
"learning_rate": 3.1345353675450763e-07,
"loss": 0.9484,
"step": 2260
},
{
"epoch": 0.12649098474341192,
"grad_norm": 0.42694029211997986,
"learning_rate": 3.1622746185852985e-07,
"loss": 0.9484,
"step": 2280
},
{
"epoch": 0.1276005547850208,
"grad_norm": 0.5112186074256897,
"learning_rate": 3.19001386962552e-07,
"loss": 1.0039,
"step": 2300
},
{
"epoch": 0.1287101248266297,
"grad_norm": 0.4560784101486206,
"learning_rate": 3.217753120665742e-07,
"loss": 0.9961,
"step": 2320
},
{
"epoch": 0.12981969486823855,
"grad_norm": 0.5053315162658691,
"learning_rate": 3.245492371705964e-07,
"loss": 0.9542,
"step": 2340
},
{
"epoch": 0.13092926490984744,
"grad_norm": 0.6796769499778748,
"learning_rate": 3.273231622746186e-07,
"loss": 0.9187,
"step": 2360
},
{
"epoch": 0.13203883495145632,
"grad_norm": 0.7734983563423157,
"learning_rate": 3.3009708737864077e-07,
"loss": 0.9298,
"step": 2380
},
{
"epoch": 0.13314840499306518,
"grad_norm": 0.4399431049823761,
"learning_rate": 3.32871012482663e-07,
"loss": 0.8959,
"step": 2400
},
{
"epoch": 0.13425797503467407,
"grad_norm": 0.4783932864665985,
"learning_rate": 3.3564493758668516e-07,
"loss": 0.8539,
"step": 2420
},
{
"epoch": 0.13536754507628293,
"grad_norm": 0.4672847092151642,
"learning_rate": 3.3841886269070734e-07,
"loss": 0.8839,
"step": 2440
},
{
"epoch": 0.13647711511789182,
"grad_norm": 0.4219910204410553,
"learning_rate": 3.4119278779472956e-07,
"loss": 0.934,
"step": 2460
},
{
"epoch": 0.1375866851595007,
"grad_norm": 0.3283788561820984,
"learning_rate": 3.4396671289875173e-07,
"loss": 0.8729,
"step": 2480
},
{
"epoch": 0.13869625520110956,
"grad_norm": 0.6127363443374634,
"learning_rate": 3.467406380027739e-07,
"loss": 0.8355,
"step": 2500
},
{
"epoch": 0.13980582524271845,
"grad_norm": 0.8837600350379944,
"learning_rate": 3.4951456310679613e-07,
"loss": 0.9256,
"step": 2520
},
{
"epoch": 0.1409153952843273,
"grad_norm": 0.3368714153766632,
"learning_rate": 3.522884882108183e-07,
"loss": 0.8326,
"step": 2540
},
{
"epoch": 0.1420249653259362,
"grad_norm": 0.6457244753837585,
"learning_rate": 3.550624133148405e-07,
"loss": 0.8638,
"step": 2560
},
{
"epoch": 0.14313453536754508,
"grad_norm": 0.5497669577598572,
"learning_rate": 3.578363384188627e-07,
"loss": 0.8049,
"step": 2580
},
{
"epoch": 0.14424410540915394,
"grad_norm": 0.5958977341651917,
"learning_rate": 3.6061026352288487e-07,
"loss": 0.8107,
"step": 2600
},
{
"epoch": 0.14535367545076283,
"grad_norm": 0.5878711938858032,
"learning_rate": 3.6338418862690704e-07,
"loss": 0.8011,
"step": 2620
},
{
"epoch": 0.14646324549237172,
"grad_norm": 0.4262014627456665,
"learning_rate": 3.6615811373092927e-07,
"loss": 0.898,
"step": 2640
},
{
"epoch": 0.14757281553398058,
"grad_norm": 0.7306149005889893,
"learning_rate": 3.6893203883495144e-07,
"loss": 0.7726,
"step": 2660
},
{
"epoch": 0.14868238557558947,
"grad_norm": 0.32822510600090027,
"learning_rate": 3.717059639389736e-07,
"loss": 0.775,
"step": 2680
},
{
"epoch": 0.14979195561719832,
"grad_norm": 0.41548779606819153,
"learning_rate": 3.7447988904299584e-07,
"loss": 0.7613,
"step": 2700
},
{
"epoch": 0.1509015256588072,
"grad_norm": 1.1839288473129272,
"learning_rate": 3.77253814147018e-07,
"loss": 0.8496,
"step": 2720
},
{
"epoch": 0.1520110957004161,
"grad_norm": 0.5219757556915283,
"learning_rate": 3.800277392510402e-07,
"loss": 0.7402,
"step": 2740
},
{
"epoch": 0.15312066574202496,
"grad_norm": 0.8173393607139587,
"learning_rate": 3.828016643550624e-07,
"loss": 0.7204,
"step": 2760
},
{
"epoch": 0.15423023578363385,
"grad_norm": 0.49754881858825684,
"learning_rate": 3.855755894590846e-07,
"loss": 0.7716,
"step": 2780
},
{
"epoch": 0.1553398058252427,
"grad_norm": 0.39697808027267456,
"learning_rate": 3.8834951456310675e-07,
"loss": 0.7791,
"step": 2800
},
{
"epoch": 0.1564493758668516,
"grad_norm": 0.6214376091957092,
"learning_rate": 3.91123439667129e-07,
"loss": 0.6724,
"step": 2820
},
{
"epoch": 0.15755894590846048,
"grad_norm": 0.6486151218414307,
"learning_rate": 3.9389736477115115e-07,
"loss": 0.8015,
"step": 2840
},
{
"epoch": 0.15866851595006934,
"grad_norm": 0.5499553084373474,
"learning_rate": 3.966712898751733e-07,
"loss": 0.7871,
"step": 2860
},
{
"epoch": 0.15977808599167823,
"grad_norm": 0.8797757029533386,
"learning_rate": 3.9944521497919555e-07,
"loss": 0.7183,
"step": 2880
},
{
"epoch": 0.1608876560332871,
"grad_norm": 0.47135302424430847,
"learning_rate": 4.022191400832177e-07,
"loss": 0.7348,
"step": 2900
},
{
"epoch": 0.16199722607489597,
"grad_norm": 0.8005576729774475,
"learning_rate": 4.049930651872399e-07,
"loss": 0.6212,
"step": 2920
},
{
"epoch": 0.16310679611650486,
"grad_norm": 0.47837623953819275,
"learning_rate": 4.077669902912621e-07,
"loss": 0.6812,
"step": 2940
},
{
"epoch": 0.16421636615811372,
"grad_norm": 0.36638781428337097,
"learning_rate": 4.105409153952843e-07,
"loss": 0.6925,
"step": 2960
},
{
"epoch": 0.1653259361997226,
"grad_norm": 0.817538857460022,
"learning_rate": 4.1331484049930646e-07,
"loss": 0.6186,
"step": 2980
},
{
"epoch": 0.1664355062413315,
"grad_norm": 0.5090010166168213,
"learning_rate": 4.160887656033287e-07,
"loss": 0.6844,
"step": 3000
},
{
"epoch": 0.16754507628294035,
"grad_norm": 0.6102781295776367,
"learning_rate": 4.1886269070735086e-07,
"loss": 0.6943,
"step": 3020
},
{
"epoch": 0.16865464632454924,
"grad_norm": 0.8231751918792725,
"learning_rate": 4.2163661581137303e-07,
"loss": 0.6391,
"step": 3040
},
{
"epoch": 0.16976421636615813,
"grad_norm": 0.38910776376724243,
"learning_rate": 4.2441054091539526e-07,
"loss": 0.6937,
"step": 3060
},
{
"epoch": 0.170873786407767,
"grad_norm": 0.5838291049003601,
"learning_rate": 4.2718446601941743e-07,
"loss": 0.5791,
"step": 3080
},
{
"epoch": 0.17198335644937587,
"grad_norm": 0.519530177116394,
"learning_rate": 4.299583911234396e-07,
"loss": 0.7412,
"step": 3100
},
{
"epoch": 0.17309292649098473,
"grad_norm": 0.45696595311164856,
"learning_rate": 4.327323162274618e-07,
"loss": 0.6469,
"step": 3120
},
{
"epoch": 0.17420249653259362,
"grad_norm": 0.6771582961082458,
"learning_rate": 4.35506241331484e-07,
"loss": 0.6441,
"step": 3140
},
{
"epoch": 0.1753120665742025,
"grad_norm": 0.559917151927948,
"learning_rate": 4.382801664355062e-07,
"loss": 0.5778,
"step": 3160
},
{
"epoch": 0.17642163661581137,
"grad_norm": 0.9249961376190186,
"learning_rate": 4.4105409153952845e-07,
"loss": 0.6637,
"step": 3180
},
{
"epoch": 0.17753120665742025,
"grad_norm": 0.5211077928543091,
"learning_rate": 4.438280166435506e-07,
"loss": 0.7047,
"step": 3200
},
{
"epoch": 0.1786407766990291,
"grad_norm": 0.7488894462585449,
"learning_rate": 4.4660194174757285e-07,
"loss": 0.5802,
"step": 3220
},
{
"epoch": 0.179750346740638,
"grad_norm": 0.6046866774559021,
"learning_rate": 4.49375866851595e-07,
"loss": 0.6601,
"step": 3240
},
{
"epoch": 0.1808599167822469,
"grad_norm": 0.3715108036994934,
"learning_rate": 4.521497919556172e-07,
"loss": 0.6094,
"step": 3260
},
{
"epoch": 0.18196948682385575,
"grad_norm": 0.5831759572029114,
"learning_rate": 4.549237170596394e-07,
"loss": 0.6086,
"step": 3280
},
{
"epoch": 0.18307905686546463,
"grad_norm": 0.595746636390686,
"learning_rate": 4.576976421636616e-07,
"loss": 0.5916,
"step": 3300
},
{
"epoch": 0.18418862690707352,
"grad_norm": 0.48339492082595825,
"learning_rate": 4.6047156726768376e-07,
"loss": 0.6127,
"step": 3320
},
{
"epoch": 0.18529819694868238,
"grad_norm": 1.626143455505371,
"learning_rate": 4.63245492371706e-07,
"loss": 0.5437,
"step": 3340
},
{
"epoch": 0.18640776699029127,
"grad_norm": 0.3789680004119873,
"learning_rate": 4.6601941747572816e-07,
"loss": 0.6499,
"step": 3360
},
{
"epoch": 0.18751733703190013,
"grad_norm": 0.5178479552268982,
"learning_rate": 4.6879334257975033e-07,
"loss": 0.6509,
"step": 3380
},
{
"epoch": 0.18862690707350901,
"grad_norm": 0.5709561109542847,
"learning_rate": 4.7156726768377255e-07,
"loss": 0.6429,
"step": 3400
},
{
"epoch": 0.1897364771151179,
"grad_norm": 0.3643471896648407,
"learning_rate": 4.743411927877947e-07,
"loss": 0.6235,
"step": 3420
},
{
"epoch": 0.19084604715672676,
"grad_norm": 0.5804113745689392,
"learning_rate": 4.771151178918169e-07,
"loss": 0.7341,
"step": 3440
},
{
"epoch": 0.19195561719833565,
"grad_norm": 0.5089621543884277,
"learning_rate": 4.798890429958391e-07,
"loss": 0.6286,
"step": 3460
},
{
"epoch": 0.1930651872399445,
"grad_norm": 0.4625658392906189,
"learning_rate": 4.826629680998612e-07,
"loss": 0.6128,
"step": 3480
},
{
"epoch": 0.1941747572815534,
"grad_norm": 0.36961832642555237,
"learning_rate": 4.854368932038835e-07,
"loss": 0.6213,
"step": 3500
},
{
"epoch": 0.19528432732316228,
"grad_norm": 0.4466856122016907,
"learning_rate": 4.882108183079057e-07,
"loss": 0.5383,
"step": 3520
},
{
"epoch": 0.19639389736477114,
"grad_norm": 0.45287024974823,
"learning_rate": 4.909847434119279e-07,
"loss": 0.5064,
"step": 3540
},
{
"epoch": 0.19750346740638003,
"grad_norm": 0.6351368427276611,
"learning_rate": 4.9375866851595e-07,
"loss": 0.5485,
"step": 3560
},
{
"epoch": 0.19861303744798892,
"grad_norm": 0.46472978591918945,
"learning_rate": 4.965325936199722e-07,
"loss": 0.5392,
"step": 3580
},
{
"epoch": 0.19972260748959778,
"grad_norm": 0.38963034749031067,
"learning_rate": 4.993065187239944e-07,
"loss": 0.5459,
"step": 3600
},
{
"epoch": 0.20083217753120666,
"grad_norm": 2.1769580841064453,
"learning_rate": 4.999986650611594e-07,
"loss": 0.5158,
"step": 3620
},
{
"epoch": 0.20194174757281552,
"grad_norm": 0.6485143899917603,
"learning_rate": 4.999927320283929e-07,
"loss": 0.5815,
"step": 3640
},
{
"epoch": 0.2030513176144244,
"grad_norm": 0.37338986992836,
"learning_rate": 4.999820526876891e-07,
"loss": 0.5475,
"step": 3660
},
{
"epoch": 0.2041608876560333,
"grad_norm": 0.4554106593132019,
"learning_rate": 4.999666272418033e-07,
"loss": 0.547,
"step": 3680
},
{
"epoch": 0.20527045769764216,
"grad_norm": 0.3950905501842499,
"learning_rate": 4.999464559835997e-07,
"loss": 0.5561,
"step": 3700
},
{
"epoch": 0.20638002773925104,
"grad_norm": 0.28335675597190857,
"learning_rate": 4.999215392960455e-07,
"loss": 0.6461,
"step": 3720
},
{
"epoch": 0.20748959778085993,
"grad_norm": 0.3045244514942169,
"learning_rate": 4.998918776522036e-07,
"loss": 0.5206,
"step": 3740
},
{
"epoch": 0.2085991678224688,
"grad_norm": 0.4339936375617981,
"learning_rate": 4.998574716152234e-07,
"loss": 0.4728,
"step": 3760
},
{
"epoch": 0.20970873786407768,
"grad_norm": 0.39513078331947327,
"learning_rate": 4.998183218383305e-07,
"loss": 0.5485,
"step": 3780
},
{
"epoch": 0.21081830790568654,
"grad_norm": 0.40521058440208435,
"learning_rate": 4.997744290648143e-07,
"loss": 0.6388,
"step": 3800
},
{
"epoch": 0.21192787794729542,
"grad_norm": 0.3975263833999634,
"learning_rate": 4.997257941280133e-07,
"loss": 0.5521,
"step": 3820
},
{
"epoch": 0.2130374479889043,
"grad_norm": 0.3691408634185791,
"learning_rate": 4.996724179512999e-07,
"loss": 0.5293,
"step": 3840
},
{
"epoch": 0.21414701803051317,
"grad_norm": 0.30931538343429565,
"learning_rate": 4.996143015480629e-07,
"loss": 0.6779,
"step": 3860
},
{
"epoch": 0.21525658807212206,
"grad_norm": 0.4070769250392914,
"learning_rate": 4.995514460216873e-07,
"loss": 0.4724,
"step": 3880
},
{
"epoch": 0.21636615811373092,
"grad_norm": 0.34178927540779114,
"learning_rate": 4.994838525655349e-07,
"loss": 0.4932,
"step": 3900
},
{
"epoch": 0.2174757281553398,
"grad_norm": 0.3776053190231323,
"learning_rate": 4.994115224629204e-07,
"loss": 0.513,
"step": 3920
},
{
"epoch": 0.2185852981969487,
"grad_norm": 0.3009076416492462,
"learning_rate": 4.993344570870874e-07,
"loss": 0.4694,
"step": 3940
},
{
"epoch": 0.21969486823855755,
"grad_norm": 0.24924315512180328,
"learning_rate": 4.992526579011823e-07,
"loss": 0.5135,
"step": 3960
},
{
"epoch": 0.22080443828016644,
"grad_norm": 0.39235708117485046,
"learning_rate": 4.991661264582271e-07,
"loss": 0.5608,
"step": 3980
},
{
"epoch": 0.22191400832177532,
"grad_norm": 0.34350383281707764,
"learning_rate": 4.990748644010888e-07,
"loss": 0.5201,
"step": 4000
},
{
"epoch": 0.22302357836338418,
"grad_norm": 0.5332874059677124,
"learning_rate": 4.989788734624492e-07,
"loss": 0.5994,
"step": 4020
},
{
"epoch": 0.22413314840499307,
"grad_norm": 0.32011643052101135,
"learning_rate": 4.988781554647714e-07,
"loss": 0.5103,
"step": 4040
},
{
"epoch": 0.22524271844660193,
"grad_norm": 0.37372103333473206,
"learning_rate": 4.987727123202655e-07,
"loss": 0.5483,
"step": 4060
},
{
"epoch": 0.22635228848821082,
"grad_norm": 0.3511541187763214,
"learning_rate": 4.986625460308524e-07,
"loss": 0.5508,
"step": 4080
},
{
"epoch": 0.2274618585298197,
"grad_norm": 0.33943256735801697,
"learning_rate": 4.985476586881254e-07,
"loss": 0.5437,
"step": 4100
},
{
"epoch": 0.22857142857142856,
"grad_norm": 0.5390433669090271,
"learning_rate": 4.984280524733107e-07,
"loss": 0.5161,
"step": 4120
},
{
"epoch": 0.22968099861303745,
"grad_norm": 0.40753117203712463,
"learning_rate": 4.983037296572259e-07,
"loss": 0.4993,
"step": 4140
},
{
"epoch": 0.2307905686546463,
"grad_norm": 0.34956789016723633,
"learning_rate": 4.981746926002372e-07,
"loss": 0.613,
"step": 4160
},
{
"epoch": 0.2319001386962552,
"grad_norm": 0.3175369203090668,
"learning_rate": 4.980409437522143e-07,
"loss": 0.5396,
"step": 4180
},
{
"epoch": 0.23300970873786409,
"grad_norm": 0.30835628509521484,
"learning_rate": 4.979024856524839e-07,
"loss": 0.5407,
"step": 4200
},
{
"epoch": 0.23411927877947294,
"grad_norm": 0.3639371693134308,
"learning_rate": 4.977593209297814e-07,
"loss": 0.5457,
"step": 4220
},
{
"epoch": 0.23522884882108183,
"grad_norm": 0.4240809381008148,
"learning_rate": 4.976114523022015e-07,
"loss": 0.5323,
"step": 4240
},
{
"epoch": 0.23633841886269072,
"grad_norm": 0.509510338306427,
"learning_rate": 4.974588825771457e-07,
"loss": 0.5374,
"step": 4260
},
{
"epoch": 0.23744798890429958,
"grad_norm": 0.3038425147533417,
"learning_rate": 4.9730161465127e-07,
"loss": 0.5041,
"step": 4280
},
{
"epoch": 0.23855755894590847,
"grad_norm": 0.33369964361190796,
"learning_rate": 4.971396515104292e-07,
"loss": 0.601,
"step": 4300
},
{
"epoch": 0.23966712898751732,
"grad_norm": 0.4764558970928192,
"learning_rate": 4.969729962296203e-07,
"loss": 0.6066,
"step": 4320
},
{
"epoch": 0.2407766990291262,
"grad_norm": 0.4532679319381714,
"learning_rate": 4.968016519729246e-07,
"loss": 0.5999,
"step": 4340
},
{
"epoch": 0.2418862690707351,
"grad_norm": 0.39469200372695923,
"learning_rate": 4.966256219934471e-07,
"loss": 0.498,
"step": 4360
},
{
"epoch": 0.24299583911234396,
"grad_norm": 0.4191853702068329,
"learning_rate": 4.964449096332547e-07,
"loss": 0.5246,
"step": 4380
},
{
"epoch": 0.24410540915395285,
"grad_norm": 0.3406555950641632,
"learning_rate": 4.962595183233133e-07,
"loss": 0.6331,
"step": 4400
},
{
"epoch": 0.24521497919556173,
"grad_norm": 0.32338958978652954,
"learning_rate": 4.960694515834224e-07,
"loss": 0.5389,
"step": 4420
},
{
"epoch": 0.2463245492371706,
"grad_norm": 0.8198554515838623,
"learning_rate": 4.958747130221477e-07,
"loss": 0.5678,
"step": 4440
},
{
"epoch": 0.24743411927877948,
"grad_norm": 0.38297727704048157,
"learning_rate": 4.956753063367537e-07,
"loss": 0.4682,
"step": 4460
},
{
"epoch": 0.24854368932038834,
"grad_norm": 0.36951327323913574,
"learning_rate": 4.954712353131323e-07,
"loss": 0.4903,
"step": 4480
},
{
"epoch": 0.24965325936199723,
"grad_norm": 0.30048689246177673,
"learning_rate": 4.952625038257321e-07,
"loss": 0.6061,
"step": 4500
},
{
"epoch": 0.2507628294036061,
"grad_norm": 0.2912724018096924,
"learning_rate": 4.950491158374837e-07,
"loss": 0.565,
"step": 4520
},
{
"epoch": 0.251872399445215,
"grad_norm": 0.5042518377304077,
"learning_rate": 4.948310753997254e-07,
"loss": 0.5231,
"step": 4540
},
{
"epoch": 0.25298196948682383,
"grad_norm": 0.31980255246162415,
"learning_rate": 4.94608386652126e-07,
"loss": 0.5077,
"step": 4560
},
{
"epoch": 0.2540915395284327,
"grad_norm": 0.2944648861885071,
"learning_rate": 4.943810538226056e-07,
"loss": 0.4751,
"step": 4580
},
{
"epoch": 0.2552011095700416,
"grad_norm": 0.24331466853618622,
"learning_rate": 4.941490812272563e-07,
"loss": 0.5061,
"step": 4600
},
{
"epoch": 0.2563106796116505,
"grad_norm": 0.308380663394928,
"learning_rate": 4.939124732702595e-07,
"loss": 0.5207,
"step": 4620
},
{
"epoch": 0.2574202496532594,
"grad_norm": 0.3826179802417755,
"learning_rate": 4.936712344438028e-07,
"loss": 0.5081,
"step": 4640
},
{
"epoch": 0.2585298196948682,
"grad_norm": 0.41639629006385803,
"learning_rate": 4.934253693279943e-07,
"loss": 0.5334,
"step": 4660
},
{
"epoch": 0.2596393897364771,
"grad_norm": 0.38280215859413147,
"learning_rate": 4.931748825907759e-07,
"loss": 0.5957,
"step": 4680
},
{
"epoch": 0.260748959778086,
"grad_norm": 0.3500733971595764,
"learning_rate": 4.929197789878347e-07,
"loss": 0.5426,
"step": 4700
},
{
"epoch": 0.2618585298196949,
"grad_norm": 0.4630734324455261,
"learning_rate": 4.926600633625126e-07,
"loss": 0.539,
"step": 4720
},
{
"epoch": 0.26296809986130376,
"grad_norm": 0.3602588474750519,
"learning_rate": 4.92395740645714e-07,
"loss": 0.4219,
"step": 4740
},
{
"epoch": 0.26407766990291265,
"grad_norm": 0.28921574354171753,
"learning_rate": 4.92126815855813e-07,
"loss": 0.5068,
"step": 4760
},
{
"epoch": 0.2651872399445215,
"grad_norm": 0.298486590385437,
"learning_rate": 4.918532940985576e-07,
"loss": 0.5365,
"step": 4780
},
{
"epoch": 0.26629680998613037,
"grad_norm": 0.32068467140197754,
"learning_rate": 4.915751805669725e-07,
"loss": 0.5623,
"step": 4800
},
{
"epoch": 0.26740638002773925,
"grad_norm": 0.2494667023420334,
"learning_rate": 4.912924805412613e-07,
"loss": 0.5911,
"step": 4820
},
{
"epoch": 0.26851595006934814,
"grad_norm": 0.3729526698589325,
"learning_rate": 4.910051993887053e-07,
"loss": 0.6284,
"step": 4840
},
{
"epoch": 0.26962552011095703,
"grad_norm": 0.36454734206199646,
"learning_rate": 4.907133425635625e-07,
"loss": 0.5695,
"step": 4860
},
{
"epoch": 0.27073509015256586,
"grad_norm": 0.3649137318134308,
"learning_rate": 4.904169156069633e-07,
"loss": 0.5287,
"step": 4880
},
{
"epoch": 0.27184466019417475,
"grad_norm": 0.44556987285614014,
"learning_rate": 4.90115924146806e-07,
"loss": 0.5561,
"step": 4900
},
{
"epoch": 0.27295423023578363,
"grad_norm": 0.3902558386325836,
"learning_rate": 4.898103738976491e-07,
"loss": 0.5358,
"step": 4920
},
{
"epoch": 0.2740638002773925,
"grad_norm": 0.38199660181999207,
"learning_rate": 4.895002706606037e-07,
"loss": 0.5221,
"step": 4940
},
{
"epoch": 0.2751733703190014,
"grad_norm": 0.7920161485671997,
"learning_rate": 4.891856203232228e-07,
"loss": 0.552,
"step": 4960
},
{
"epoch": 0.27628294036061024,
"grad_norm": 0.40676286816596985,
"learning_rate": 4.888664288593896e-07,
"loss": 0.563,
"step": 4980
},
{
"epoch": 0.27739251040221913,
"grad_norm": 0.2597495913505554,
"learning_rate": 4.885427023292043e-07,
"loss": 0.5276,
"step": 5000
},
{
"epoch": 0.278502080443828,
"grad_norm": 0.408184289932251,
"learning_rate": 4.882144468788685e-07,
"loss": 0.4505,
"step": 5020
},
{
"epoch": 0.2796116504854369,
"grad_norm": 0.31909486651420593,
"learning_rate": 4.878816687405694e-07,
"loss": 0.5883,
"step": 5040
},
{
"epoch": 0.2807212205270458,
"grad_norm": 0.6439054012298584,
"learning_rate": 4.875443742323607e-07,
"loss": 0.5181,
"step": 5060
},
{
"epoch": 0.2818307905686546,
"grad_norm": 0.3375921845436096,
"learning_rate": 4.872025697580431e-07,
"loss": 0.5607,
"step": 5080
},
{
"epoch": 0.2829403606102635,
"grad_norm": 0.30383211374282837,
"learning_rate": 4.868562618070422e-07,
"loss": 0.517,
"step": 5100
},
{
"epoch": 0.2840499306518724,
"grad_norm": 0.40573009848594666,
"learning_rate": 4.865054569542859e-07,
"loss": 0.5974,
"step": 5120
},
{
"epoch": 0.2851595006934813,
"grad_norm": 0.2704174518585205,
"learning_rate": 4.861501618600794e-07,
"loss": 0.4676,
"step": 5140
},
{
"epoch": 0.28626907073509017,
"grad_norm": 0.30422112345695496,
"learning_rate": 4.857903832699784e-07,
"loss": 0.5631,
"step": 5160
},
{
"epoch": 0.287378640776699,
"grad_norm": 0.48548394441604614,
"learning_rate": 4.854261280146615e-07,
"loss": 0.6646,
"step": 5180
},
{
"epoch": 0.2884882108183079,
"grad_norm": 0.49318018555641174,
"learning_rate": 4.850574030097999e-07,
"loss": 0.5939,
"step": 5200
},
{
"epoch": 0.2895977808599168,
"grad_norm": 0.369650661945343,
"learning_rate": 4.846842152559272e-07,
"loss": 0.5602,
"step": 5220
},
{
"epoch": 0.29070735090152566,
"grad_norm": 0.5085413455963135,
"learning_rate": 4.843065718383051e-07,
"loss": 0.5528,
"step": 5240
},
{
"epoch": 0.29181692094313455,
"grad_norm": 0.30005863308906555,
"learning_rate": 4.839244799267899e-07,
"loss": 0.5668,
"step": 5260
},
{
"epoch": 0.29292649098474344,
"grad_norm": 1.0485389232635498,
"learning_rate": 4.83537946775696e-07,
"loss": 0.5668,
"step": 5280
},
{
"epoch": 0.29403606102635227,
"grad_norm": 0.38798120617866516,
"learning_rate": 4.831469797236582e-07,
"loss": 0.5526,
"step": 5300
},
{
"epoch": 0.29514563106796116,
"grad_norm": 0.42610159516334534,
"learning_rate": 4.827515861934924e-07,
"loss": 0.5549,
"step": 5320
},
{
"epoch": 0.29625520110957004,
"grad_norm": 0.2633446753025055,
"learning_rate": 4.823517736920546e-07,
"loss": 0.5283,
"step": 5340
},
{
"epoch": 0.29736477115117893,
"grad_norm": 0.4579598903656006,
"learning_rate": 4.819475498100985e-07,
"loss": 0.5362,
"step": 5360
},
{
"epoch": 0.2984743411927878,
"grad_norm": 0.39916613698005676,
"learning_rate": 4.815389222221313e-07,
"loss": 0.4562,
"step": 5380
},
{
"epoch": 0.29958391123439665,
"grad_norm": 0.559020459651947,
"learning_rate": 4.81125898686268e-07,
"loss": 0.481,
"step": 5400
},
{
"epoch": 0.30069348127600554,
"grad_norm": 0.40768253803253174,
"learning_rate": 4.80708487044084e-07,
"loss": 0.5346,
"step": 5420
},
{
"epoch": 0.3018030513176144,
"grad_norm": 0.22490708529949188,
"learning_rate": 4.802866952204667e-07,
"loss": 0.5692,
"step": 5440
},
{
"epoch": 0.3029126213592233,
"grad_norm": 0.3096957504749298,
"learning_rate": 4.798605312234643e-07,
"loss": 0.5559,
"step": 5460
},
{
"epoch": 0.3040221914008322,
"grad_norm": 0.3028647303581238,
"learning_rate": 4.794300031441342e-07,
"loss": 0.5313,
"step": 5480
},
{
"epoch": 0.30513176144244103,
"grad_norm": 0.7425801753997803,
"learning_rate": 4.789951191563895e-07,
"loss": 0.4875,
"step": 5500
},
{
"epoch": 0.3062413314840499,
"grad_norm": 0.25657814741134644,
"learning_rate": 4.785558875168434e-07,
"loss": 0.4611,
"step": 5520
},
{
"epoch": 0.3073509015256588,
"grad_norm": 0.374449759721756,
"learning_rate": 4.781123165646529e-07,
"loss": 0.5818,
"step": 5540
},
{
"epoch": 0.3084604715672677,
"grad_norm": 0.3681221008300781,
"learning_rate": 4.776644147213602e-07,
"loss": 0.4757,
"step": 5560
},
{
"epoch": 0.3095700416088766,
"grad_norm": 0.5280266404151917,
"learning_rate": 4.772121904907328e-07,
"loss": 0.4936,
"step": 5580
},
{
"epoch": 0.3106796116504854,
"grad_norm": 0.447544664144516,
"learning_rate": 4.7675565245860195e-07,
"loss": 0.5231,
"step": 5600
},
{
"epoch": 0.3117891816920943,
"grad_norm": 0.33150920271873474,
"learning_rate": 4.7629480929270014e-07,
"loss": 0.5644,
"step": 5620
},
{
"epoch": 0.3128987517337032,
"grad_norm": 0.3428071141242981,
"learning_rate": 4.7582966974249607e-07,
"loss": 0.6091,
"step": 5640
},
{
"epoch": 0.31400832177531207,
"grad_norm": 0.4063955545425415,
"learning_rate": 4.753602426390285e-07,
"loss": 0.5079,
"step": 5660
},
{
"epoch": 0.31511789181692096,
"grad_norm": 0.25844889879226685,
"learning_rate": 4.7488653689473903e-07,
"loss": 0.6156,
"step": 5680
},
{
"epoch": 0.31622746185852985,
"grad_norm": 0.2735048234462738,
"learning_rate": 4.744085615033023e-07,
"loss": 0.5386,
"step": 5700
},
{
"epoch": 0.3173370319001387,
"grad_norm": 0.49888360500335693,
"learning_rate": 4.739263255394559e-07,
"loss": 0.5374,
"step": 5720
},
{
"epoch": 0.31844660194174756,
"grad_norm": 0.26587429642677307,
"learning_rate": 4.734398381588274e-07,
"loss": 0.5424,
"step": 5740
},
{
"epoch": 0.31955617198335645,
"grad_norm": 0.28447648882865906,
"learning_rate": 4.7294910859776095e-07,
"loss": 0.5161,
"step": 5760
},
{
"epoch": 0.32066574202496534,
"grad_norm": 0.3417870104312897,
"learning_rate": 4.7245414617314193e-07,
"loss": 0.4308,
"step": 5780
},
{
"epoch": 0.3217753120665742,
"grad_norm": 0.3430401086807251,
"learning_rate": 4.719549602822199e-07,
"loss": 0.5222,
"step": 5800
},
{
"epoch": 0.32288488210818306,
"grad_norm": 0.31506893038749695,
"learning_rate": 4.7145156040243017e-07,
"loss": 0.4937,
"step": 5820
},
{
"epoch": 0.32399445214979194,
"grad_norm": 0.327404648065567,
"learning_rate": 4.709439560912139e-07,
"loss": 0.5163,
"step": 5840
},
{
"epoch": 0.32510402219140083,
"grad_norm": 0.291507750749588,
"learning_rate": 4.704321569858368e-07,
"loss": 0.4774,
"step": 5860
},
{
"epoch": 0.3262135922330097,
"grad_norm": 0.2654714286327362,
"learning_rate": 4.6991617280320614e-07,
"loss": 0.4485,
"step": 5880
},
{
"epoch": 0.3273231622746186,
"grad_norm": 0.38568347692489624,
"learning_rate": 4.6939601333968583e-07,
"loss": 0.5054,
"step": 5900
},
{
"epoch": 0.32843273231622744,
"grad_norm": 0.28820493817329407,
"learning_rate": 4.6887168847091085e-07,
"loss": 0.5271,
"step": 5920
},
{
"epoch": 0.3295423023578363,
"grad_norm": 0.2675837576389313,
"learning_rate": 4.683432081516e-07,
"loss": 0.4915,
"step": 5940
},
{
"epoch": 0.3306518723994452,
"grad_norm": 0.3644208014011383,
"learning_rate": 4.678105824153662e-07,
"loss": 0.5216,
"step": 5960
},
{
"epoch": 0.3317614424410541,
"grad_norm": 0.29700183868408203,
"learning_rate": 4.6727382137452644e-07,
"loss": 0.4904,
"step": 5980
},
{
"epoch": 0.332871012482663,
"grad_norm": 0.25862252712249756,
"learning_rate": 4.6673293521990966e-07,
"loss": 0.516,
"step": 6000
},
{
"epoch": 0.3339805825242718,
"grad_norm": 0.31422197818756104,
"learning_rate": 4.661879342206636e-07,
"loss": 0.4196,
"step": 6020
},
{
"epoch": 0.3350901525658807,
"grad_norm": 0.3729874789714813,
"learning_rate": 4.6563882872405924e-07,
"loss": 0.5395,
"step": 6040
},
{
"epoch": 0.3361997226074896,
"grad_norm": 0.7892670035362244,
"learning_rate": 4.650856291552948e-07,
"loss": 0.4989,
"step": 6060
},
{
"epoch": 0.3373092926490985,
"grad_norm": 0.33878469467163086,
"learning_rate": 4.645283460172976e-07,
"loss": 0.5837,
"step": 6080
},
{
"epoch": 0.33841886269070737,
"grad_norm": 0.37014704942703247,
"learning_rate": 4.6396698989052473e-07,
"loss": 0.4183,
"step": 6100
},
{
"epoch": 0.33952843273231625,
"grad_norm": 0.3007761240005493,
"learning_rate": 4.6340157143276233e-07,
"loss": 0.4898,
"step": 6120
},
{
"epoch": 0.3406380027739251,
"grad_norm": 1.216626524925232,
"learning_rate": 4.628321013789228e-07,
"loss": 0.5657,
"step": 6140
},
{
"epoch": 0.341747572815534,
"grad_norm": 0.4057266414165497,
"learning_rate": 4.622585905408414e-07,
"loss": 0.5154,
"step": 6160
},
{
"epoch": 0.34285714285714286,
"grad_norm": 0.2979249358177185,
"learning_rate": 4.6168104980707103e-07,
"loss": 0.5022,
"step": 6180
},
{
"epoch": 0.34396671289875175,
"grad_norm": 0.4504467248916626,
"learning_rate": 4.6109949014267494e-07,
"loss": 0.4424,
"step": 6200
},
{
"epoch": 0.34507628294036063,
"grad_norm": 0.32083943486213684,
"learning_rate": 4.605139225890192e-07,
"loss": 0.6416,
"step": 6220
},
{
"epoch": 0.34618585298196947,
"grad_norm": 0.2717413604259491,
"learning_rate": 4.5992435826356286e-07,
"loss": 0.5305,
"step": 6240
},
{
"epoch": 0.34729542302357835,
"grad_norm": 0.27425190806388855,
"learning_rate": 4.593308083596464e-07,
"loss": 0.5205,
"step": 6260
},
{
"epoch": 0.34840499306518724,
"grad_norm": 0.3109380304813385,
"learning_rate": 4.587332841462802e-07,
"loss": 0.4805,
"step": 6280
},
{
"epoch": 0.34951456310679613,
"grad_norm": 0.29059240221977234,
"learning_rate": 4.581317969679296e-07,
"loss": 0.5676,
"step": 6300
},
{
"epoch": 0.350624133148405,
"grad_norm": 0.2471311241388321,
"learning_rate": 4.575263582443e-07,
"loss": 0.5058,
"step": 6320
},
{
"epoch": 0.35173370319001385,
"grad_norm": 0.47161543369293213,
"learning_rate": 4.5691697947012016e-07,
"loss": 0.5187,
"step": 6340
},
{
"epoch": 0.35284327323162273,
"grad_norm": 0.3279658257961273,
"learning_rate": 4.563036722149236e-07,
"loss": 0.504,
"step": 6360
},
{
"epoch": 0.3539528432732316,
"grad_norm": 0.365337610244751,
"learning_rate": 4.556864481228293e-07,
"loss": 0.5314,
"step": 6380
},
{
"epoch": 0.3550624133148405,
"grad_norm": 0.3830949664115906,
"learning_rate": 4.5506531891232036e-07,
"loss": 0.4771,
"step": 6400
},
{
"epoch": 0.3561719833564494,
"grad_norm": 0.3073193430900574,
"learning_rate": 4.5444029637602154e-07,
"loss": 0.6175,
"step": 6420
},
{
"epoch": 0.3572815533980582,
"grad_norm": 0.30972909927368164,
"learning_rate": 4.5381139238047553e-07,
"loss": 0.5965,
"step": 6440
},
{
"epoch": 0.3583911234396671,
"grad_norm": 0.35211724042892456,
"learning_rate": 4.531786188659177e-07,
"loss": 0.4085,
"step": 6460
},
{
"epoch": 0.359500693481276,
"grad_norm": 0.3817974030971527,
"learning_rate": 4.525419878460489e-07,
"loss": 0.4394,
"step": 6480
},
{
"epoch": 0.3606102635228849,
"grad_norm": 0.26062262058258057,
"learning_rate": 4.519015114078082e-07,
"loss": 0.5138,
"step": 6500
},
{
"epoch": 0.3617198335644938,
"grad_norm": 0.26502180099487305,
"learning_rate": 4.5125720171114265e-07,
"loss": 0.5103,
"step": 6520
},
{
"epoch": 0.36282940360610266,
"grad_norm": 0.3360140025615692,
"learning_rate": 4.506090709887767e-07,
"loss": 0.5787,
"step": 6540
},
{
"epoch": 0.3639389736477115,
"grad_norm": 0.4174387454986572,
"learning_rate": 4.4995713154598014e-07,
"loss": 0.5905,
"step": 6560
},
{
"epoch": 0.3650485436893204,
"grad_norm": 0.30797651410102844,
"learning_rate": 4.493013957603342e-07,
"loss": 0.5341,
"step": 6580
},
{
"epoch": 0.36615811373092927,
"grad_norm": 0.29396864771842957,
"learning_rate": 4.4864187608149664e-07,
"loss": 0.5531,
"step": 6600
},
{
"epoch": 0.36726768377253816,
"grad_norm": 0.4340313971042633,
"learning_rate": 4.4797858503096553e-07,
"loss": 0.5408,
"step": 6620
},
{
"epoch": 0.36837725381414704,
"grad_norm": 0.33175596594810486,
"learning_rate": 4.473115352018412e-07,
"loss": 0.5338,
"step": 6640
},
{
"epoch": 0.3694868238557559,
"grad_norm": 0.2598438262939453,
"learning_rate": 4.4664073925858737e-07,
"loss": 0.4943,
"step": 6660
},
{
"epoch": 0.37059639389736476,
"grad_norm": 0.42363619804382324,
"learning_rate": 4.459662099367908e-07,
"loss": 0.5188,
"step": 6680
},
{
"epoch": 0.37170596393897365,
"grad_norm": 0.30839937925338745,
"learning_rate": 4.4528796004291937e-07,
"loss": 0.551,
"step": 6700
},
{
"epoch": 0.37281553398058254,
"grad_norm": 0.39423060417175293,
"learning_rate": 4.4460600245407876e-07,
"loss": 0.5298,
"step": 6720
},
{
"epoch": 0.3739251040221914,
"grad_norm": 0.2767972946166992,
"learning_rate": 4.439203501177683e-07,
"loss": 0.4744,
"step": 6740
},
{
"epoch": 0.37503467406380026,
"grad_norm": 0.3349682092666626,
"learning_rate": 4.432310160516348e-07,
"loss": 0.6472,
"step": 6760
},
{
"epoch": 0.37614424410540914,
"grad_norm": 0.2644417881965637,
"learning_rate": 4.42538013343226e-07,
"loss": 0.4105,
"step": 6780
},
{
"epoch": 0.37725381414701803,
"grad_norm": 0.5239447951316833,
"learning_rate": 4.4184135514974117e-07,
"loss": 0.5414,
"step": 6800
},
{
"epoch": 0.3783633841886269,
"grad_norm": 0.363673597574234,
"learning_rate": 4.411410546977823e-07,
"loss": 0.6091,
"step": 6820
},
{
"epoch": 0.3794729542302358,
"grad_norm": 0.39377179741859436,
"learning_rate": 4.4043712528310217e-07,
"loss": 0.4794,
"step": 6840
},
{
"epoch": 0.38058252427184464,
"grad_norm": 0.3427687883377075,
"learning_rate": 4.397295802703523e-07,
"loss": 0.532,
"step": 6860
},
{
"epoch": 0.3816920943134535,
"grad_norm": 0.3745235502719879,
"learning_rate": 4.390184330928295e-07,
"loss": 0.5059,
"step": 6880
},
{
"epoch": 0.3828016643550624,
"grad_norm": 0.38224560022354126,
"learning_rate": 4.3830369725222017e-07,
"loss": 0.6141,
"step": 6900
},
{
"epoch": 0.3839112343966713,
"grad_norm": 0.4920729696750641,
"learning_rate": 4.375853863183443e-07,
"loss": 0.5215,
"step": 6920
},
{
"epoch": 0.3850208044382802,
"grad_norm": 0.3441776633262634,
"learning_rate": 4.3686351392889793e-07,
"loss": 0.4538,
"step": 6940
},
{
"epoch": 0.386130374479889,
"grad_norm": 0.3994586765766144,
"learning_rate": 4.361380937891942e-07,
"loss": 0.4517,
"step": 6960
},
{
"epoch": 0.3872399445214979,
"grad_norm": 0.38988494873046875,
"learning_rate": 4.3540913967190286e-07,
"loss": 0.4544,
"step": 6980
},
{
"epoch": 0.3883495145631068,
"grad_norm": 0.31946954131126404,
"learning_rate": 4.346766654167893e-07,
"loss": 0.4662,
"step": 7000
},
{
"epoch": 0.3894590846047157,
"grad_norm": 0.336331844329834,
"learning_rate": 4.33940684930451e-07,
"loss": 0.4657,
"step": 7020
},
{
"epoch": 0.39056865464632456,
"grad_norm": 0.35121986269950867,
"learning_rate": 4.3320121218605454e-07,
"loss": 0.4843,
"step": 7040
},
{
"epoch": 0.39167822468793345,
"grad_norm": 0.32553473114967346,
"learning_rate": 4.324582612230694e-07,
"loss": 0.5287,
"step": 7060
},
{
"epoch": 0.3927877947295423,
"grad_norm": 0.2604405879974365,
"learning_rate": 4.3171184614700185e-07,
"loss": 0.5274,
"step": 7080
},
{
"epoch": 0.39389736477115117,
"grad_norm": 0.2987576723098755,
"learning_rate": 4.309619811291271e-07,
"loss": 0.4328,
"step": 7100
},
{
"epoch": 0.39500693481276006,
"grad_norm": 0.3672533631324768,
"learning_rate": 4.3020868040622023e-07,
"loss": 0.5229,
"step": 7120
},
{
"epoch": 0.39611650485436894,
"grad_norm": 0.46902307868003845,
"learning_rate": 4.294519582802857e-07,
"loss": 0.5167,
"step": 7140
},
{
"epoch": 0.39722607489597783,
"grad_norm": 0.5896915793418884,
"learning_rate": 4.2869182911828627e-07,
"loss": 0.5236,
"step": 7160
},
{
"epoch": 0.39833564493758666,
"grad_norm": 0.31331950426101685,
"learning_rate": 4.2792830735186976e-07,
"loss": 0.58,
"step": 7180
},
{
"epoch": 0.39944521497919555,
"grad_norm": 0.4323517382144928,
"learning_rate": 4.2716140747709516e-07,
"loss": 0.4798,
"step": 7200
},
{
"epoch": 0.40055478502080444,
"grad_norm": 0.24910062551498413,
"learning_rate": 4.2639114405415777e-07,
"loss": 0.5023,
"step": 7220
},
{
"epoch": 0.4016643550624133,
"grad_norm": 0.3129631578922272,
"learning_rate": 4.256175317071122e-07,
"loss": 0.5323,
"step": 7240
},
{
"epoch": 0.4027739251040222,
"grad_norm": 0.30831965804100037,
"learning_rate": 4.248405851235952e-07,
"loss": 0.6024,
"step": 7260
},
{
"epoch": 0.40388349514563104,
"grad_norm": 0.2090596854686737,
"learning_rate": 4.2406031905454664e-07,
"loss": 0.5647,
"step": 7280
},
{
"epoch": 0.40499306518723993,
"grad_norm": 0.3623282015323639,
"learning_rate": 4.2327674831392923e-07,
"loss": 0.4927,
"step": 7300
},
{
"epoch": 0.4061026352288488,
"grad_norm": 0.2708721458911896,
"learning_rate": 4.2248988777844756e-07,
"loss": 0.4644,
"step": 7320
},
{
"epoch": 0.4072122052704577,
"grad_norm": 0.3101958930492401,
"learning_rate": 4.216997523872656e-07,
"loss": 0.4676,
"step": 7340
},
{
"epoch": 0.4083217753120666,
"grad_norm": 0.34548845887184143,
"learning_rate": 4.2090635714172295e-07,
"loss": 0.4972,
"step": 7360
},
{
"epoch": 0.4094313453536754,
"grad_norm": 0.2833360433578491,
"learning_rate": 4.2010971710505024e-07,
"loss": 0.5035,
"step": 7380
},
{
"epoch": 0.4105409153952843,
"grad_norm": 0.4793650805950165,
"learning_rate": 4.1930984740208277e-07,
"loss": 0.5244,
"step": 7400
},
{
"epoch": 0.4116504854368932,
"grad_norm": 0.25977978110313416,
"learning_rate": 4.185067632189737e-07,
"loss": 0.4568,
"step": 7420
},
{
"epoch": 0.4127600554785021,
"grad_norm": 0.6289175152778625,
"learning_rate": 4.177004798029058e-07,
"loss": 0.4981,
"step": 7440
},
{
"epoch": 0.413869625520111,
"grad_norm": 0.33281320333480835,
"learning_rate": 4.1689101246180134e-07,
"loss": 0.5826,
"step": 7460
},
{
"epoch": 0.41497919556171986,
"grad_norm": 0.39740175008773804,
"learning_rate": 4.1607837656403245e-07,
"loss": 0.4544,
"step": 7480
},
{
"epoch": 0.4160887656033287,
"grad_norm": 0.24023501574993134,
"learning_rate": 4.1526258753812833e-07,
"loss": 0.5676,
"step": 7500
},
{
"epoch": 0.4171983356449376,
"grad_norm": 0.3031497001647949,
"learning_rate": 4.1444366087248304e-07,
"loss": 0.4852,
"step": 7520
},
{
"epoch": 0.41830790568654647,
"grad_norm": 1.512039303779602,
"learning_rate": 4.136216121150611e-07,
"loss": 0.6114,
"step": 7540
},
{
"epoch": 0.41941747572815535,
"grad_norm": 0.3467373549938202,
"learning_rate": 4.1279645687310245e-07,
"loss": 0.4715,
"step": 7560
},
{
"epoch": 0.42052704576976424,
"grad_norm": 0.2934076189994812,
"learning_rate": 4.11968210812826e-07,
"loss": 0.4977,
"step": 7580
},
{
"epoch": 0.42163661581137307,
"grad_norm": 0.3921898305416107,
"learning_rate": 4.111368896591323e-07,
"loss": 0.5787,
"step": 7600
},
{
"epoch": 0.42274618585298196,
"grad_norm": 1.2903082370758057,
"learning_rate": 4.10302509195305e-07,
"loss": 0.4826,
"step": 7620
},
{
"epoch": 0.42385575589459085,
"grad_norm": 0.30528688430786133,
"learning_rate": 4.0946508526271107e-07,
"loss": 0.5653,
"step": 7640
},
{
"epoch": 0.42496532593619973,
"grad_norm": 0.35863542556762695,
"learning_rate": 4.086246337605002e-07,
"loss": 0.4821,
"step": 7660
},
{
"epoch": 0.4260748959778086,
"grad_norm": 0.28547871112823486,
"learning_rate": 4.077811706453028e-07,
"loss": 0.4127,
"step": 7680
},
{
"epoch": 0.42718446601941745,
"grad_norm": 0.489197313785553,
"learning_rate": 4.069347119309271e-07,
"loss": 0.5363,
"step": 7700
},
{
"epoch": 0.42829403606102634,
"grad_norm": 0.2887154221534729,
"learning_rate": 4.060852736880553e-07,
"loss": 0.5618,
"step": 7720
},
{
"epoch": 0.4294036061026352,
"grad_norm": 0.2965123951435089,
"learning_rate": 4.0523287204393795e-07,
"loss": 0.4854,
"step": 7740
},
{
"epoch": 0.4305131761442441,
"grad_norm": 0.326860636472702,
"learning_rate": 4.0437752318208846e-07,
"loss": 0.4852,
"step": 7760
},
{
"epoch": 0.431622746185853,
"grad_norm": 0.33304309844970703,
"learning_rate": 4.0351924334197516e-07,
"loss": 0.4727,
"step": 7780
},
{
"epoch": 0.43273231622746183,
"grad_norm": 0.26137974858283997,
"learning_rate": 4.0265804881871366e-07,
"loss": 0.536,
"step": 7800
},
{
"epoch": 0.4338418862690707,
"grad_norm": 0.543980062007904,
"learning_rate": 4.0179395596275665e-07,
"loss": 0.5694,
"step": 7820
},
{
"epoch": 0.4349514563106796,
"grad_norm": 0.3294641971588135,
"learning_rate": 4.0092698117958447e-07,
"loss": 0.5928,
"step": 7840
},
{
"epoch": 0.4360610263522885,
"grad_norm": 0.3174699544906616,
"learning_rate": 4.0005714092939255e-07,
"loss": 0.5022,
"step": 7860
},
{
"epoch": 0.4371705963938974,
"grad_norm": 0.3462190628051758,
"learning_rate": 3.9918445172677995e-07,
"loss": 0.4936,
"step": 7880
},
{
"epoch": 0.43828016643550627,
"grad_norm": 0.2537282407283783,
"learning_rate": 3.983089301404351e-07,
"loss": 0.6196,
"step": 7900
},
{
"epoch": 0.4393897364771151,
"grad_norm": 0.4872954189777374,
"learning_rate": 3.9743059279282126e-07,
"loss": 0.46,
"step": 7920
},
{
"epoch": 0.440499306518724,
"grad_norm": 0.3006153702735901,
"learning_rate": 3.9654945635986155e-07,
"loss": 0.5201,
"step": 7940
},
{
"epoch": 0.4416088765603329,
"grad_norm": 0.28118792176246643,
"learning_rate": 3.9566553757062154e-07,
"loss": 0.548,
"step": 7960
},
{
"epoch": 0.44271844660194176,
"grad_norm": 0.4475691616535187,
"learning_rate": 3.947788532069923e-07,
"loss": 0.498,
"step": 7980
},
{
"epoch": 0.44382801664355065,
"grad_norm": 0.39291325211524963,
"learning_rate": 3.938894201033713e-07,
"loss": 0.4702,
"step": 8000
},
{
"epoch": 0.4449375866851595,
"grad_norm": 0.3244176506996155,
"learning_rate": 3.929972551463431e-07,
"loss": 0.4421,
"step": 8020
},
{
"epoch": 0.44604715672676837,
"grad_norm": 0.3121548295021057,
"learning_rate": 3.9210237527435864e-07,
"loss": 0.5523,
"step": 8040
},
{
"epoch": 0.44715672676837726,
"grad_norm": 0.33265426754951477,
"learning_rate": 3.9120479747741344e-07,
"loss": 0.5159,
"step": 8060
},
{
"epoch": 0.44826629680998614,
"grad_norm": 0.298480361700058,
"learning_rate": 3.903045387967256e-07,
"loss": 0.4688,
"step": 8080
},
{
"epoch": 0.44937586685159503,
"grad_norm": 0.2703563868999481,
"learning_rate": 3.8940161632441157e-07,
"loss": 0.4673,
"step": 8100
},
{
"epoch": 0.45048543689320386,
"grad_norm": 0.4872739911079407,
"learning_rate": 3.884960472031622e-07,
"loss": 0.5151,
"step": 8120
},
{
"epoch": 0.45159500693481275,
"grad_norm": 0.4316668212413788,
"learning_rate": 3.87587848625917e-07,
"loss": 0.5357,
"step": 8140
},
{
"epoch": 0.45270457697642164,
"grad_norm": 0.2701937258243561,
"learning_rate": 3.866770378355375e-07,
"loss": 0.4859,
"step": 8160
},
{
"epoch": 0.4538141470180305,
"grad_norm": 0.2920953035354614,
"learning_rate": 3.8576363212448057e-07,
"loss": 0.5778,
"step": 8180
},
{
"epoch": 0.4549237170596394,
"grad_norm": 0.4294867217540741,
"learning_rate": 3.8484764883446944e-07,
"loss": 0.5387,
"step": 8200
},
{
"epoch": 0.45603328710124824,
"grad_norm": 0.42127662897109985,
"learning_rate": 3.8392910535616476e-07,
"loss": 0.5998,
"step": 8220
},
{
"epoch": 0.45714285714285713,
"grad_norm": 0.3392697870731354,
"learning_rate": 3.8300801912883414e-07,
"loss": 0.4547,
"step": 8240
},
{
"epoch": 0.458252427184466,
"grad_norm": 0.2972329258918762,
"learning_rate": 3.820844076400216e-07,
"loss": 0.5605,
"step": 8260
},
{
"epoch": 0.4593619972260749,
"grad_norm": 0.3029995560646057,
"learning_rate": 3.8115828842521514e-07,
"loss": 0.5124,
"step": 8280
},
{
"epoch": 0.4604715672676838,
"grad_norm": 0.3053463101387024,
"learning_rate": 3.802296790675137e-07,
"loss": 0.5181,
"step": 8300
},
{
"epoch": 0.4615811373092926,
"grad_norm": 0.4185832738876343,
"learning_rate": 3.7929859719729394e-07,
"loss": 0.5177,
"step": 8320
},
{
"epoch": 0.4626907073509015,
"grad_norm": 0.3226168155670166,
"learning_rate": 3.783650604918746e-07,
"loss": 0.5008,
"step": 8340
},
{
"epoch": 0.4638002773925104,
"grad_norm": 0.3534069359302521,
"learning_rate": 3.7742908667518175e-07,
"loss": 0.529,
"step": 8360
},
{
"epoch": 0.4649098474341193,
"grad_norm": 0.37676820158958435,
"learning_rate": 3.7649069351741185e-07,
"loss": 0.5128,
"step": 8380
},
{
"epoch": 0.46601941747572817,
"grad_norm": 0.30804529786109924,
"learning_rate": 3.755498988346945e-07,
"loss": 0.5262,
"step": 8400
},
{
"epoch": 0.46712898751733706,
"grad_norm": 0.3896881639957428,
"learning_rate": 3.746067204887538e-07,
"loss": 0.4655,
"step": 8420
},
{
"epoch": 0.4682385575589459,
"grad_norm": 0.3813321888446808,
"learning_rate": 3.7366117638657e-07,
"loss": 0.4867,
"step": 8440
},
{
"epoch": 0.4693481276005548,
"grad_norm": 0.3187031149864197,
"learning_rate": 3.72713284480039e-07,
"loss": 0.4468,
"step": 8460
},
{
"epoch": 0.47045769764216366,
"grad_norm": 0.3183457851409912,
"learning_rate": 3.7176306276563126e-07,
"loss": 0.491,
"step": 8480
},
{
"epoch": 0.47156726768377255,
"grad_norm": 0.58977210521698,
"learning_rate": 3.708105292840509e-07,
"loss": 0.5106,
"step": 8500
},
{
"epoch": 0.47267683772538144,
"grad_norm": 0.3110608756542206,
"learning_rate": 3.698557021198925e-07,
"loss": 0.4688,
"step": 8520
},
{
"epoch": 0.47378640776699027,
"grad_norm": 0.23853054642677307,
"learning_rate": 3.6889859940129814e-07,
"loss": 0.5505,
"step": 8540
},
{
"epoch": 0.47489597780859916,
"grad_norm": 0.24323715269565582,
"learning_rate": 3.6793923929961296e-07,
"loss": 0.4357,
"step": 8560
},
{
"epoch": 0.47600554785020804,
"grad_norm": 0.39231613278388977,
"learning_rate": 3.669776400290403e-07,
"loss": 0.5529,
"step": 8580
},
{
"epoch": 0.47711511789181693,
"grad_norm": 0.3708134889602661,
"learning_rate": 3.66013819846296e-07,
"loss": 0.6198,
"step": 8600
},
{
"epoch": 0.4782246879334258,
"grad_norm": 0.2724001109600067,
"learning_rate": 3.6504779705026156e-07,
"loss": 0.4517,
"step": 8620
},
{
"epoch": 0.47933425797503465,
"grad_norm": 0.3848133683204651,
"learning_rate": 3.6407958998163687e-07,
"loss": 0.5878,
"step": 8640
},
{
"epoch": 0.48044382801664354,
"grad_norm": 0.3788512945175171,
"learning_rate": 3.6310921702259184e-07,
"loss": 0.4963,
"step": 8660
},
{
"epoch": 0.4815533980582524,
"grad_norm": 0.45098355412483215,
"learning_rate": 3.6213669659641757e-07,
"loss": 0.4629,
"step": 8680
},
{
"epoch": 0.4826629680998613,
"grad_norm": 0.5497767329216003,
"learning_rate": 3.611620471671766e-07,
"loss": 0.5813,
"step": 8700
},
{
"epoch": 0.4837725381414702,
"grad_norm": 0.3716546297073364,
"learning_rate": 3.6018528723935214e-07,
"loss": 0.498,
"step": 8720
},
{
"epoch": 0.48488210818307903,
"grad_norm": 0.323794960975647,
"learning_rate": 3.5920643535749696e-07,
"loss": 0.4899,
"step": 8740
},
{
"epoch": 0.4859916782246879,
"grad_norm": 0.2931385338306427,
"learning_rate": 3.582255101058811e-07,
"loss": 0.56,
"step": 8760
},
{
"epoch": 0.4871012482662968,
"grad_norm": 0.4833517372608185,
"learning_rate": 3.572425301081392e-07,
"loss": 0.5229,
"step": 8780
},
{
"epoch": 0.4882108183079057,
"grad_norm": 0.3123915195465088,
"learning_rate": 3.5625751402691693e-07,
"loss": 0.5081,
"step": 8800
},
{
"epoch": 0.4893203883495146,
"grad_norm": 0.24910438060760498,
"learning_rate": 3.5527048056351654e-07,
"loss": 0.5406,
"step": 8820
},
{
"epoch": 0.49042995839112347,
"grad_norm": 0.4142923355102539,
"learning_rate": 3.542814484575419e-07,
"loss": 0.4703,
"step": 8840
},
{
"epoch": 0.4915395284327323,
"grad_norm": 0.34447526931762695,
"learning_rate": 3.532904364865426e-07,
"loss": 0.4803,
"step": 8860
},
{
"epoch": 0.4926490984743412,
"grad_norm": 0.8663418292999268,
"learning_rate": 3.522974634656576e-07,
"loss": 0.4718,
"step": 8880
},
{
"epoch": 0.49375866851595007,
"grad_norm": 0.5423429012298584,
"learning_rate": 3.5130254824725787e-07,
"loss": 0.4895,
"step": 8900
},
{
"epoch": 0.49486823855755896,
"grad_norm": 0.4453887641429901,
"learning_rate": 3.503057097205885e-07,
"loss": 0.5677,
"step": 8920
},
{
"epoch": 0.49597780859916785,
"grad_norm": 0.41120442748069763,
"learning_rate": 3.4930696681141034e-07,
"loss": 0.5781,
"step": 8940
},
{
"epoch": 0.4970873786407767,
"grad_norm": 0.38804373145103455,
"learning_rate": 3.4830633848164006e-07,
"loss": 0.5453,
"step": 8960
},
{
"epoch": 0.49819694868238557,
"grad_norm": 0.4749200940132141,
"learning_rate": 3.473038437289907e-07,
"loss": 0.516,
"step": 8980
},
{
"epoch": 0.49930651872399445,
"grad_norm": 0.34202054142951965,
"learning_rate": 3.462995015866109e-07,
"loss": 0.6462,
"step": 9000
},
{
"epoch": 0.5004160887656033,
"grad_norm": 0.29288092255592346,
"learning_rate": 3.452933311227232e-07,
"loss": 0.4496,
"step": 9020
},
{
"epoch": 0.5015256588072122,
"grad_norm": 0.4564000070095062,
"learning_rate": 3.442853514402626e-07,
"loss": 0.5254,
"step": 9040
},
{
"epoch": 0.5026352288488211,
"grad_norm": 0.4368923604488373,
"learning_rate": 3.432755816765131e-07,
"loss": 0.3723,
"step": 9060
},
{
"epoch": 0.50374479889043,
"grad_norm": 0.42233774065971375,
"learning_rate": 3.422640410027451e-07,
"loss": 0.4816,
"step": 9080
},
{
"epoch": 0.5048543689320388,
"grad_norm": 0.5136800408363342,
"learning_rate": 3.412507486238512e-07,
"loss": 0.48,
"step": 9100
},
{
"epoch": 0.5059639389736477,
"grad_norm": 0.6266002655029297,
"learning_rate": 3.4023572377798116e-07,
"loss": 0.4605,
"step": 9120
},
{
"epoch": 0.5070735090152566,
"grad_norm": 0.38814643025398254,
"learning_rate": 3.3921898573617715e-07,
"loss": 0.4482,
"step": 9140
},
{
"epoch": 0.5081830790568654,
"grad_norm": 0.3517705202102661,
"learning_rate": 3.382005538020078e-07,
"loss": 0.4514,
"step": 9160
},
{
"epoch": 0.5092926490984744,
"grad_norm": 0.25241366028785706,
"learning_rate": 3.371804473112014e-07,
"loss": 0.5004,
"step": 9180
},
{
"epoch": 0.5104022191400832,
"grad_norm": 0.5189043283462524,
"learning_rate": 3.3615868563127937e-07,
"loss": 0.5339,
"step": 9200
},
{
"epoch": 0.511511789181692,
"grad_norm": 0.42029428482055664,
"learning_rate": 3.3513528816118775e-07,
"loss": 0.551,
"step": 9220
},
{
"epoch": 0.512621359223301,
"grad_norm": 0.5070712566375732,
"learning_rate": 3.341102743309296e-07,
"loss": 0.5013,
"step": 9240
},
{
"epoch": 0.5137309292649098,
"grad_norm": 0.2703372836112976,
"learning_rate": 3.3308366360119584e-07,
"loss": 0.5446,
"step": 9260
},
{
"epoch": 0.5148404993065188,
"grad_norm": 0.39520278573036194,
"learning_rate": 3.3205547546299575e-07,
"loss": 0.5542,
"step": 9280
},
{
"epoch": 0.5159500693481276,
"grad_norm": 0.4130886197090149,
"learning_rate": 3.3102572943728673e-07,
"loss": 0.4348,
"step": 9300
},
{
"epoch": 0.5170596393897364,
"grad_norm": 0.30212274193763733,
"learning_rate": 3.2999444507460437e-07,
"loss": 0.4626,
"step": 9320
},
{
"epoch": 0.5181692094313454,
"grad_norm": 0.24097274243831635,
"learning_rate": 3.2896164195469033e-07,
"loss": 0.5347,
"step": 9340
},
{
"epoch": 0.5192787794729542,
"grad_norm": 0.2773560583591461,
"learning_rate": 3.279273396861214e-07,
"loss": 0.6156,
"step": 9360
},
{
"epoch": 0.5203883495145631,
"grad_norm": 0.4777432382106781,
"learning_rate": 3.268915579059366e-07,
"loss": 0.4348,
"step": 9380
},
{
"epoch": 0.521497919556172,
"grad_norm": 0.30696791410446167,
"learning_rate": 3.2585431627926476e-07,
"loss": 0.6035,
"step": 9400
},
{
"epoch": 0.5226074895977808,
"grad_norm": 0.2942405343055725,
"learning_rate": 3.248156344989512e-07,
"loss": 0.4254,
"step": 9420
},
{
"epoch": 0.5237170596393897,
"grad_norm": 0.498976469039917,
"learning_rate": 3.237755322851834e-07,
"loss": 0.5464,
"step": 9440
},
{
"epoch": 0.5248266296809986,
"grad_norm": 0.5058267712593079,
"learning_rate": 3.2273402938511706e-07,
"loss": 0.5168,
"step": 9460
},
{
"epoch": 0.5259361997226075,
"grad_norm": 0.164671391248703,
"learning_rate": 3.2169114557250103e-07,
"loss": 0.5291,
"step": 9480
},
{
"epoch": 0.5270457697642164,
"grad_norm": 0.45560675859451294,
"learning_rate": 3.206469006473017e-07,
"loss": 0.5577,
"step": 9500
},
{
"epoch": 0.5281553398058253,
"grad_norm": 0.4424368441104889,
"learning_rate": 3.196013144353274e-07,
"loss": 0.4828,
"step": 9520
},
{
"epoch": 0.5292649098474341,
"grad_norm": 0.4512025713920593,
"learning_rate": 3.185544067878518e-07,
"loss": 0.5543,
"step": 9540
},
{
"epoch": 0.530374479889043,
"grad_norm": 0.30722030997276306,
"learning_rate": 3.175061975812371e-07,
"loss": 0.5418,
"step": 9560
},
{
"epoch": 0.5314840499306519,
"grad_norm": 0.38087835907936096,
"learning_rate": 3.1645670671655645e-07,
"loss": 0.4493,
"step": 9580
},
{
"epoch": 0.5325936199722607,
"grad_norm": 0.3499601483345032,
"learning_rate": 3.154059541192164e-07,
"loss": 0.5878,
"step": 9600
},
{
"epoch": 0.5337031900138697,
"grad_norm": 0.4532395601272583,
"learning_rate": 3.1435395973857876e-07,
"loss": 0.5829,
"step": 9620
},
{
"epoch": 0.5348127600554785,
"grad_norm": 0.30889692902565,
"learning_rate": 3.1330074354758094e-07,
"loss": 0.5797,
"step": 9640
},
{
"epoch": 0.5359223300970873,
"grad_norm": 0.4182281494140625,
"learning_rate": 3.12246325542358e-07,
"loss": 0.5118,
"step": 9660
},
{
"epoch": 0.5370319001386963,
"grad_norm": 0.35436221957206726,
"learning_rate": 3.11190725741862e-07,
"loss": 0.6517,
"step": 9680
},
{
"epoch": 0.5381414701803051,
"grad_norm": 0.33836600184440613,
"learning_rate": 3.1013396418748234e-07,
"loss": 0.4423,
"step": 9700
},
{
"epoch": 0.5392510402219141,
"grad_norm": 0.34394633769989014,
"learning_rate": 3.090760609426655e-07,
"loss": 0.4322,
"step": 9720
},
{
"epoch": 0.5403606102635229,
"grad_norm": 0.3953171670436859,
"learning_rate": 3.080170360925336e-07,
"loss": 0.443,
"step": 9740
},
{
"epoch": 0.5414701803051317,
"grad_norm": 0.27416878938674927,
"learning_rate": 3.069569097435033e-07,
"loss": 0.4737,
"step": 9760
},
{
"epoch": 0.5425797503467407,
"grad_norm": 0.325531005859375,
"learning_rate": 3.0589570202290433e-07,
"loss": 0.4698,
"step": 9780
},
{
"epoch": 0.5436893203883495,
"grad_norm": 0.44285646080970764,
"learning_rate": 3.0483343307859663e-07,
"loss": 0.4886,
"step": 9800
},
{
"epoch": 0.5447988904299584,
"grad_norm": 0.2852041721343994,
"learning_rate": 3.0377012307858904e-07,
"loss": 0.5289,
"step": 9820
},
{
"epoch": 0.5459084604715673,
"grad_norm": 0.3136042356491089,
"learning_rate": 3.027057922106549e-07,
"loss": 0.494,
"step": 9840
},
{
"epoch": 0.5470180305131761,
"grad_norm": 0.3801943361759186,
"learning_rate": 3.0164046068195e-07,
"loss": 0.4818,
"step": 9860
},
{
"epoch": 0.548127600554785,
"grad_norm": 0.3817514479160309,
"learning_rate": 3.0057414871862816e-07,
"loss": 0.5448,
"step": 9880
},
{
"epoch": 0.5492371705963939,
"grad_norm": 0.3009171485900879,
"learning_rate": 2.9950687656545787e-07,
"loss": 0.4765,
"step": 9900
},
{
"epoch": 0.5503467406380028,
"grad_norm": 0.4833567440509796,
"learning_rate": 2.9843866448543727e-07,
"loss": 0.5342,
"step": 9920
},
{
"epoch": 0.5514563106796116,
"grad_norm": 0.4005512297153473,
"learning_rate": 2.973695327594099e-07,
"loss": 0.4766,
"step": 9940
},
{
"epoch": 0.5525658807212205,
"grad_norm": 0.33436936140060425,
"learning_rate": 2.9629950168567954e-07,
"loss": 0.4826,
"step": 9960
},
{
"epoch": 0.5536754507628294,
"grad_norm": 0.33340537548065186,
"learning_rate": 2.9522859157962454e-07,
"loss": 0.5473,
"step": 9980
},
{
"epoch": 0.5547850208044383,
"grad_norm": 0.33686235547065735,
"learning_rate": 2.9415682277331265e-07,
"loss": 0.5534,
"step": 10000
},
{
"epoch": 0.5558945908460472,
"grad_norm": 0.3106544315814972,
"learning_rate": 2.930842156151146e-07,
"loss": 0.497,
"step": 10020
},
{
"epoch": 0.557004160887656,
"grad_norm": 0.428392231464386,
"learning_rate": 2.920107904693178e-07,
"loss": 0.483,
"step": 10040
},
{
"epoch": 0.5581137309292649,
"grad_norm": 0.3546343743801117,
"learning_rate": 2.9093656771574006e-07,
"loss": 0.4438,
"step": 10060
},
{
"epoch": 0.5592233009708738,
"grad_norm": 0.5527733564376831,
"learning_rate": 2.8986156774934204e-07,
"loss": 0.5118,
"step": 10080
},
{
"epoch": 0.5603328710124826,
"grad_norm": 0.33526676893234253,
"learning_rate": 2.8878581097984075e-07,
"loss": 0.565,
"step": 10100
},
{
"epoch": 0.5614424410540916,
"grad_norm": 0.4417477548122406,
"learning_rate": 2.877093178313214e-07,
"loss": 0.4793,
"step": 10120
},
{
"epoch": 0.5625520110957004,
"grad_norm": 0.2999446392059326,
"learning_rate": 2.8663210874185013e-07,
"loss": 0.5449,
"step": 10140
},
{
"epoch": 0.5636615811373092,
"grad_norm": 0.38563647866249084,
"learning_rate": 2.8555420416308573e-07,
"loss": 0.5037,
"step": 10160
},
{
"epoch": 0.5647711511789182,
"grad_norm": 0.8459754586219788,
"learning_rate": 2.8447562455989134e-07,
"loss": 0.5474,
"step": 10180
},
{
"epoch": 0.565880721220527,
"grad_norm": 0.3799205720424652,
"learning_rate": 2.8339639040994604e-07,
"loss": 0.609,
"step": 10200
},
{
"epoch": 0.566990291262136,
"grad_norm": 0.51907879114151,
"learning_rate": 2.8231652220335603e-07,
"loss": 0.4621,
"step": 10220
},
{
"epoch": 0.5680998613037448,
"grad_norm": 0.3080946207046509,
"learning_rate": 2.812360404422653e-07,
"loss": 0.5304,
"step": 10240
},
{
"epoch": 0.5692094313453536,
"grad_norm": 0.35099002718925476,
"learning_rate": 2.80154965640467e-07,
"loss": 0.5226,
"step": 10260
},
{
"epoch": 0.5703190013869626,
"grad_norm": 0.3252660036087036,
"learning_rate": 2.790733183230136e-07,
"loss": 0.4481,
"step": 10280
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.3965422511100769,
"learning_rate": 2.7799111902582693e-07,
"loss": 0.5616,
"step": 10300
},
{
"epoch": 0.5725381414701803,
"grad_norm": 0.4213715195655823,
"learning_rate": 2.7690838829530886e-07,
"loss": 0.4895,
"step": 10320
},
{
"epoch": 0.5736477115117892,
"grad_norm": 0.27551645040512085,
"learning_rate": 2.758251466879508e-07,
"loss": 0.532,
"step": 10340
},
{
"epoch": 0.574757281553398,
"grad_norm": 0.6193405389785767,
"learning_rate": 2.7474141476994366e-07,
"loss": 0.5517,
"step": 10360
},
{
"epoch": 0.575866851595007,
"grad_norm": 0.4643997251987457,
"learning_rate": 2.736572131167872e-07,
"loss": 0.6117,
"step": 10380
},
{
"epoch": 0.5769764216366158,
"grad_norm": 0.3855043351650238,
"learning_rate": 2.725725623128994e-07,
"loss": 0.4358,
"step": 10400
},
{
"epoch": 0.5780859916782247,
"grad_norm": 0.2621045410633087,
"learning_rate": 2.71487482951226e-07,
"loss": 0.5391,
"step": 10420
},
{
"epoch": 0.5791955617198336,
"grad_norm": 0.4209457337856293,
"learning_rate": 2.7040199563284894e-07,
"loss": 0.5641,
"step": 10440
},
{
"epoch": 0.5803051317614425,
"grad_norm": 0.3612224757671356,
"learning_rate": 2.6931612096659566e-07,
"loss": 0.5176,
"step": 10460
},
{
"epoch": 0.5814147018030513,
"grad_norm": 0.30091843008995056,
"learning_rate": 2.682298795686478e-07,
"loss": 0.4728,
"step": 10480
},
{
"epoch": 0.5825242718446602,
"grad_norm": 0.33328065276145935,
"learning_rate": 2.671432920621495e-07,
"loss": 0.5307,
"step": 10500
},
{
"epoch": 0.5836338418862691,
"grad_norm": 0.2998218536376953,
"learning_rate": 2.6605637907681613e-07,
"loss": 0.5042,
"step": 10520
},
{
"epoch": 0.5847434119278779,
"grad_norm": 0.576810896396637,
"learning_rate": 2.6496916124854244e-07,
"loss": 0.5064,
"step": 10540
},
{
"epoch": 0.5858529819694869,
"grad_norm": 0.40196332335472107,
"learning_rate": 2.638816592190112e-07,
"loss": 0.5932,
"step": 10560
},
{
"epoch": 0.5869625520110957,
"grad_norm": 0.524108350276947,
"learning_rate": 2.627938936353006e-07,
"loss": 0.5463,
"step": 10580
},
{
"epoch": 0.5880721220527045,
"grad_norm": 0.45371681451797485,
"learning_rate": 2.617058851494927e-07,
"loss": 0.5356,
"step": 10600
},
{
"epoch": 0.5891816920943135,
"grad_norm": 0.3213278651237488,
"learning_rate": 2.606176544182813e-07,
"loss": 0.5075,
"step": 10620
},
{
"epoch": 0.5902912621359223,
"grad_norm": 0.34761932492256165,
"learning_rate": 2.5952922210257964e-07,
"loss": 0.5104,
"step": 10640
},
{
"epoch": 0.5914008321775313,
"grad_norm": 0.32400938868522644,
"learning_rate": 2.584406088671284e-07,
"loss": 0.4889,
"step": 10660
},
{
"epoch": 0.5925104022191401,
"grad_norm": 0.30970096588134766,
"learning_rate": 2.573518353801028e-07,
"loss": 0.6171,
"step": 10680
},
{
"epoch": 0.5936199722607489,
"grad_norm": 0.3451375365257263,
"learning_rate": 2.5626292231272086e-07,
"loss": 0.4881,
"step": 10700
},
{
"epoch": 0.5947295423023579,
"grad_norm": 0.41977164149284363,
"learning_rate": 2.5517389033885056e-07,
"loss": 0.5399,
"step": 10720
},
{
"epoch": 0.5958391123439667,
"grad_norm": 0.3118828535079956,
"learning_rate": 2.540847601346173e-07,
"loss": 0.4543,
"step": 10740
},
{
"epoch": 0.5969486823855756,
"grad_norm": 0.36801040172576904,
"learning_rate": 2.5299555237801176e-07,
"loss": 0.4706,
"step": 10760
},
{
"epoch": 0.5980582524271845,
"grad_norm": 0.3566454350948334,
"learning_rate": 2.5190628774849667e-07,
"loss": 0.5271,
"step": 10780
},
{
"epoch": 0.5991678224687933,
"grad_norm": 0.436729371547699,
"learning_rate": 2.5081698692661475e-07,
"loss": 0.5308,
"step": 10800
},
{
"epoch": 0.6002773925104022,
"grad_norm": 0.39320242404937744,
"learning_rate": 2.497276705935957e-07,
"loss": 0.5804,
"step": 10820
},
{
"epoch": 0.6013869625520111,
"grad_norm": 0.2793132960796356,
"learning_rate": 2.4863835943096386e-07,
"loss": 0.4361,
"step": 10840
},
{
"epoch": 0.60249653259362,
"grad_norm": 0.31615859270095825,
"learning_rate": 2.4754907412014526e-07,
"loss": 0.468,
"step": 10860
},
{
"epoch": 0.6036061026352288,
"grad_norm": 0.27115607261657715,
"learning_rate": 2.464598353420754e-07,
"loss": 0.4934,
"step": 10880
},
{
"epoch": 0.6047156726768377,
"grad_norm": 0.41032761335372925,
"learning_rate": 2.45370663776806e-07,
"loss": 0.6101,
"step": 10900
},
{
"epoch": 0.6058252427184466,
"grad_norm": 0.3607085943222046,
"learning_rate": 2.442815801031128e-07,
"loss": 0.4863,
"step": 10920
},
{
"epoch": 0.6069348127600555,
"grad_norm": 0.3773590624332428,
"learning_rate": 2.431926049981029e-07,
"loss": 0.4498,
"step": 10940
},
{
"epoch": 0.6080443828016644,
"grad_norm": 0.3329433500766754,
"learning_rate": 2.4210375913682203e-07,
"loss": 0.5377,
"step": 10960
},
{
"epoch": 0.6091539528432732,
"grad_norm": 0.3469200134277344,
"learning_rate": 2.4101506319186234e-07,
"loss": 0.5213,
"step": 10980
},
{
"epoch": 0.6102635228848821,
"grad_norm": 0.5309900045394897,
"learning_rate": 2.399265378329694e-07,
"loss": 0.5984,
"step": 11000
},
{
"epoch": 0.611373092926491,
"grad_norm": 0.28028154373168945,
"learning_rate": 2.388382037266504e-07,
"loss": 0.5899,
"step": 11020
},
{
"epoch": 0.6124826629680998,
"grad_norm": 0.3442562520503998,
"learning_rate": 2.3775008153578108e-07,
"loss": 0.4739,
"step": 11040
},
{
"epoch": 0.6135922330097088,
"grad_norm": 0.38204333186149597,
"learning_rate": 2.366621919192141e-07,
"loss": 0.4846,
"step": 11060
},
{
"epoch": 0.6147018030513176,
"grad_norm": 0.4290561378002167,
"learning_rate": 2.3557455553138645e-07,
"loss": 0.5242,
"step": 11080
},
{
"epoch": 0.6158113730929264,
"grad_norm": 0.33580636978149414,
"learning_rate": 2.3448719302192729e-07,
"loss": 0.4827,
"step": 11100
},
{
"epoch": 0.6169209431345354,
"grad_norm": 0.3730616569519043,
"learning_rate": 2.3340012503526607e-07,
"loss": 0.55,
"step": 11120
},
{
"epoch": 0.6180305131761442,
"grad_norm": 0.5171761512756348,
"learning_rate": 2.323133722102404e-07,
"loss": 0.514,
"step": 11140
},
{
"epoch": 0.6191400832177532,
"grad_norm": 0.328640878200531,
"learning_rate": 2.3122695517970434e-07,
"loss": 0.5019,
"step": 11160
},
{
"epoch": 0.620249653259362,
"grad_norm": 0.34641680121421814,
"learning_rate": 2.3014089457013675e-07,
"loss": 0.5429,
"step": 11180
},
{
"epoch": 0.6213592233009708,
"grad_norm": 0.3420324921607971,
"learning_rate": 2.2905521100124935e-07,
"loss": 0.4482,
"step": 11200
},
{
"epoch": 0.6224687933425798,
"grad_norm": 0.44696226716041565,
"learning_rate": 2.2796992508559563e-07,
"loss": 0.5247,
"step": 11220
},
{
"epoch": 0.6235783633841886,
"grad_norm": 0.3765786290168762,
"learning_rate": 2.2688505742817916e-07,
"loss": 0.5924,
"step": 11240
},
{
"epoch": 0.6246879334257975,
"grad_norm": 0.32200539112091064,
"learning_rate": 2.258006286260626e-07,
"loss": 0.5605,
"step": 11260
},
{
"epoch": 0.6257975034674064,
"grad_norm": 0.455307275056839,
"learning_rate": 2.2471665926797676e-07,
"loss": 0.5314,
"step": 11280
},
{
"epoch": 0.6269070735090153,
"grad_norm": 0.25619956851005554,
"learning_rate": 2.2363316993392932e-07,
"loss": 0.551,
"step": 11300
},
{
"epoch": 0.6280166435506241,
"grad_norm": 0.3347276747226715,
"learning_rate": 2.225501811948145e-07,
"loss": 0.4912,
"step": 11320
},
{
"epoch": 0.629126213592233,
"grad_norm": 0.6607474088668823,
"learning_rate": 2.2146771361202215e-07,
"loss": 0.5319,
"step": 11340
},
{
"epoch": 0.6302357836338419,
"grad_norm": 0.39210301637649536,
"learning_rate": 2.203857877370477e-07,
"loss": 0.4812,
"step": 11360
},
{
"epoch": 0.6313453536754507,
"grad_norm": 0.36486566066741943,
"learning_rate": 2.193044241111018e-07,
"loss": 0.5066,
"step": 11380
},
{
"epoch": 0.6324549237170597,
"grad_norm": 0.25275692343711853,
"learning_rate": 2.182236432647204e-07,
"loss": 0.4351,
"step": 11400
},
{
"epoch": 0.6335644937586685,
"grad_norm": 0.32072117924690247,
"learning_rate": 2.1714346571737485e-07,
"loss": 0.4902,
"step": 11420
},
{
"epoch": 0.6346740638002774,
"grad_norm": 0.3688344657421112,
"learning_rate": 2.160639119770824e-07,
"loss": 0.4802,
"step": 11440
},
{
"epoch": 0.6357836338418863,
"grad_norm": 0.3499152958393097,
"learning_rate": 2.1498500254001683e-07,
"loss": 0.5426,
"step": 11460
},
{
"epoch": 0.6368932038834951,
"grad_norm": 0.5433036088943481,
"learning_rate": 2.1390675789011945e-07,
"loss": 0.5413,
"step": 11480
},
{
"epoch": 0.6380027739251041,
"grad_norm": 0.32321110367774963,
"learning_rate": 2.128291984987099e-07,
"loss": 0.5557,
"step": 11500
},
{
"epoch": 0.6391123439667129,
"grad_norm": 0.3584541380405426,
"learning_rate": 2.117523448240977e-07,
"loss": 0.6089,
"step": 11520
},
{
"epoch": 0.6402219140083217,
"grad_norm": 0.35222071409225464,
"learning_rate": 2.1067621731119384e-07,
"loss": 0.4796,
"step": 11540
},
{
"epoch": 0.6413314840499307,
"grad_norm": 0.3017160892486572,
"learning_rate": 2.0960083639112243e-07,
"loss": 0.4427,
"step": 11560
},
{
"epoch": 0.6424410540915395,
"grad_norm": 0.3295774459838867,
"learning_rate": 2.0852622248083308e-07,
"loss": 0.4628,
"step": 11580
},
{
"epoch": 0.6435506241331485,
"grad_norm": 0.5670339465141296,
"learning_rate": 2.0745239598271312e-07,
"loss": 0.5061,
"step": 11600
},
{
"epoch": 0.6446601941747573,
"grad_norm": 0.3149929344654083,
"learning_rate": 2.0637937728420008e-07,
"loss": 0.4442,
"step": 11620
},
{
"epoch": 0.6457697642163661,
"grad_norm": 0.5195226669311523,
"learning_rate": 2.0530718675739488e-07,
"loss": 0.5651,
"step": 11640
},
{
"epoch": 0.6468793342579751,
"grad_norm": 0.34720999002456665,
"learning_rate": 2.0423584475867504e-07,
"loss": 0.5341,
"step": 11660
},
{
"epoch": 0.6479889042995839,
"grad_norm": 0.41216403245925903,
"learning_rate": 2.0316537162830784e-07,
"loss": 0.4756,
"step": 11680
},
{
"epoch": 0.6490984743411928,
"grad_norm": 0.4366797208786011,
"learning_rate": 2.020957876900648e-07,
"loss": 0.5297,
"step": 11700
},
{
"epoch": 0.6502080443828017,
"grad_norm": 0.3360641598701477,
"learning_rate": 2.0102711325083513e-07,
"loss": 0.6328,
"step": 11720
},
{
"epoch": 0.6513176144244105,
"grad_norm": 0.37930089235305786,
"learning_rate": 1.999593686002406e-07,
"loss": 0.5153,
"step": 11740
},
{
"epoch": 0.6524271844660194,
"grad_norm": 0.44946834444999695,
"learning_rate": 1.9889257401025015e-07,
"loss": 0.5232,
"step": 11760
},
{
"epoch": 0.6535367545076283,
"grad_norm": 0.43730628490448,
"learning_rate": 1.978267497347951e-07,
"loss": 0.4573,
"step": 11780
},
{
"epoch": 0.6546463245492372,
"grad_norm": 0.4298498332500458,
"learning_rate": 1.9676191600938474e-07,
"loss": 0.3999,
"step": 11800
},
{
"epoch": 0.655755894590846,
"grad_norm": 0.3248540759086609,
"learning_rate": 1.9569809305072177e-07,
"loss": 0.5563,
"step": 11820
},
{
"epoch": 0.6568654646324549,
"grad_norm": 0.3533152639865875,
"learning_rate": 1.9463530105631877e-07,
"loss": 0.5788,
"step": 11840
},
{
"epoch": 0.6579750346740638,
"grad_norm": 0.4187324047088623,
"learning_rate": 1.9357356020411475e-07,
"loss": 0.4424,
"step": 11860
},
{
"epoch": 0.6590846047156727,
"grad_norm": 0.39432525634765625,
"learning_rate": 1.925128906520917e-07,
"loss": 0.5991,
"step": 11880
},
{
"epoch": 0.6601941747572816,
"grad_norm": 0.5896446108818054,
"learning_rate": 1.9145331253789253e-07,
"loss": 0.5596,
"step": 11900
},
{
"epoch": 0.6613037447988904,
"grad_norm": 0.36868834495544434,
"learning_rate": 1.90394845978438e-07,
"loss": 0.5265,
"step": 11920
},
{
"epoch": 0.6624133148404993,
"grad_norm": 0.386819452047348,
"learning_rate": 1.8933751106954535e-07,
"loss": 0.5376,
"step": 11940
},
{
"epoch": 0.6635228848821082,
"grad_norm": 0.3883580267429352,
"learning_rate": 1.8828132788554638e-07,
"loss": 0.4808,
"step": 11960
},
{
"epoch": 0.664632454923717,
"grad_norm": 0.6747732162475586,
"learning_rate": 1.8722631647890657e-07,
"loss": 0.4478,
"step": 11980
},
{
"epoch": 0.665742024965326,
"grad_norm": 0.2752344012260437,
"learning_rate": 1.8617249687984434e-07,
"loss": 0.5043,
"step": 12000
},
{
"epoch": 0.6668515950069348,
"grad_norm": 0.39787495136260986,
"learning_rate": 1.8511988909595067e-07,
"loss": 0.4899,
"step": 12020
},
{
"epoch": 0.6679611650485436,
"grad_norm": 0.4791197180747986,
"learning_rate": 1.8406851311180926e-07,
"loss": 0.5389,
"step": 12040
},
{
"epoch": 0.6690707350901526,
"grad_norm": 0.3360481858253479,
"learning_rate": 1.8301838888861709e-07,
"loss": 0.4804,
"step": 12060
},
{
"epoch": 0.6701803051317614,
"grad_norm": 0.3347693681716919,
"learning_rate": 1.819695363638055e-07,
"loss": 0.558,
"step": 12080
},
{
"epoch": 0.6712898751733704,
"grad_norm": 0.30149486660957336,
"learning_rate": 1.809219754506618e-07,
"loss": 0.4088,
"step": 12100
},
{
"epoch": 0.6723994452149792,
"grad_norm": 0.30244728922843933,
"learning_rate": 1.7987572603795078e-07,
"loss": 0.5592,
"step": 12120
},
{
"epoch": 0.673509015256588,
"grad_norm": 0.3256188929080963,
"learning_rate": 1.7883080798953754e-07,
"loss": 0.6117,
"step": 12140
},
{
"epoch": 0.674618585298197,
"grad_norm": 0.36098477244377136,
"learning_rate": 1.777872411440101e-07,
"loss": 0.4261,
"step": 12160
},
{
"epoch": 0.6757281553398058,
"grad_norm": 0.33403322100639343,
"learning_rate": 1.767450453143029e-07,
"loss": 0.503,
"step": 12180
},
{
"epoch": 0.6768377253814147,
"grad_norm": 0.5139286518096924,
"learning_rate": 1.757042402873205e-07,
"loss": 0.44,
"step": 12200
},
{
"epoch": 0.6779472954230236,
"grad_norm": 0.368712842464447,
"learning_rate": 1.7466484582356212e-07,
"loss": 0.4188,
"step": 12220
},
{
"epoch": 0.6790568654646325,
"grad_norm": 0.3538356423377991,
"learning_rate": 1.736268816567461e-07,
"loss": 0.4575,
"step": 12240
},
{
"epoch": 0.6801664355062413,
"grad_norm": 0.30539965629577637,
"learning_rate": 1.725903674934357e-07,
"loss": 0.4657,
"step": 12260
},
{
"epoch": 0.6812760055478502,
"grad_norm": 0.2921498417854309,
"learning_rate": 1.715553230126645e-07,
"loss": 0.5406,
"step": 12280
},
{
"epoch": 0.6823855755894591,
"grad_norm": 1.0049635171890259,
"learning_rate": 1.705217678655633e-07,
"loss": 0.5792,
"step": 12300
},
{
"epoch": 0.683495145631068,
"grad_norm": 0.3126891851425171,
"learning_rate": 1.6948972167498649e-07,
"loss": 0.4519,
"step": 12320
},
{
"epoch": 0.6846047156726769,
"grad_norm": 0.36534181237220764,
"learning_rate": 1.684592040351398e-07,
"loss": 0.4744,
"step": 12340
},
{
"epoch": 0.6857142857142857,
"grad_norm": 0.32556411623954773,
"learning_rate": 1.674302345112083e-07,
"loss": 0.5786,
"step": 12360
},
{
"epoch": 0.6868238557558946,
"grad_norm": 0.42282480001449585,
"learning_rate": 1.664028326389847e-07,
"loss": 0.6001,
"step": 12380
},
{
"epoch": 0.6879334257975035,
"grad_norm": 0.2856026887893677,
"learning_rate": 1.6537701792449882e-07,
"loss": 0.4948,
"step": 12400
},
{
"epoch": 0.6890429958391123,
"grad_norm": 0.30700036883354187,
"learning_rate": 1.6435280984364692e-07,
"loss": 0.515,
"step": 12420
},
{
"epoch": 0.6901525658807213,
"grad_norm": 0.4423519968986511,
"learning_rate": 1.633302278418221e-07,
"loss": 0.4761,
"step": 12440
},
{
"epoch": 0.6912621359223301,
"grad_norm": 0.3314474821090698,
"learning_rate": 1.6230929133354506e-07,
"loss": 0.5463,
"step": 12460
},
{
"epoch": 0.6923717059639389,
"grad_norm": 0.5163138508796692,
"learning_rate": 1.6129001970209552e-07,
"loss": 0.4718,
"step": 12480
},
{
"epoch": 0.6934812760055479,
"grad_norm": 0.3462437093257904,
"learning_rate": 1.6027243229914414e-07,
"loss": 0.4545,
"step": 12500
},
{
"epoch": 0.6945908460471567,
"grad_norm": 0.32218673825263977,
"learning_rate": 1.5925654844438536e-07,
"loss": 0.5148,
"step": 12520
},
{
"epoch": 0.6957004160887656,
"grad_norm": 0.44593995809555054,
"learning_rate": 1.582423874251703e-07,
"loss": 0.4836,
"step": 12540
},
{
"epoch": 0.6968099861303745,
"grad_norm": 0.3792048990726471,
"learning_rate": 1.5722996849614066e-07,
"loss": 0.5882,
"step": 12560
},
{
"epoch": 0.6979195561719833,
"grad_norm": 0.33511149883270264,
"learning_rate": 1.5621931087886324e-07,
"loss": 0.5293,
"step": 12580
},
{
"epoch": 0.6990291262135923,
"grad_norm": 0.4749152660369873,
"learning_rate": 1.5521043376146494e-07,
"loss": 0.4484,
"step": 12600
},
{
"epoch": 0.7001386962552011,
"grad_norm": 0.36789247393608093,
"learning_rate": 1.5420335629826856e-07,
"loss": 0.5205,
"step": 12620
},
{
"epoch": 0.70124826629681,
"grad_norm": 0.3322623074054718,
"learning_rate": 1.5319809760942896e-07,
"loss": 0.4483,
"step": 12640
},
{
"epoch": 0.7023578363384189,
"grad_norm": 0.3089485466480255,
"learning_rate": 1.5219467678057017e-07,
"loss": 0.4467,
"step": 12660
},
{
"epoch": 0.7034674063800277,
"grad_norm": 0.3379858136177063,
"learning_rate": 1.511931128624231e-07,
"loss": 0.5278,
"step": 12680
},
{
"epoch": 0.7045769764216366,
"grad_norm": 0.3589012622833252,
"learning_rate": 1.5019342487046355e-07,
"loss": 0.4508,
"step": 12700
},
{
"epoch": 0.7056865464632455,
"grad_norm": 0.36919447779655457,
"learning_rate": 1.4919563178455153e-07,
"loss": 0.4822,
"step": 12720
},
{
"epoch": 0.7067961165048544,
"grad_norm": 0.3971253037452698,
"learning_rate": 1.4819975254857066e-07,
"loss": 0.5558,
"step": 12740
},
{
"epoch": 0.7079056865464632,
"grad_norm": 0.37145859003067017,
"learning_rate": 1.472058060700689e-07,
"loss": 0.5777,
"step": 12760
},
{
"epoch": 0.7090152565880721,
"grad_norm": 0.33810412883758545,
"learning_rate": 1.46213811219899e-07,
"loss": 0.4974,
"step": 12780
},
{
"epoch": 0.710124826629681,
"grad_norm": 0.28394588828086853,
"learning_rate": 1.452237868318606e-07,
"loss": 0.4865,
"step": 12800
},
{
"epoch": 0.7112343966712898,
"grad_norm": 0.6261323690414429,
"learning_rate": 1.4423575170234267e-07,
"loss": 0.5135,
"step": 12820
},
{
"epoch": 0.7123439667128988,
"grad_norm": 0.5439554452896118,
"learning_rate": 1.4324972458996638e-07,
"loss": 0.596,
"step": 12840
},
{
"epoch": 0.7134535367545076,
"grad_norm": 0.37857553362846375,
"learning_rate": 1.422657242152293e-07,
"loss": 0.6137,
"step": 12860
},
{
"epoch": 0.7145631067961165,
"grad_norm": 0.5687951445579529,
"learning_rate": 1.4128376926014957e-07,
"loss": 0.425,
"step": 12880
},
{
"epoch": 0.7156726768377254,
"grad_norm": 0.39318007230758667,
"learning_rate": 1.4030387836791164e-07,
"loss": 0.4716,
"step": 12900
},
{
"epoch": 0.7167822468793342,
"grad_norm": 0.4043221175670624,
"learning_rate": 1.3932607014251218e-07,
"loss": 0.5187,
"step": 12920
},
{
"epoch": 0.7178918169209432,
"grad_norm": 0.2890317440032959,
"learning_rate": 1.3835036314840643e-07,
"loss": 0.5747,
"step": 12940
},
{
"epoch": 0.719001386962552,
"grad_norm": 0.2909344434738159,
"learning_rate": 1.3737677591015657e-07,
"loss": 0.4737,
"step": 12960
},
{
"epoch": 0.7201109570041608,
"grad_norm": 0.373444139957428,
"learning_rate": 1.364053269120791e-07,
"loss": 0.5127,
"step": 12980
},
{
"epoch": 0.7212205270457698,
"grad_norm": 0.3974605202674866,
"learning_rate": 1.3543603459789466e-07,
"loss": 0.4898,
"step": 13000
},
{
"epoch": 0.7223300970873786,
"grad_norm": 0.3636089563369751,
"learning_rate": 1.3446891737037762e-07,
"loss": 0.5415,
"step": 13020
},
{
"epoch": 0.7234396671289876,
"grad_norm": 0.38493213057518005,
"learning_rate": 1.3350399359100623e-07,
"loss": 0.4693,
"step": 13040
},
{
"epoch": 0.7245492371705964,
"grad_norm": 0.3705246150493622,
"learning_rate": 1.3254128157961486e-07,
"loss": 0.5556,
"step": 13060
},
{
"epoch": 0.7256588072122053,
"grad_norm": 0.501947820186615,
"learning_rate": 1.3158079961404534e-07,
"loss": 0.5332,
"step": 13080
},
{
"epoch": 0.7267683772538142,
"grad_norm": 0.2879910171031952,
"learning_rate": 1.3062256592980064e-07,
"loss": 0.5306,
"step": 13100
},
{
"epoch": 0.727877947295423,
"grad_norm": 0.6388247013092041,
"learning_rate": 1.296665987196983e-07,
"loss": 0.4829,
"step": 13120
},
{
"epoch": 0.7289875173370319,
"grad_norm": 0.34465721249580383,
"learning_rate": 1.2871291613352477e-07,
"loss": 0.4307,
"step": 13140
},
{
"epoch": 0.7300970873786408,
"grad_norm": 0.34914347529411316,
"learning_rate": 1.2776153627769159e-07,
"loss": 0.5307,
"step": 13160
},
{
"epoch": 0.7312066574202497,
"grad_norm": 0.3551192283630371,
"learning_rate": 1.2681247721489074e-07,
"loss": 0.5591,
"step": 13180
},
{
"epoch": 0.7323162274618585,
"grad_norm": 0.43952858448028564,
"learning_rate": 1.2586575696375238e-07,
"loss": 0.5065,
"step": 13200
},
{
"epoch": 0.7334257975034674,
"grad_norm": 0.4221738576889038,
"learning_rate": 1.249213934985025e-07,
"loss": 0.5212,
"step": 13220
},
{
"epoch": 0.7345353675450763,
"grad_norm": 0.5308628678321838,
"learning_rate": 1.2397940474862144e-07,
"loss": 0.4936,
"step": 13240
},
{
"epoch": 0.7356449375866851,
"grad_norm": 0.45746496319770813,
"learning_rate": 1.2303980859850402e-07,
"loss": 0.4479,
"step": 13260
},
{
"epoch": 0.7367545076282941,
"grad_norm": 0.38960978388786316,
"learning_rate": 1.2210262288711933e-07,
"loss": 0.4848,
"step": 13280
},
{
"epoch": 0.7378640776699029,
"grad_norm": 0.7810402512550354,
"learning_rate": 1.2116786540767267e-07,
"loss": 0.4522,
"step": 13300
},
{
"epoch": 0.7389736477115117,
"grad_norm": 0.7264504432678223,
"learning_rate": 1.2023555390726748e-07,
"loss": 0.5517,
"step": 13320
},
{
"epoch": 0.7400832177531207,
"grad_norm": 0.3533945381641388,
"learning_rate": 1.1930570608656803e-07,
"loss": 0.4049,
"step": 13340
},
{
"epoch": 0.7411927877947295,
"grad_norm": 0.25890418887138367,
"learning_rate": 1.183783395994641e-07,
"loss": 0.5448,
"step": 13360
},
{
"epoch": 0.7423023578363385,
"grad_norm": 0.279067724943161,
"learning_rate": 1.1745347205273506e-07,
"loss": 0.5113,
"step": 13380
},
{
"epoch": 0.7434119278779473,
"grad_norm": 0.31982362270355225,
"learning_rate": 1.1653112100571619e-07,
"loss": 0.5634,
"step": 13400
},
{
"epoch": 0.7445214979195561,
"grad_norm": 0.3901461064815521,
"learning_rate": 1.1561130396996508e-07,
"loss": 0.5766,
"step": 13420
},
{
"epoch": 0.7456310679611651,
"grad_norm": 0.32104188203811646,
"learning_rate": 1.146940384089288e-07,
"loss": 0.4248,
"step": 13440
},
{
"epoch": 0.7467406380027739,
"grad_norm": 0.27771735191345215,
"learning_rate": 1.1377934173761311e-07,
"loss": 0.4721,
"step": 13460
},
{
"epoch": 0.7478502080443828,
"grad_norm": 0.4484061300754547,
"learning_rate": 1.1286723132225095e-07,
"loss": 0.4968,
"step": 13480
},
{
"epoch": 0.7489597780859917,
"grad_norm": 0.2738656997680664,
"learning_rate": 1.1195772447997348e-07,
"loss": 0.5468,
"step": 13500
},
{
"epoch": 0.7500693481276005,
"grad_norm": 0.3913407325744629,
"learning_rate": 1.1105083847848101e-07,
"loss": 0.5727,
"step": 13520
},
{
"epoch": 0.7511789181692095,
"grad_norm": 0.46406638622283936,
"learning_rate": 1.1014659053571476e-07,
"loss": 0.4827,
"step": 13540
},
{
"epoch": 0.7522884882108183,
"grad_norm": 0.30459386110305786,
"learning_rate": 1.092449978195308e-07,
"loss": 0.5731,
"step": 13560
},
{
"epoch": 0.7533980582524272,
"grad_norm": 0.4219912588596344,
"learning_rate": 1.0834607744737329e-07,
"loss": 0.5629,
"step": 13580
},
{
"epoch": 0.7545076282940361,
"grad_norm": 0.5843199491500854,
"learning_rate": 1.0744984648595006e-07,
"loss": 0.5359,
"step": 13600
},
{
"epoch": 0.7556171983356449,
"grad_norm": 0.31454548239707947,
"learning_rate": 1.0655632195090822e-07,
"loss": 0.4659,
"step": 13620
},
{
"epoch": 0.7567267683772538,
"grad_norm": 0.4814988374710083,
"learning_rate": 1.0566552080651133e-07,
"loss": 0.4961,
"step": 13640
},
{
"epoch": 0.7578363384188627,
"grad_norm": 0.32601091265678406,
"learning_rate": 1.0477745996531739e-07,
"loss": 0.4892,
"step": 13660
},
{
"epoch": 0.7589459084604716,
"grad_norm": 0.46707651019096375,
"learning_rate": 1.0389215628785725e-07,
"loss": 0.4755,
"step": 13680
},
{
"epoch": 0.7600554785020804,
"grad_norm": 0.31303638219833374,
"learning_rate": 1.0300962658231521e-07,
"loss": 0.4734,
"step": 13700
},
{
"epoch": 0.7611650485436893,
"grad_norm": 0.3001532554626465,
"learning_rate": 1.0212988760420918e-07,
"loss": 0.5897,
"step": 13720
},
{
"epoch": 0.7622746185852982,
"grad_norm": 0.26135823130607605,
"learning_rate": 1.0125295605607324e-07,
"loss": 0.5347,
"step": 13740
},
{
"epoch": 0.763384188626907,
"grad_norm": 0.5185014009475708,
"learning_rate": 1.0037884858714012e-07,
"loss": 0.4531,
"step": 13760
},
{
"epoch": 0.764493758668516,
"grad_norm": 0.4258882999420166,
"learning_rate": 9.950758179302504e-08,
"loss": 0.5889,
"step": 13780
},
{
"epoch": 0.7656033287101248,
"grad_norm": 0.2963704466819763,
"learning_rate": 9.863917221541104e-08,
"loss": 0.4763,
"step": 13800
},
{
"epoch": 0.7667128987517337,
"grad_norm": 0.41408637166023254,
"learning_rate": 9.777363634173436e-08,
"loss": 0.4918,
"step": 13820
},
{
"epoch": 0.7678224687933426,
"grad_norm": 0.563586950302124,
"learning_rate": 9.691099060487196e-08,
"loss": 0.5427,
"step": 13840
},
{
"epoch": 0.7689320388349514,
"grad_norm": 0.41873815655708313,
"learning_rate": 9.605125138282935e-08,
"loss": 0.4846,
"step": 13860
},
{
"epoch": 0.7700416088765604,
"grad_norm": 0.333218514919281,
"learning_rate": 9.519443499842919e-08,
"loss": 0.4646,
"step": 13880
},
{
"epoch": 0.7711511789181692,
"grad_norm": 0.3226572573184967,
"learning_rate": 9.434055771900227e-08,
"loss": 0.5374,
"step": 13900
},
{
"epoch": 0.772260748959778,
"grad_norm": 0.3277279734611511,
"learning_rate": 9.348963575607771e-08,
"loss": 0.5319,
"step": 13920
},
{
"epoch": 0.773370319001387,
"grad_norm": 0.3328832983970642,
"learning_rate": 9.264168526507593e-08,
"loss": 0.593,
"step": 13940
},
{
"epoch": 0.7744798890429958,
"grad_norm": 0.4502674341201782,
"learning_rate": 9.179672234500166e-08,
"loss": 0.4532,
"step": 13960
},
{
"epoch": 0.7755894590846047,
"grad_norm": 0.3654020130634308,
"learning_rate": 9.095476303813796e-08,
"loss": 0.4858,
"step": 13980
},
{
"epoch": 0.7766990291262136,
"grad_norm": 0.2795443832874298,
"learning_rate": 9.011582332974227e-08,
"loss": 0.4836,
"step": 14000
},
{
"epoch": 0.7778085991678225,
"grad_norm": 0.4435333013534546,
"learning_rate": 8.927991914774227e-08,
"loss": 0.5314,
"step": 14020
},
{
"epoch": 0.7789181692094314,
"grad_norm": 0.41879114508628845,
"learning_rate": 8.844706636243404e-08,
"loss": 0.4772,
"step": 14040
},
{
"epoch": 0.7800277392510402,
"grad_norm": 0.2757185399532318,
"learning_rate": 8.761728078618049e-08,
"loss": 0.513,
"step": 14060
},
{
"epoch": 0.7811373092926491,
"grad_norm": 0.4560401439666748,
"learning_rate": 8.679057817311095e-08,
"loss": 0.5303,
"step": 14080
},
{
"epoch": 0.782246879334258,
"grad_norm": 0.3912280797958374,
"learning_rate": 8.596697421882257e-08,
"loss": 0.4567,
"step": 14100
},
{
"epoch": 0.7833564493758669,
"grad_norm": 0.5057780146598816,
"learning_rate": 8.514648456008173e-08,
"loss": 0.4742,
"step": 14120
},
{
"epoch": 0.7844660194174757,
"grad_norm": 0.33308303356170654,
"learning_rate": 8.43291247745277e-08,
"loss": 0.5547,
"step": 14140
},
{
"epoch": 0.7855755894590846,
"grad_norm": 0.3485460877418518,
"learning_rate": 8.351491038037662e-08,
"loss": 0.4894,
"step": 14160
},
{
"epoch": 0.7866851595006935,
"grad_norm": 0.46615713834762573,
"learning_rate": 8.270385683612674e-08,
"loss": 0.3763,
"step": 14180
},
{
"epoch": 0.7877947295423023,
"grad_norm": 0.3317703902721405,
"learning_rate": 8.189597954026539e-08,
"loss": 0.4526,
"step": 14200
},
{
"epoch": 0.7889042995839113,
"grad_norm": 0.4380096197128296,
"learning_rate": 8.1091293830976e-08,
"loss": 0.5891,
"step": 14220
},
{
"epoch": 0.7900138696255201,
"grad_norm": 0.3943984806537628,
"learning_rate": 8.028981498584745e-08,
"loss": 0.563,
"step": 14240
},
{
"epoch": 0.791123439667129,
"grad_norm": 0.3383914828300476,
"learning_rate": 7.949155822158385e-08,
"loss": 0.5196,
"step": 14260
},
{
"epoch": 0.7922330097087379,
"grad_norm": 0.33651596307754517,
"learning_rate": 7.869653869371528e-08,
"loss": 0.5427,
"step": 14280
},
{
"epoch": 0.7933425797503467,
"grad_norm": 0.42295658588409424,
"learning_rate": 7.790477149631072e-08,
"loss": 0.5018,
"step": 14300
},
{
"epoch": 0.7944521497919557,
"grad_norm": 0.39132261276245117,
"learning_rate": 7.711627166169073e-08,
"loss": 0.4734,
"step": 14320
},
{
"epoch": 0.7955617198335645,
"grad_norm": 0.36338910460472107,
"learning_rate": 7.633105416014277e-08,
"loss": 0.4265,
"step": 14340
},
{
"epoch": 0.7966712898751733,
"grad_norm": 0.3046381175518036,
"learning_rate": 7.554913389963646e-08,
"loss": 0.4241,
"step": 14360
},
{
"epoch": 0.7977808599167823,
"grad_norm": 0.3695002794265747,
"learning_rate": 7.477052572554065e-08,
"loss": 0.4685,
"step": 14380
},
{
"epoch": 0.7988904299583911,
"grad_norm": 0.3680543601512909,
"learning_rate": 7.399524442034188e-08,
"loss": 0.5151,
"step": 14400
},
{
"epoch": 0.8,
"grad_norm": 0.4030877649784088,
"learning_rate": 7.322330470336313e-08,
"loss": 0.568,
"step": 14420
},
{
"epoch": 0.8011095700416089,
"grad_norm": 0.5655102729797363,
"learning_rate": 7.245472123048499e-08,
"loss": 0.4919,
"step": 14440
},
{
"epoch": 0.8022191400832177,
"grad_norm": 0.3112393319606781,
"learning_rate": 7.168950859386714e-08,
"loss": 0.5639,
"step": 14460
},
{
"epoch": 0.8033287101248267,
"grad_norm": 2.190739393234253,
"learning_rate": 7.092768132167098e-08,
"loss": 0.5222,
"step": 14480
},
{
"epoch": 0.8044382801664355,
"grad_norm": 0.33552542328834534,
"learning_rate": 7.01692538777845e-08,
"loss": 0.5515,
"step": 14500
},
{
"epoch": 0.8055478502080444,
"grad_norm": 0.5092620253562927,
"learning_rate": 6.941424066154697e-08,
"loss": 0.6103,
"step": 14520
},
{
"epoch": 0.8066574202496533,
"grad_norm": 0.4809440076351166,
"learning_rate": 6.866265600747604e-08,
"loss": 0.5302,
"step": 14540
},
{
"epoch": 0.8077669902912621,
"grad_norm": 0.41583776473999023,
"learning_rate": 6.79145141849955e-08,
"loss": 0.5071,
"step": 14560
},
{
"epoch": 0.808876560332871,
"grad_norm": 0.3800281882286072,
"learning_rate": 6.716982939816398e-08,
"loss": 0.42,
"step": 14580
},
{
"epoch": 0.8099861303744799,
"grad_norm": 0.32890620827674866,
"learning_rate": 6.642861578540595e-08,
"loss": 0.422,
"step": 14600
},
{
"epoch": 0.8110957004160888,
"grad_norm": 0.33213478326797485,
"learning_rate": 6.569088741924261e-08,
"loss": 0.4859,
"step": 14620
},
{
"epoch": 0.8122052704576976,
"grad_norm": 0.2846478521823883,
"learning_rate": 6.495665830602518e-08,
"loss": 0.5174,
"step": 14640
},
{
"epoch": 0.8133148404993065,
"grad_norm": 0.5666544437408447,
"learning_rate": 6.42259423856689e-08,
"loss": 0.4581,
"step": 14660
},
{
"epoch": 0.8144244105409154,
"grad_norm": 0.39947426319122314,
"learning_rate": 6.349875353138801e-08,
"loss": 0.4929,
"step": 14680
},
{
"epoch": 0.8155339805825242,
"grad_norm": 0.7812756299972534,
"learning_rate": 6.277510554943294e-08,
"loss": 0.5503,
"step": 14700
},
{
"epoch": 0.8166435506241332,
"grad_norm": 0.2935800850391388,
"learning_rate": 6.205501217882766e-08,
"loss": 0.5464,
"step": 14720
},
{
"epoch": 0.817753120665742,
"grad_norm": 0.5092408061027527,
"learning_rate": 6.13384870911092e-08,
"loss": 0.515,
"step": 14740
},
{
"epoch": 0.8188626907073508,
"grad_norm": 0.3239974081516266,
"learning_rate": 6.062554389006794e-08,
"loss": 0.5617,
"step": 14760
},
{
"epoch": 0.8199722607489598,
"grad_norm": 0.3289891481399536,
"learning_rate": 5.991619611148918e-08,
"loss": 0.4832,
"step": 14780
},
{
"epoch": 0.8210818307905686,
"grad_norm": 0.37206193804740906,
"learning_rate": 5.9210457222896524e-08,
"loss": 0.4863,
"step": 14800
},
{
"epoch": 0.8221914008321776,
"grad_norm": 0.3518655300140381,
"learning_rate": 5.850834062329574e-08,
"loss": 0.4942,
"step": 14820
},
{
"epoch": 0.8233009708737864,
"grad_norm": 0.3711952567100525,
"learning_rate": 5.780985964292079e-08,
"loss": 0.5641,
"step": 14840
},
{
"epoch": 0.8244105409153952,
"grad_norm": 0.41170036792755127,
"learning_rate": 5.711502754298059e-08,
"loss": 0.4882,
"step": 14860
},
{
"epoch": 0.8255201109570042,
"grad_norm": 0.5306410193443298,
"learning_rate": 5.6423857515406876e-08,
"loss": 0.5864,
"step": 14880
},
{
"epoch": 0.826629680998613,
"grad_norm": 0.34095829725265503,
"learning_rate": 5.573636268260451e-08,
"loss": 0.5834,
"step": 14900
},
{
"epoch": 0.827739251040222,
"grad_norm": 0.245326429605484,
"learning_rate": 5.5052556097201525e-08,
"loss": 0.4505,
"step": 14920
},
{
"epoch": 0.8288488210818308,
"grad_norm": 0.514102041721344,
"learning_rate": 5.437245074180191e-08,
"loss": 0.4891,
"step": 14940
},
{
"epoch": 0.8299583911234397,
"grad_norm": 0.8509578704833984,
"learning_rate": 5.369605952873887e-08,
"loss": 0.6081,
"step": 14960
},
{
"epoch": 0.8310679611650486,
"grad_norm": 0.35718920826911926,
"learning_rate": 5.302339529982961e-08,
"loss": 0.5393,
"step": 14980
},
{
"epoch": 0.8321775312066574,
"grad_norm": 0.35100287199020386,
"learning_rate": 5.2354470826131785e-08,
"loss": 0.5476,
"step": 15000
},
{
"epoch": 0.8332871012482663,
"grad_norm": 0.37764522433280945,
"learning_rate": 5.168929880770062e-08,
"loss": 0.475,
"step": 15020
},
{
"epoch": 0.8343966712898752,
"grad_norm": 0.4380689859390259,
"learning_rate": 5.102789187334827e-08,
"loss": 0.4952,
"step": 15040
},
{
"epoch": 0.8355062413314841,
"grad_norm": 0.40711092948913574,
"learning_rate": 5.0370262580403775e-08,
"loss": 0.4711,
"step": 15060
},
{
"epoch": 0.8366158113730929,
"grad_norm": 0.3597396910190582,
"learning_rate": 4.9716423414474515e-08,
"loss": 0.4656,
"step": 15080
},
{
"epoch": 0.8377253814147018,
"grad_norm": 0.30235543847084045,
"learning_rate": 4.906638678920963e-08,
"loss": 0.5144,
"step": 15100
},
{
"epoch": 0.8388349514563107,
"grad_norm": 0.3047267198562622,
"learning_rate": 4.842016504606375e-08,
"loss": 0.4962,
"step": 15120
},
{
"epoch": 0.8399445214979195,
"grad_norm": 0.296040415763855,
"learning_rate": 4.777777045406314e-08,
"loss": 0.4285,
"step": 15140
},
{
"epoch": 0.8410540915395285,
"grad_norm": 0.2601630985736847,
"learning_rate": 4.71392152095727e-08,
"loss": 0.4683,
"step": 15160
},
{
"epoch": 0.8421636615811373,
"grad_norm": 0.42486798763275146,
"learning_rate": 4.6504511436064014e-08,
"loss": 0.5188,
"step": 15180
},
{
"epoch": 0.8432732316227461,
"grad_norm": 0.4439660310745239,
"learning_rate": 4.587367118388577e-08,
"loss": 0.4948,
"step": 15200
},
{
"epoch": 0.8443828016643551,
"grad_norm": 0.36130619049072266,
"learning_rate": 4.5246706430034445e-08,
"loss": 0.52,
"step": 15220
},
{
"epoch": 0.8454923717059639,
"grad_norm": 0.3554399609565735,
"learning_rate": 4.4623629077927296e-08,
"loss": 0.4171,
"step": 15240
},
{
"epoch": 0.8466019417475729,
"grad_norm": 0.3487074673175812,
"learning_rate": 4.40044509571762e-08,
"loss": 0.4701,
"step": 15260
},
{
"epoch": 0.8477115117891817,
"grad_norm": 0.7752673029899597,
"learning_rate": 4.338918382336296e-08,
"loss": 0.4984,
"step": 15280
},
{
"epoch": 0.8488210818307905,
"grad_norm": 0.3020077347755432,
"learning_rate": 4.277783935781637e-08,
"loss": 0.4251,
"step": 15300
},
{
"epoch": 0.8499306518723995,
"grad_norm": 3.607598304748535,
"learning_rate": 4.217042916739011e-08,
"loss": 0.4703,
"step": 15320
},
{
"epoch": 0.8510402219140083,
"grad_norm": 0.4934079945087433,
"learning_rate": 4.156696478424279e-08,
"loss": 0.4898,
"step": 15340
},
{
"epoch": 0.8521497919556172,
"grad_norm": 0.3440416753292084,
"learning_rate": 4.096745766561857e-08,
"loss": 0.4242,
"step": 15360
},
{
"epoch": 0.8532593619972261,
"grad_norm": 0.43925386667251587,
"learning_rate": 4.0371919193629975e-08,
"loss": 0.5167,
"step": 15380
},
{
"epoch": 0.8543689320388349,
"grad_norm": 0.40181154012680054,
"learning_rate": 3.9780360675041675e-08,
"loss": 0.4832,
"step": 15400
},
{
"epoch": 0.8554785020804438,
"grad_norm": 0.49073562026023865,
"learning_rate": 3.9192793341055655e-08,
"loss": 0.4619,
"step": 15420
},
{
"epoch": 0.8565880721220527,
"grad_norm": 0.3399178087711334,
"learning_rate": 3.860922834709832e-08,
"loss": 0.4904,
"step": 15440
},
{
"epoch": 0.8576976421636616,
"grad_norm": 0.3309305012226105,
"learning_rate": 3.8029676772608324e-08,
"loss": 0.5175,
"step": 15460
},
{
"epoch": 0.8588072122052705,
"grad_norm": 0.33893635869026184,
"learning_rate": 3.745414962082655e-08,
"loss": 0.5904,
"step": 15480
},
{
"epoch": 0.8599167822468793,
"grad_norm": 0.4869129955768585,
"learning_rate": 3.688265781858707e-08,
"loss": 0.4194,
"step": 15500
},
{
"epoch": 0.8610263522884882,
"grad_norm": 0.4826425015926361,
"learning_rate": 3.631521221610953e-08,
"loss": 0.4774,
"step": 15520
},
{
"epoch": 0.8621359223300971,
"grad_norm": 0.4436647295951843,
"learning_rate": 3.575182358679349e-08,
"loss": 0.5091,
"step": 15540
},
{
"epoch": 0.863245492371706,
"grad_norm": 0.3870086669921875,
"learning_rate": 3.5192502627013535e-08,
"loss": 0.4934,
"step": 15560
},
{
"epoch": 0.8643550624133148,
"grad_norm": 0.3462676405906677,
"learning_rate": 3.463725995591646e-08,
"loss": 0.5185,
"step": 15580
},
{
"epoch": 0.8654646324549237,
"grad_norm": 0.3750855028629303,
"learning_rate": 3.408610611521959e-08,
"loss": 0.4889,
"step": 15600
},
{
"epoch": 0.8665742024965326,
"grad_norm": 0.39250943064689636,
"learning_rate": 3.3539051569010376e-08,
"loss": 0.5594,
"step": 15620
},
{
"epoch": 0.8676837725381414,
"grad_norm": 0.6177974343299866,
"learning_rate": 3.29961067035483e-08,
"loss": 0.5567,
"step": 15640
},
{
"epoch": 0.8687933425797504,
"grad_norm": 0.8788308501243591,
"learning_rate": 3.245728182706695e-08,
"loss": 0.5487,
"step": 15660
},
{
"epoch": 0.8699029126213592,
"grad_norm": 0.3534790277481079,
"learning_rate": 3.1922587169578965e-08,
"loss": 0.5047,
"step": 15680
},
{
"epoch": 0.871012482662968,
"grad_norm": 0.7439823746681213,
"learning_rate": 3.1392032882681524e-08,
"loss": 0.619,
"step": 15700
},
{
"epoch": 0.872122052704577,
"grad_norm": 0.43660464882850647,
"learning_rate": 3.086562903936343e-08,
"loss": 0.5613,
"step": 15720
},
{
"epoch": 0.8732316227461858,
"grad_norm": 0.2849920988082886,
"learning_rate": 3.0343385633814336e-08,
"loss": 0.5407,
"step": 15740
},
{
"epoch": 0.8743411927877948,
"grad_norm": 0.46800124645233154,
"learning_rate": 2.982531258123447e-08,
"loss": 0.5268,
"step": 15760
},
{
"epoch": 0.8754507628294036,
"grad_norm": 0.33923402428627014,
"learning_rate": 2.931141971764675e-08,
"loss": 0.5359,
"step": 15780
},
{
"epoch": 0.8765603328710125,
"grad_norm": 0.5203589200973511,
"learning_rate": 2.880171679971005e-08,
"loss": 0.4298,
"step": 15800
},
{
"epoch": 0.8776699029126214,
"grad_norm": 0.48814857006073,
"learning_rate": 2.8296213504533596e-08,
"loss": 0.4622,
"step": 15820
},
{
"epoch": 0.8787794729542302,
"grad_norm": 0.32717978954315186,
"learning_rate": 2.779491942949369e-08,
"loss": 0.4351,
"step": 15840
},
{
"epoch": 0.8798890429958391,
"grad_norm": 0.33301976323127747,
"learning_rate": 2.7297844092051104e-08,
"loss": 0.4853,
"step": 15860
},
{
"epoch": 0.880998613037448,
"grad_norm": 0.42914196848869324,
"learning_rate": 2.680499692957078e-08,
"loss": 0.5133,
"step": 15880
},
{
"epoch": 0.8821081830790569,
"grad_norm": 0.3375394344329834,
"learning_rate": 2.6316387299142374e-08,
"loss": 0.514,
"step": 15900
},
{
"epoch": 0.8832177531206657,
"grad_norm": 0.33002445101737976,
"learning_rate": 2.5832024477402543e-08,
"loss": 0.4487,
"step": 15920
},
{
"epoch": 0.8843273231622746,
"grad_norm": 0.5125333666801453,
"learning_rate": 2.535191766035913e-08,
"loss": 0.5942,
"step": 15940
},
{
"epoch": 0.8854368932038835,
"grad_norm": 0.424376517534256,
"learning_rate": 2.4876075963216226e-08,
"loss": 0.5574,
"step": 15960
},
{
"epoch": 0.8865464632454924,
"grad_norm": 1.1631702184677124,
"learning_rate": 2.4404508420201446e-08,
"loss": 0.5152,
"step": 15980
},
{
"epoch": 0.8876560332871013,
"grad_norm": 0.42409613728523254,
"learning_rate": 2.3937223984394212e-08,
"loss": 0.5859,
"step": 16000
},
{
"epoch": 0.8887656033287101,
"grad_norm": 0.32093319296836853,
"learning_rate": 2.3474231527555595e-08,
"loss": 0.5776,
"step": 16020
},
{
"epoch": 0.889875173370319,
"grad_norm": 0.44207271933555603,
"learning_rate": 2.301553983996041e-08,
"loss": 0.5397,
"step": 16040
},
{
"epoch": 0.8909847434119279,
"grad_norm": 0.29899469017982483,
"learning_rate": 2.2561157630229673e-08,
"loss": 0.5171,
"step": 16060
},
{
"epoch": 0.8920943134535367,
"grad_norm": 0.49811238050460815,
"learning_rate": 2.2111093525165826e-08,
"loss": 0.5837,
"step": 16080
},
{
"epoch": 0.8932038834951457,
"grad_norm": 0.36482110619544983,
"learning_rate": 2.1665356069588607e-08,
"loss": 0.5252,
"step": 16100
},
{
"epoch": 0.8943134535367545,
"grad_norm": 0.40325450897216797,
"learning_rate": 2.1223953726172917e-08,
"loss": 0.5157,
"step": 16120
},
{
"epoch": 0.8954230235783633,
"grad_norm": 0.2874036431312561,
"learning_rate": 2.078689487528823e-08,
"loss": 0.5223,
"step": 16140
},
{
"epoch": 0.8965325936199723,
"grad_norm": 0.460287481546402,
"learning_rate": 2.0354187814839248e-08,
"loss": 0.6041,
"step": 16160
},
{
"epoch": 0.8976421636615811,
"grad_norm": 0.3832845687866211,
"learning_rate": 1.992584076010867e-08,
"loss": 0.5905,
"step": 16180
},
{
"epoch": 0.8987517337031901,
"grad_norm": 0.47805336117744446,
"learning_rate": 1.9501861843601114e-08,
"loss": 0.4894,
"step": 16200
},
{
"epoch": 0.8998613037447989,
"grad_norm": 0.34525078535079956,
"learning_rate": 1.9082259114888477e-08,
"loss": 0.555,
"step": 16220
},
{
"epoch": 0.9009708737864077,
"grad_norm": 0.37707841396331787,
"learning_rate": 1.8667040540457423e-08,
"loss": 0.4523,
"step": 16240
},
{
"epoch": 0.9020804438280167,
"grad_norm": 0.4305242896080017,
"learning_rate": 1.8256214003558035e-08,
"loss": 0.5538,
"step": 16260
},
{
"epoch": 0.9031900138696255,
"grad_norm": 0.4085891842842102,
"learning_rate": 1.7849787304054093e-08,
"loss": 0.5101,
"step": 16280
},
{
"epoch": 0.9042995839112344,
"grad_norm": 0.44601982831954956,
"learning_rate": 1.7447768158274923e-08,
"loss": 0.4732,
"step": 16300
},
{
"epoch": 0.9054091539528433,
"grad_norm": 0.3422205150127411,
"learning_rate": 1.7050164198869148e-08,
"loss": 0.4478,
"step": 16320
},
{
"epoch": 0.9065187239944521,
"grad_norm": 0.26549020409584045,
"learning_rate": 1.6656982974659563e-08,
"loss": 0.5429,
"step": 16340
},
{
"epoch": 0.907628294036061,
"grad_norm": 0.47383949160575867,
"learning_rate": 1.6268231950499727e-08,
"loss": 0.5087,
"step": 16360
},
{
"epoch": 0.9087378640776699,
"grad_norm": 1.3218978643417358,
"learning_rate": 1.5883918507132637e-08,
"loss": 0.5044,
"step": 16380
},
{
"epoch": 0.9098474341192788,
"grad_norm": 0.3553486168384552,
"learning_rate": 1.550404994105009e-08,
"loss": 0.5442,
"step": 16400
},
{
"epoch": 0.9109570041608877,
"grad_norm": 0.2937772274017334,
"learning_rate": 1.5128633464354584e-08,
"loss": 0.4458,
"step": 16420
},
{
"epoch": 0.9120665742024965,
"grad_norm": 0.49511197209358215,
"learning_rate": 1.475767620462215e-08,
"loss": 0.4199,
"step": 16440
},
{
"epoch": 0.9131761442441054,
"grad_norm": 0.7402114272117615,
"learning_rate": 1.439118520476701e-08,
"loss": 0.5255,
"step": 16460
},
{
"epoch": 0.9142857142857143,
"grad_norm": 0.44214996695518494,
"learning_rate": 1.4029167422908105e-08,
"loss": 0.4961,
"step": 16480
},
{
"epoch": 0.9153952843273232,
"grad_norm": 0.42241570353507996,
"learning_rate": 1.3671629732236679e-08,
"loss": 0.5096,
"step": 16500
},
{
"epoch": 0.916504854368932,
"grad_norm": 0.5015203952789307,
"learning_rate": 1.3318578920886003e-08,
"loss": 0.549,
"step": 16520
},
{
"epoch": 0.9176144244105409,
"grad_norm": 0.4513172209262848,
"learning_rate": 1.2970021691802475e-08,
"loss": 0.5027,
"step": 16540
},
{
"epoch": 0.9187239944521498,
"grad_norm": 0.3667598366737366,
"learning_rate": 1.2625964662618172e-08,
"loss": 0.4524,
"step": 16560
},
{
"epoch": 0.9198335644937586,
"grad_norm": 0.3975818157196045,
"learning_rate": 1.2286414365525494e-08,
"loss": 0.4872,
"step": 16580
},
{
"epoch": 0.9209431345353676,
"grad_norm": 0.4363032281398773,
"learning_rate": 1.1951377247152867e-08,
"loss": 0.6175,
"step": 16600
},
{
"epoch": 0.9220527045769764,
"grad_norm": 0.2995266318321228,
"learning_rate": 1.162085966844259e-08,
"loss": 0.5223,
"step": 16620
},
{
"epoch": 0.9231622746185852,
"grad_norm": 0.29473140835762024,
"learning_rate": 1.1294867904529992e-08,
"loss": 0.5011,
"step": 16640
},
{
"epoch": 0.9242718446601942,
"grad_norm": 0.3131171464920044,
"learning_rate": 1.097340814462408e-08,
"loss": 0.5525,
"step": 16660
},
{
"epoch": 0.925381414701803,
"grad_norm": 0.29646238684654236,
"learning_rate": 1.065648649189041e-08,
"loss": 0.4261,
"step": 16680
},
{
"epoch": 0.926490984743412,
"grad_norm": 0.3700522482395172,
"learning_rate": 1.0344108963334847e-08,
"loss": 0.4667,
"step": 16700
},
{
"epoch": 0.9276005547850208,
"grad_norm": 0.5135470628738403,
"learning_rate": 1.003628148968963e-08,
"loss": 0.5734,
"step": 16720
},
{
"epoch": 0.9287101248266297,
"grad_norm": 0.3333655595779419,
"learning_rate": 9.733009915300628e-09,
"loss": 0.5045,
"step": 16740
},
{
"epoch": 0.9298196948682386,
"grad_norm": 0.5014081001281738,
"learning_rate": 9.434299998016287e-09,
"loss": 0.5693,
"step": 16760
},
{
"epoch": 0.9309292649098474,
"grad_norm": 0.33439749479293823,
"learning_rate": 9.140157409078559e-09,
"loss": 0.5434,
"step": 16780
},
{
"epoch": 0.9320388349514563,
"grad_norm": 0.28119370341300964,
"learning_rate": 8.850587733014947e-09,
"loss": 0.5789,
"step": 16800
},
{
"epoch": 0.9331484049930652,
"grad_norm": 0.3373952805995941,
"learning_rate": 8.565596467532715e-09,
"loss": 0.4614,
"step": 16820
},
{
"epoch": 0.9342579750346741,
"grad_norm": 0.4669179916381836,
"learning_rate": 8.28518902341438e-09,
"loss": 0.5021,
"step": 16840
},
{
"epoch": 0.935367545076283,
"grad_norm": 0.45809802412986755,
"learning_rate": 8.009370724415015e-09,
"loss": 0.5104,
"step": 16860
},
{
"epoch": 0.9364771151178918,
"grad_norm": 0.40159252285957336,
"learning_rate": 7.738146807161255e-09,
"loss": 0.5569,
"step": 16880
},
{
"epoch": 0.9375866851595007,
"grad_norm": 0.34096261858940125,
"learning_rate": 7.471522421051618e-09,
"loss": 0.5477,
"step": 16900
},
{
"epoch": 0.9386962552011096,
"grad_norm": 0.4823736250400543,
"learning_rate": 7.209502628159142e-09,
"loss": 0.4552,
"step": 16920
},
{
"epoch": 0.9398058252427185,
"grad_norm": 0.37772753834724426,
"learning_rate": 6.952092403134851e-09,
"loss": 0.4999,
"step": 16940
},
{
"epoch": 0.9409153952843273,
"grad_norm": 0.44477227330207825,
"learning_rate": 6.69929663311361e-09,
"loss": 0.596,
"step": 16960
},
{
"epoch": 0.9420249653259362,
"grad_norm": 0.27438193559646606,
"learning_rate": 6.451120117621306e-09,
"loss": 0.483,
"step": 16980
},
{
"epoch": 0.9431345353675451,
"grad_norm": 0.3919523060321808,
"learning_rate": 6.2075675684835075e-09,
"loss": 0.4991,
"step": 17000
},
{
"epoch": 0.9442441054091539,
"grad_norm": 0.36142680048942566,
"learning_rate": 5.968643609736257e-09,
"loss": 0.4884,
"step": 17020
},
{
"epoch": 0.9453536754507629,
"grad_norm": 0.37626388669013977,
"learning_rate": 5.734352777538143e-09,
"loss": 0.473,
"step": 17040
},
{
"epoch": 0.9464632454923717,
"grad_norm": 0.8457902073860168,
"learning_rate": 5.504699520084227e-09,
"loss": 0.4457,
"step": 17060
},
{
"epoch": 0.9475728155339805,
"grad_norm": 0.26701247692108154,
"learning_rate": 5.279688197521643e-09,
"loss": 0.506,
"step": 17080
},
{
"epoch": 0.9486823855755895,
"grad_norm": 0.33689752221107483,
"learning_rate": 5.059323081866601e-09,
"loss": 0.4893,
"step": 17100
},
{
"epoch": 0.9497919556171983,
"grad_norm": 0.30724218487739563,
"learning_rate": 4.8436083569236004e-09,
"loss": 0.4775,
"step": 17120
},
{
"epoch": 0.9509015256588073,
"grad_norm": 0.3921775817871094,
"learning_rate": 4.632548118205681e-09,
"loss": 0.6024,
"step": 17140
},
{
"epoch": 0.9520110957004161,
"grad_norm": 0.465593159198761,
"learning_rate": 4.4261463728569315e-09,
"loss": 0.5698,
"step": 17160
},
{
"epoch": 0.9531206657420249,
"grad_norm": 0.3770931661128998,
"learning_rate": 4.224407039576244e-09,
"loss": 0.4477,
"step": 17180
},
{
"epoch": 0.9542302357836339,
"grad_norm": 0.7365812063217163,
"learning_rate": 4.027333948542932e-09,
"loss": 0.4356,
"step": 17200
},
{
"epoch": 0.9553398058252427,
"grad_norm": 0.4825473427772522,
"learning_rate": 3.834930841344119e-09,
"loss": 0.5003,
"step": 17220
},
{
"epoch": 0.9564493758668516,
"grad_norm": 0.4335998296737671,
"learning_rate": 3.6472013709035464e-09,
"loss": 0.5292,
"step": 17240
},
{
"epoch": 0.9575589459084605,
"grad_norm": 0.3976069688796997,
"learning_rate": 3.4641491014123224e-09,
"loss": 0.6148,
"step": 17260
},
{
"epoch": 0.9586685159500693,
"grad_norm": 0.45377933979034424,
"learning_rate": 3.2857775082613115e-09,
"loss": 0.5478,
"step": 17280
},
{
"epoch": 0.9597780859916782,
"grad_norm": 0.35374292731285095,
"learning_rate": 3.1120899779749354e-09,
"loss": 0.4997,
"step": 17300
},
{
"epoch": 0.9608876560332871,
"grad_norm": 0.9163030385971069,
"learning_rate": 2.9430898081471144e-09,
"loss": 0.5127,
"step": 17320
},
{
"epoch": 0.961997226074896,
"grad_norm": 0.4174667000770569,
"learning_rate": 2.7787802073784563e-09,
"loss": 0.4448,
"step": 17340
},
{
"epoch": 0.9631067961165048,
"grad_norm": 0.41283664107322693,
"learning_rate": 2.619164295215581e-09,
"loss": 0.5467,
"step": 17360
},
{
"epoch": 0.9642163661581137,
"grad_norm": 0.39738011360168457,
"learning_rate": 2.4642451020916165e-09,
"loss": 0.5459,
"step": 17380
},
{
"epoch": 0.9653259361997226,
"grad_norm": 0.4814308285713196,
"learning_rate": 2.314025569268879e-09,
"loss": 0.4956,
"step": 17400
},
{
"epoch": 0.9664355062413315,
"grad_norm": 0.3121536076068878,
"learning_rate": 2.1685085487829493e-09,
"loss": 0.5044,
"step": 17420
},
{
"epoch": 0.9675450762829404,
"grad_norm": 0.5314778089523315,
"learning_rate": 2.0276968033884347e-09,
"loss": 0.5479,
"step": 17440
},
{
"epoch": 0.9686546463245492,
"grad_norm": 0.49265632033348083,
"learning_rate": 1.8915930065067365e-09,
"loss": 0.4362,
"step": 17460
},
{
"epoch": 0.9697642163661581,
"grad_norm": 0.3280515670776367,
"learning_rate": 1.760199742175089e-09,
"loss": 0.4533,
"step": 17480
},
{
"epoch": 0.970873786407767,
"grad_norm": 0.3209471106529236,
"learning_rate": 1.6335195049975992e-09,
"loss": 0.523,
"step": 17500
},
{
"epoch": 0.9719833564493758,
"grad_norm": 0.4209744334220886,
"learning_rate": 1.5115547000978113e-09,
"loss": 0.4551,
"step": 17520
},
{
"epoch": 0.9730929264909848,
"grad_norm": 0.4232068359851837,
"learning_rate": 1.3943076430731614e-09,
"loss": 0.4994,
"step": 17540
},
{
"epoch": 0.9742024965325936,
"grad_norm": 0.513118326663971,
"learning_rate": 1.2817805599509014e-09,
"loss": 0.5737,
"step": 17560
},
{
"epoch": 0.9753120665742026,
"grad_norm": 0.4443225860595703,
"learning_rate": 1.173975587145909e-09,
"loss": 0.5862,
"step": 17580
},
{
"epoch": 0.9764216366158114,
"grad_norm": 0.40851354598999023,
"learning_rate": 1.0708947714200557e-09,
"loss": 0.5229,
"step": 17600
},
{
"epoch": 0.9775312066574202,
"grad_norm": 0.2527603209018707,
"learning_rate": 9.725400698434583e-10,
"loss": 0.5171,
"step": 17620
},
{
"epoch": 0.9786407766990292,
"grad_norm": 0.43800926208496094,
"learning_rate": 8.789133497571488e-10,
"loss": 0.5315,
"step": 17640
},
{
"epoch": 0.979750346740638,
"grad_norm": 0.33943334221839905,
"learning_rate": 7.900163887377964e-10,
"loss": 0.5071,
"step": 17660
},
{
"epoch": 0.9808599167822469,
"grad_norm": 0.5179576277732849,
"learning_rate": 7.058508745639014e-10,
"loss": 0.5261,
"step": 17680
},
{
"epoch": 0.9819694868238558,
"grad_norm": 0.6271900534629822,
"learning_rate": 6.264184051837096e-10,
"loss": 0.4876,
"step": 17700
},
{
"epoch": 0.9830790568654646,
"grad_norm": 0.5471246838569641,
"learning_rate": 5.517204886848758e-10,
"loss": 0.4974,
"step": 17720
},
{
"epoch": 0.9841886269070735,
"grad_norm": 0.30157485604286194,
"learning_rate": 4.817585432659032e-10,
"loss": 0.4899,
"step": 17740
},
{
"epoch": 0.9852981969486824,
"grad_norm": 0.7236303091049194,
"learning_rate": 4.1653389720916474e-10,
"loss": 0.5567,
"step": 17760
},
{
"epoch": 0.9864077669902913,
"grad_norm": 0.3008541762828827,
"learning_rate": 3.5604778885564567e-10,
"loss": 0.5912,
"step": 17780
},
{
"epoch": 0.9875173370319001,
"grad_norm": 0.4916051924228668,
"learning_rate": 3.0030136658157343e-10,
"loss": 0.5003,
"step": 17800
},
{
"epoch": 0.988626907073509,
"grad_norm": 0.34007692337036133,
"learning_rate": 2.492956887764075e-10,
"loss": 0.5146,
"step": 17820
},
{
"epoch": 0.9897364771151179,
"grad_norm": 0.35913458466529846,
"learning_rate": 2.0303172382293843e-10,
"loss": 0.5003,
"step": 17840
},
{
"epoch": 0.9908460471567268,
"grad_norm": 0.38319680094718933,
"learning_rate": 1.6151035007883062e-10,
"loss": 0.458,
"step": 17860
},
{
"epoch": 0.9919556171983357,
"grad_norm": 0.4455585181713104,
"learning_rate": 1.2473235585983012e-10,
"loss": 0.4971,
"step": 17880
},
{
"epoch": 0.9930651872399445,
"grad_norm": 0.521392822265625,
"learning_rate": 9.269843942505407e-11,
"loss": 0.5466,
"step": 17900
},
{
"epoch": 0.9941747572815534,
"grad_norm": 0.2978236675262451,
"learning_rate": 6.54092089634739e-11,
"loss": 0.5089,
"step": 17920
},
{
"epoch": 0.9952843273231623,
"grad_norm": 0.4199017584323883,
"learning_rate": 4.286518258250771e-11,
"loss": 0.4961,
"step": 17940
},
{
"epoch": 0.9963938973647711,
"grad_norm": 0.39249876141548157,
"learning_rate": 2.506678829819475e-11,
"loss": 0.4764,
"step": 17960
},
{
"epoch": 0.9975034674063801,
"grad_norm": 0.41262540221214294,
"learning_rate": 1.2014364026979862e-11,
"loss": 0.5336,
"step": 17980
},
{
"epoch": 0.9986130374479889,
"grad_norm": 0.33444827795028687,
"learning_rate": 3.708157579357385e-12,
"loss": 0.5269,
"step": 18000
},
{
"epoch": 0.9997226074895977,
"grad_norm": 0.423446387052536,
"learning_rate": 1.4832665518049737e-13,
"loss": 0.4982,
"step": 18020
},
{
"epoch": 1.0,
"step": 18025,
"total_flos": 4.409726465817354e+17,
"train_loss": 0.606607598028302,
"train_runtime": 18158.8522,
"train_samples_per_second": 0.993,
"train_steps_per_second": 0.993
}
],
"logging_steps": 20,
"max_steps": 18025,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.409726465817354e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}