{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 63788,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006270771932024832,
"grad_norm": 92.5528793334961,
"learning_rate": 3.041229032763756e-07,
"loss": 11.9622,
"step": 100
},
{
"epoch": 0.012541543864049664,
"grad_norm": 85.98809814453125,
"learning_rate": 6.176516695406804e-07,
"loss": 11.265,
"step": 200
},
{
"epoch": 0.018812315796074498,
"grad_norm": 82.95439910888672,
"learning_rate": 9.311804358049851e-07,
"loss": 10.5195,
"step": 300
},
{
"epoch": 0.025083087728099328,
"grad_norm": 74.84368896484375,
"learning_rate": 1.2447092020692899e-06,
"loss": 9.4744,
"step": 400
},
{
"epoch": 0.03135385966012416,
"grad_norm": 71.96393585205078,
"learning_rate": 1.5582379683335947e-06,
"loss": 8.4815,
"step": 500
},
{
"epoch": 0.03135385966012416,
"eval_loss": 8.621713638305664,
"eval_runtime": 232.9735,
"eval_samples_per_second": 547.582,
"eval_steps_per_second": 17.114,
"step": 500
},
{
"epoch": 0.037624631592148995,
"grad_norm": 82.08424377441406,
"learning_rate": 1.8717667345978996e-06,
"loss": 7.6105,
"step": 600
},
{
"epoch": 0.04389540352417383,
"grad_norm": 82.3794937133789,
"learning_rate": 2.1852955008622044e-06,
"loss": 6.8023,
"step": 700
},
{
"epoch": 0.050166175456198656,
"grad_norm": 42.44660186767578,
"learning_rate": 2.4988242671265088e-06,
"loss": 6.1258,
"step": 800
},
{
"epoch": 0.05643694738822349,
"grad_norm": 42.748497009277344,
"learning_rate": 2.812353033390814e-06,
"loss": 5.5032,
"step": 900
},
{
"epoch": 0.06270771932024832,
"grad_norm": 49.40472412109375,
"learning_rate": 3.1258817996551187e-06,
"loss": 5.0397,
"step": 1000
},
{
"epoch": 0.06270771932024832,
"eval_loss": 5.194935321807861,
"eval_runtime": 248.6107,
"eval_samples_per_second": 513.14,
"eval_steps_per_second": 16.037,
"step": 1000
},
{
"epoch": 0.06897849125227315,
"grad_norm": 54.32060623168945,
"learning_rate": 3.439410565919423e-06,
"loss": 4.6909,
"step": 1100
},
{
"epoch": 0.07524926318429799,
"grad_norm": 50.955474853515625,
"learning_rate": 3.7529393321837283e-06,
"loss": 4.5716,
"step": 1200
},
{
"epoch": 0.08152003511632282,
"grad_norm": 42.99276351928711,
"learning_rate": 4.066468098448033e-06,
"loss": 4.3983,
"step": 1300
},
{
"epoch": 0.08779080704834766,
"grad_norm": 56.20285415649414,
"learning_rate": 4.379996864712338e-06,
"loss": 4.2073,
"step": 1400
},
{
"epoch": 0.09406157898037248,
"grad_norm": 47.72187805175781,
"learning_rate": 4.693525630976643e-06,
"loss": 4.2164,
"step": 1500
},
{
"epoch": 0.09406157898037248,
"eval_loss": 4.142153263092041,
"eval_runtime": 255.6642,
"eval_samples_per_second": 498.983,
"eval_steps_per_second": 15.595,
"step": 1500
},
{
"epoch": 0.10033235091239731,
"grad_norm": 53.83956527709961,
"learning_rate": 5.0070543972409465e-06,
"loss": 4.0921,
"step": 1600
},
{
"epoch": 0.10660312284442215,
"grad_norm": 35.98961639404297,
"learning_rate": 5.320583163505252e-06,
"loss": 4.1785,
"step": 1700
},
{
"epoch": 0.11287389477644698,
"grad_norm": 90.68247985839844,
"learning_rate": 5.634111929769557e-06,
"loss": 4.0503,
"step": 1800
},
{
"epoch": 0.11914466670847182,
"grad_norm": 64.07307434082031,
"learning_rate": 5.947640696033862e-06,
"loss": 3.8969,
"step": 1900
},
{
"epoch": 0.12541543864049665,
"grad_norm": 58.8675537109375,
"learning_rate": 6.2611694622981665e-06,
"loss": 3.8538,
"step": 2000
},
{
"epoch": 0.12541543864049665,
"eval_loss": 3.910861015319824,
"eval_runtime": 257.3903,
"eval_samples_per_second": 495.636,
"eval_steps_per_second": 15.49,
"step": 2000
},
{
"epoch": 0.13168621057252147,
"grad_norm": 58.32042694091797,
"learning_rate": 6.574698228562471e-06,
"loss": 3.872,
"step": 2100
},
{
"epoch": 0.1379569825045463,
"grad_norm": 38.084102630615234,
"learning_rate": 6.885091707164133e-06,
"loss": 3.851,
"step": 2200
},
{
"epoch": 0.14422775443657115,
"grad_norm": 62.679237365722656,
"learning_rate": 7.198620473428438e-06,
"loss": 3.6301,
"step": 2300
},
{
"epoch": 0.15049852636859598,
"grad_norm": 60.0799446105957,
"learning_rate": 7.512149239692742e-06,
"loss": 3.5202,
"step": 2400
},
{
"epoch": 0.1567692983006208,
"grad_norm": 30.11835289001465,
"learning_rate": 7.825678005957047e-06,
"loss": 3.6759,
"step": 2500
},
{
"epoch": 0.1567692983006208,
"eval_loss": 3.638855218887329,
"eval_runtime": 259.3655,
"eval_samples_per_second": 491.862,
"eval_steps_per_second": 15.372,
"step": 2500
},
{
"epoch": 0.16304007023264563,
"grad_norm": 72.76881408691406,
"learning_rate": 8.139206772221352e-06,
"loss": 3.4106,
"step": 2600
},
{
"epoch": 0.16931084216467046,
"grad_norm": 89.71743774414062,
"learning_rate": 8.452735538485657e-06,
"loss": 3.69,
"step": 2700
},
{
"epoch": 0.17558161409669532,
"grad_norm": 81.42703247070312,
"learning_rate": 8.766264304749962e-06,
"loss": 3.6336,
"step": 2800
},
{
"epoch": 0.18185238602872014,
"grad_norm": 84.91463470458984,
"learning_rate": 9.079793071014266e-06,
"loss": 3.4715,
"step": 2900
},
{
"epoch": 0.18812315796074497,
"grad_norm": 50.145713806152344,
"learning_rate": 9.393321837278571e-06,
"loss": 3.2166,
"step": 3000
},
{
"epoch": 0.18812315796074497,
"eval_loss": 3.273871898651123,
"eval_runtime": 257.8101,
"eval_samples_per_second": 494.829,
"eval_steps_per_second": 15.465,
"step": 3000
},
{
"epoch": 0.1943939298927698,
"grad_norm": 62.13496780395508,
"learning_rate": 9.706850603542876e-06,
"loss": 3.3844,
"step": 3100
},
{
"epoch": 0.20066470182479462,
"grad_norm": 74.26377868652344,
"learning_rate": 1.002037936980718e-05,
"loss": 3.4449,
"step": 3200
},
{
"epoch": 0.20693547375681948,
"grad_norm": 112.08758544921875,
"learning_rate": 1.0333908136071484e-05,
"loss": 3.0811,
"step": 3300
},
{
"epoch": 0.2132062456888443,
"grad_norm": 40.37177276611328,
"learning_rate": 1.0644301614673146e-05,
"loss": 3.2777,
"step": 3400
},
{
"epoch": 0.21947701762086913,
"grad_norm": 65.6463623046875,
"learning_rate": 1.0957830380937451e-05,
"loss": 2.9505,
"step": 3500
},
{
"epoch": 0.21947701762086913,
"eval_loss": 3.0864851474761963,
"eval_runtime": 252.3303,
"eval_samples_per_second": 505.575,
"eval_steps_per_second": 15.801,
"step": 3500
},
{
"epoch": 0.22574778955289396,
"grad_norm": 103.7303466796875,
"learning_rate": 1.1271359147201758e-05,
"loss": 3.1534,
"step": 3600
},
{
"epoch": 0.23201856148491878,
"grad_norm": 145.92767333984375,
"learning_rate": 1.158488791346606e-05,
"loss": 2.9669,
"step": 3700
},
{
"epoch": 0.23828933341694364,
"grad_norm": 78.69353485107422,
"learning_rate": 1.1898416679730367e-05,
"loss": 2.9416,
"step": 3800
},
{
"epoch": 0.24456010534896847,
"grad_norm": 55.99378204345703,
"learning_rate": 1.221194544599467e-05,
"loss": 2.9637,
"step": 3900
},
{
"epoch": 0.2508308772809933,
"grad_norm": 104.32599639892578,
"learning_rate": 1.2525474212258977e-05,
"loss": 2.9322,
"step": 4000
},
{
"epoch": 0.2508308772809933,
"eval_loss": 2.844682455062866,
"eval_runtime": 252.142,
"eval_samples_per_second": 505.953,
"eval_steps_per_second": 15.813,
"step": 4000
},
{
"epoch": 0.25710164921301815,
"grad_norm": 68.12931823730469,
"learning_rate": 1.283900297852328e-05,
"loss": 2.6926,
"step": 4100
},
{
"epoch": 0.26337242114504295,
"grad_norm": 208.65591430664062,
"learning_rate": 1.3152531744787585e-05,
"loss": 2.9353,
"step": 4200
},
{
"epoch": 0.2696431930770678,
"grad_norm": 91.44706726074219,
"learning_rate": 1.3466060511051891e-05,
"loss": 2.635,
"step": 4300
},
{
"epoch": 0.2759139650090926,
"grad_norm": 92.03852081298828,
"learning_rate": 1.3779589277316194e-05,
"loss": 2.5692,
"step": 4400
},
{
"epoch": 0.28218473694111745,
"grad_norm": 138.34088134765625,
"learning_rate": 1.4089982755917857e-05,
"loss": 3.0283,
"step": 4500
},
{
"epoch": 0.28218473694111745,
"eval_loss": 2.9033422470092773,
"eval_runtime": 248.9921,
"eval_samples_per_second": 512.354,
"eval_steps_per_second": 16.013,
"step": 4500
},
{
"epoch": 0.2884555088731423,
"grad_norm": 57.657936096191406,
"learning_rate": 1.4403511522182162e-05,
"loss": 2.5804,
"step": 4600
},
{
"epoch": 0.2947262808051671,
"grad_norm": 52.86611557006836,
"learning_rate": 1.4717040288446466e-05,
"loss": 3.1374,
"step": 4700
},
{
"epoch": 0.30099705273719196,
"grad_norm": 80.26192474365234,
"learning_rate": 1.5030569054710771e-05,
"loss": 2.8479,
"step": 4800
},
{
"epoch": 0.30726782466921676,
"grad_norm": 7.4570465087890625,
"learning_rate": 1.5344097820975076e-05,
"loss": 2.6809,
"step": 4900
},
{
"epoch": 0.3135385966012416,
"grad_norm": 4.543123722076416,
"learning_rate": 1.5657626587239382e-05,
"loss": 2.8267,
"step": 5000
},
{
"epoch": 0.3135385966012416,
"eval_loss": 2.6946306228637695,
"eval_runtime": 249.1581,
"eval_samples_per_second": 512.012,
"eval_steps_per_second": 16.002,
"step": 5000
},
{
"epoch": 0.31980936853326647,
"grad_norm": 29.848108291625977,
"learning_rate": 1.5971155353503685e-05,
"loss": 2.7341,
"step": 5100
},
{
"epoch": 0.32608014046529127,
"grad_norm": 139.14234924316406,
"learning_rate": 1.6284684119767992e-05,
"loss": 2.8157,
"step": 5200
},
{
"epoch": 0.3323509123973161,
"grad_norm": 65.15583038330078,
"learning_rate": 1.659507759836965e-05,
"loss": 2.5867,
"step": 5300
},
{
"epoch": 0.3386216843293409,
"grad_norm": 76.32029724121094,
"learning_rate": 1.6908606364633958e-05,
"loss": 2.8622,
"step": 5400
},
{
"epoch": 0.3448924562613658,
"grad_norm": 85.2268295288086,
"learning_rate": 1.722213513089826e-05,
"loss": 2.9063,
"step": 5500
},
{
"epoch": 0.3448924562613658,
"eval_loss": 2.6115071773529053,
"eval_runtime": 249.5988,
"eval_samples_per_second": 511.108,
"eval_steps_per_second": 15.974,
"step": 5500
},
{
"epoch": 0.35116322819339063,
"grad_norm": 21.968101501464844,
"learning_rate": 1.7535663897162567e-05,
"loss": 2.1514,
"step": 5600
},
{
"epoch": 0.35743400012541543,
"grad_norm": 159.9650421142578,
"learning_rate": 1.784919266342687e-05,
"loss": 2.3755,
"step": 5700
},
{
"epoch": 0.3637047720574403,
"grad_norm": 53.702919006347656,
"learning_rate": 1.8162721429691173e-05,
"loss": 2.5055,
"step": 5800
},
{
"epoch": 0.3699755439894651,
"grad_norm": 16.580570220947266,
"learning_rate": 1.847625019595548e-05,
"loss": 3.3237,
"step": 5900
},
{
"epoch": 0.37624631592148994,
"grad_norm": 77.9209213256836,
"learning_rate": 1.8789778962219786e-05,
"loss": 2.561,
"step": 6000
},
{
"epoch": 0.37624631592148994,
"eval_loss": 2.7512075901031494,
"eval_runtime": 252.3158,
"eval_samples_per_second": 505.605,
"eval_steps_per_second": 15.802,
"step": 6000
},
{
"epoch": 0.3825170878535148,
"grad_norm": 181.68307495117188,
"learning_rate": 1.910330772848409e-05,
"loss": 2.4351,
"step": 6100
},
{
"epoch": 0.3887878597855396,
"grad_norm": 405.9890441894531,
"learning_rate": 1.9416836494748396e-05,
"loss": 2.8472,
"step": 6200
},
{
"epoch": 0.39505863171756445,
"grad_norm": 65.42109680175781,
"learning_rate": 1.9730365261012702e-05,
"loss": 2.76,
"step": 6300
},
{
"epoch": 0.40132940364958924,
"grad_norm": 1.5880606174468994,
"learning_rate": 1.999512271595046e-05,
"loss": 2.1947,
"step": 6400
},
{
"epoch": 0.4076001755816141,
"grad_norm": 147.59877014160156,
"learning_rate": 1.9960284972739466e-05,
"loss": 2.6409,
"step": 6500
},
{
"epoch": 0.4076001755816141,
"eval_loss": 2.536679267883301,
"eval_runtime": 247.9993,
"eval_samples_per_second": 514.405,
"eval_steps_per_second": 16.077,
"step": 6500
},
{
"epoch": 0.41387094751363895,
"grad_norm": 147.17579650878906,
"learning_rate": 1.9925447229528472e-05,
"loss": 2.7262,
"step": 6600
},
{
"epoch": 0.42014171944566375,
"grad_norm": 42.11772155761719,
"learning_rate": 1.989060948631748e-05,
"loss": 2.7781,
"step": 6700
},
{
"epoch": 0.4264124913776886,
"grad_norm": 194.7137451171875,
"learning_rate": 1.985577174310648e-05,
"loss": 2.4718,
"step": 6800
},
{
"epoch": 0.4326832633097134,
"grad_norm": 63.2336540222168,
"learning_rate": 1.9820933999895488e-05,
"loss": 2.567,
"step": 6900
},
{
"epoch": 0.43895403524173826,
"grad_norm": 122.4419174194336,
"learning_rate": 1.9786096256684494e-05,
"loss": 2.4215,
"step": 7000
},
{
"epoch": 0.43895403524173826,
"eval_loss": 2.340890407562256,
"eval_runtime": 249.0613,
"eval_samples_per_second": 512.211,
"eval_steps_per_second": 16.008,
"step": 7000
},
{
"epoch": 0.4452248071737631,
"grad_norm": 173.85031127929688,
"learning_rate": 1.97512585134735e-05,
"loss": 1.9308,
"step": 7100
},
{
"epoch": 0.4514955791057879,
"grad_norm": 200.34971618652344,
"learning_rate": 1.9716420770262504e-05,
"loss": 2.1232,
"step": 7200
},
{
"epoch": 0.45776635103781277,
"grad_norm": 208.45030212402344,
"learning_rate": 1.968158302705151e-05,
"loss": 2.421,
"step": 7300
},
{
"epoch": 0.46403712296983757,
"grad_norm": 148.36253356933594,
"learning_rate": 1.9646745283840513e-05,
"loss": 2.3232,
"step": 7400
},
{
"epoch": 0.4703078949018624,
"grad_norm": 24.392248153686523,
"learning_rate": 1.961190754062952e-05,
"loss": 2.8543,
"step": 7500
},
{
"epoch": 0.4703078949018624,
"eval_loss": 2.3705639839172363,
"eval_runtime": 247.4495,
"eval_samples_per_second": 515.548,
"eval_steps_per_second": 16.112,
"step": 7500
},
{
"epoch": 0.4765786668338873,
"grad_norm": 217.60328674316406,
"learning_rate": 1.9577069797418526e-05,
"loss": 2.4276,
"step": 7600
},
{
"epoch": 0.4828494387659121,
"grad_norm": 91.55315399169922,
"learning_rate": 1.9542232054207532e-05,
"loss": 2.4507,
"step": 7700
},
{
"epoch": 0.48912021069793693,
"grad_norm": 246.22488403320312,
"learning_rate": 1.9507394310996535e-05,
"loss": 2.1963,
"step": 7800
},
{
"epoch": 0.49539098262996173,
"grad_norm": 76.7205810546875,
"learning_rate": 1.947255656778554e-05,
"loss": 2.4247,
"step": 7900
},
{
"epoch": 0.5016617545619866,
"grad_norm": 0.794611394405365,
"learning_rate": 1.9437718824574544e-05,
"loss": 2.1948,
"step": 8000
},
{
"epoch": 0.5016617545619866,
"eval_loss": 2.5728752613067627,
"eval_runtime": 248.1179,
"eval_samples_per_second": 514.159,
"eval_steps_per_second": 16.069,
"step": 8000
},
{
"epoch": 0.5079325264940114,
"grad_norm": 75.7978744506836,
"learning_rate": 1.940288108136355e-05,
"loss": 2.4069,
"step": 8100
},
{
"epoch": 0.5142032984260363,
"grad_norm": 81.46521759033203,
"learning_rate": 1.9368043338152557e-05,
"loss": 2.4328,
"step": 8200
},
{
"epoch": 0.520474070358061,
"grad_norm": 153.08226013183594,
"learning_rate": 1.9333205594941563e-05,
"loss": 2.2198,
"step": 8300
},
{
"epoch": 0.5267448422900859,
"grad_norm": 1.353060245513916,
"learning_rate": 1.9298367851730566e-05,
"loss": 2.1746,
"step": 8400
},
{
"epoch": 0.5330156142221107,
"grad_norm": 298.2365417480469,
"learning_rate": 1.9263878485951682e-05,
"loss": 2.2618,
"step": 8500
},
{
"epoch": 0.5330156142221107,
"eval_loss": 2.345949411392212,
"eval_runtime": 249.317,
"eval_samples_per_second": 511.686,
"eval_steps_per_second": 15.992,
"step": 8500
},
{
"epoch": 0.5392863861541356,
"grad_norm": 409.5244140625,
"learning_rate": 1.922904074274069e-05,
"loss": 2.3909,
"step": 8600
},
{
"epoch": 0.5455571580861605,
"grad_norm": 0.6597223877906799,
"learning_rate": 1.919420299952969e-05,
"loss": 2.035,
"step": 8700
},
{
"epoch": 0.5518279300181852,
"grad_norm": 365.05914306640625,
"learning_rate": 1.9159365256318698e-05,
"loss": 2.2626,
"step": 8800
},
{
"epoch": 0.55809870195021,
"grad_norm": 103.37579345703125,
"learning_rate": 1.91245275131077e-05,
"loss": 2.1541,
"step": 8900
},
{
"epoch": 0.5643694738822349,
"grad_norm": 4.599234104156494,
"learning_rate": 1.9089689769896707e-05,
"loss": 1.9424,
"step": 9000
},
{
"epoch": 0.5643694738822349,
"eval_loss": 2.1624536514282227,
"eval_runtime": 248.0391,
"eval_samples_per_second": 514.322,
"eval_steps_per_second": 16.074,
"step": 9000
},
{
"epoch": 0.5706402458142598,
"grad_norm": 0.6885708570480347,
"learning_rate": 1.9054852026685714e-05,
"loss": 2.5152,
"step": 9100
},
{
"epoch": 0.5769110177462846,
"grad_norm": 103.164794921875,
"learning_rate": 1.9020014283474716e-05,
"loss": 2.0462,
"step": 9200
},
{
"epoch": 0.5831817896783094,
"grad_norm": 0.7507800459861755,
"learning_rate": 1.8985176540263723e-05,
"loss": 1.6124,
"step": 9300
},
{
"epoch": 0.5894525616103342,
"grad_norm": 38.5381965637207,
"learning_rate": 1.895033879705273e-05,
"loss": 2.2236,
"step": 9400
},
{
"epoch": 0.5957233335423591,
"grad_norm": 374.18011474609375,
"learning_rate": 1.8915501053841735e-05,
"loss": 2.4706,
"step": 9500
},
{
"epoch": 0.5957233335423591,
"eval_loss": 2.0568950176239014,
"eval_runtime": 250.2023,
"eval_samples_per_second": 509.875,
"eval_steps_per_second": 15.935,
"step": 9500
},
{
"epoch": 0.6019941054743839,
"grad_norm": 115.00419616699219,
"learning_rate": 1.888066331063074e-05,
"loss": 2.4612,
"step": 9600
},
{
"epoch": 0.6082648774064088,
"grad_norm": 302.7066955566406,
"learning_rate": 1.8845825567419745e-05,
"loss": 2.2784,
"step": 9700
},
{
"epoch": 0.6145356493384335,
"grad_norm": 0.18385061621665955,
"learning_rate": 1.8810987824208748e-05,
"loss": 1.9335,
"step": 9800
},
{
"epoch": 0.6208064212704584,
"grad_norm": 9.742902755737305,
"learning_rate": 1.8776150080997754e-05,
"loss": 2.3779,
"step": 9900
},
{
"epoch": 0.6270771932024832,
"grad_norm": 12.202372550964355,
"learning_rate": 1.874131233778676e-05,
"loss": 1.6778,
"step": 10000
},
{
"epoch": 0.6270771932024832,
"eval_loss": 2.112342596054077,
"eval_runtime": 247.5759,
"eval_samples_per_second": 515.284,
"eval_steps_per_second": 16.104,
"step": 10000
},
{
"epoch": 0.6333479651345081,
"grad_norm": 47.51719284057617,
"learning_rate": 1.8706474594575767e-05,
"loss": 2.4721,
"step": 10100
},
{
"epoch": 0.6396187370665329,
"grad_norm": 330.02703857421875,
"learning_rate": 1.867163685136477e-05,
"loss": 1.7822,
"step": 10200
},
{
"epoch": 0.6458895089985577,
"grad_norm": 110.14346313476562,
"learning_rate": 1.8636799108153776e-05,
"loss": 2.077,
"step": 10300
},
{
"epoch": 0.6521602809305825,
"grad_norm": 28.561458587646484,
"learning_rate": 1.860196136494278e-05,
"loss": 1.9223,
"step": 10400
},
{
"epoch": 0.6584310528626074,
"grad_norm": 14.915325164794922,
"learning_rate": 1.8567123621731785e-05,
"loss": 2.3513,
"step": 10500
},
{
"epoch": 0.6584310528626074,
"eval_loss": 1.8402663469314575,
"eval_runtime": 246.8028,
"eval_samples_per_second": 516.899,
"eval_steps_per_second": 16.155,
"step": 10500
},
{
"epoch": 0.6647018247946322,
"grad_norm": 142.3553009033203,
"learning_rate": 1.853228587852079e-05,
"loss": 2.1387,
"step": 10600
},
{
"epoch": 0.6709725967266571,
"grad_norm": 2.4230360984802246,
"learning_rate": 1.8497448135309798e-05,
"loss": 2.1853,
"step": 10700
},
{
"epoch": 0.6772433686586818,
"grad_norm": 85.05690002441406,
"learning_rate": 1.84626103920988e-05,
"loss": 1.8715,
"step": 10800
},
{
"epoch": 0.6835141405907067,
"grad_norm": 88.9746322631836,
"learning_rate": 1.8427772648887807e-05,
"loss": 1.8581,
"step": 10900
},
{
"epoch": 0.6897849125227316,
"grad_norm": 191.67779541015625,
"learning_rate": 1.839293490567681e-05,
"loss": 2.0076,
"step": 11000
},
{
"epoch": 0.6897849125227316,
"eval_loss": 2.00632643699646,
"eval_runtime": 246.7856,
"eval_samples_per_second": 516.934,
"eval_steps_per_second": 16.156,
"step": 11000
},
{
"epoch": 0.6960556844547564,
"grad_norm": 157.76986694335938,
"learning_rate": 1.8358097162465817e-05,
"loss": 2.3144,
"step": 11100
},
{
"epoch": 0.7023264563867813,
"grad_norm": 52.53676223754883,
"learning_rate": 1.8323259419254823e-05,
"loss": 2.0942,
"step": 11200
},
{
"epoch": 0.708597228318806,
"grad_norm": 61.30582046508789,
"learning_rate": 1.828842167604383e-05,
"loss": 1.9117,
"step": 11300
},
{
"epoch": 0.7148680002508309,
"grad_norm": 146.37437438964844,
"learning_rate": 1.8253583932832832e-05,
"loss": 2.2214,
"step": 11400
},
{
"epoch": 0.7211387721828557,
"grad_norm": 214.81398010253906,
"learning_rate": 1.821874618962184e-05,
"loss": 1.9678,
"step": 11500
},
{
"epoch": 0.7211387721828557,
"eval_loss": 1.9028793573379517,
"eval_runtime": 244.7222,
"eval_samples_per_second": 521.293,
"eval_steps_per_second": 16.292,
"step": 11500
},
{
"epoch": 0.7274095441148806,
"grad_norm": 5.435591220855713,
"learning_rate": 1.818390844641084e-05,
"loss": 1.7459,
"step": 11600
},
{
"epoch": 0.7336803160469054,
"grad_norm": 107.97034454345703,
"learning_rate": 1.8149070703199848e-05,
"loss": 2.0616,
"step": 11700
},
{
"epoch": 0.7399510879789302,
"grad_norm": 63.21007537841797,
"learning_rate": 1.8114232959988854e-05,
"loss": 1.6169,
"step": 11800
},
{
"epoch": 0.746221859910955,
"grad_norm": 113.56210327148438,
"learning_rate": 1.8079743594209967e-05,
"loss": 1.5674,
"step": 11900
},
{
"epoch": 0.7524926318429799,
"grad_norm": 107.1183090209961,
"learning_rate": 1.8044905850998973e-05,
"loss": 1.4956,
"step": 12000
},
{
"epoch": 0.7524926318429799,
"eval_loss": 1.8266816139221191,
"eval_runtime": 244.3373,
"eval_samples_per_second": 522.114,
"eval_steps_per_second": 16.318,
"step": 12000
},
{
"epoch": 0.7587634037750047,
"grad_norm": 151.79904174804688,
"learning_rate": 1.801006810778798e-05,
"loss": 2.3816,
"step": 12100
},
{
"epoch": 0.7650341757070296,
"grad_norm": 323.1309814453125,
"learning_rate": 1.7975230364576983e-05,
"loss": 2.2387,
"step": 12200
},
{
"epoch": 0.7713049476390543,
"grad_norm": 4.0979743003845215,
"learning_rate": 1.794039262136599e-05,
"loss": 1.4625,
"step": 12300
},
{
"epoch": 0.7775757195710792,
"grad_norm": 126.16666412353516,
"learning_rate": 1.7905554878154995e-05,
"loss": 2.028,
"step": 12400
},
{
"epoch": 0.783846491503104,
"grad_norm": 42.80760955810547,
"learning_rate": 1.7870717134944e-05,
"loss": 2.151,
"step": 12500
},
{
"epoch": 0.783846491503104,
"eval_loss": 1.7581337690353394,
"eval_runtime": 244.6209,
"eval_samples_per_second": 521.509,
"eval_steps_per_second": 16.299,
"step": 12500
},
{
"epoch": 0.7901172634351289,
"grad_norm": 0.3076690435409546,
"learning_rate": 1.7835879391733005e-05,
"loss": 1.6896,
"step": 12600
},
{
"epoch": 0.7963880353671537,
"grad_norm": 1.4938758611679077,
"learning_rate": 1.780104164852201e-05,
"loss": 1.8526,
"step": 12700
},
{
"epoch": 0.8026588072991785,
"grad_norm": 208.20004272460938,
"learning_rate": 1.7766203905311014e-05,
"loss": 1.9745,
"step": 12800
},
{
"epoch": 0.8089295792312033,
"grad_norm": 14.515748023986816,
"learning_rate": 1.773136616210002e-05,
"loss": 2.1042,
"step": 12900
},
{
"epoch": 0.8152003511632282,
"grad_norm": 170.497314453125,
"learning_rate": 1.7696528418889027e-05,
"loss": 1.83,
"step": 13000
},
{
"epoch": 0.8152003511632282,
"eval_loss": 1.5666632652282715,
"eval_runtime": 244.9634,
"eval_samples_per_second": 520.78,
"eval_steps_per_second": 16.276,
"step": 13000
},
{
"epoch": 0.821471123095253,
"grad_norm": 37.14794158935547,
"learning_rate": 1.7661690675678033e-05,
"loss": 1.7451,
"step": 13100
},
{
"epoch": 0.8277418950272779,
"grad_norm": 97.6008529663086,
"learning_rate": 1.7626852932467036e-05,
"loss": 1.568,
"step": 13200
},
{
"epoch": 0.8340126669593027,
"grad_norm": 1.4752888679504395,
"learning_rate": 1.7592015189256042e-05,
"loss": 1.4432,
"step": 13300
},
{
"epoch": 0.8402834388913275,
"grad_norm": 100.85454559326172,
"learning_rate": 1.7557177446045045e-05,
"loss": 1.9172,
"step": 13400
},
{
"epoch": 0.8465542108233524,
"grad_norm": 169.63970947265625,
"learning_rate": 1.752233970283405e-05,
"loss": 1.9438,
"step": 13500
},
{
"epoch": 0.8465542108233524,
"eval_loss": 1.6055145263671875,
"eval_runtime": 239.684,
"eval_samples_per_second": 532.251,
"eval_steps_per_second": 16.634,
"step": 13500
},
{
"epoch": 0.8528249827553772,
"grad_norm": 145.4659881591797,
"learning_rate": 1.7487501959623058e-05,
"loss": 1.6488,
"step": 13600
},
{
"epoch": 0.8590957546874021,
"grad_norm": 9.112565994262695,
"learning_rate": 1.7452664216412064e-05,
"loss": 1.8166,
"step": 13700
},
{
"epoch": 0.8653665266194268,
"grad_norm": 122.40379333496094,
"learning_rate": 1.7417826473201067e-05,
"loss": 1.5929,
"step": 13800
},
{
"epoch": 0.8716372985514517,
"grad_norm": 1.4977953433990479,
"learning_rate": 1.7382988729990073e-05,
"loss": 1.2476,
"step": 13900
},
{
"epoch": 0.8779080704834765,
"grad_norm": 264.9580078125,
"learning_rate": 1.7348150986779076e-05,
"loss": 1.5236,
"step": 14000
},
{
"epoch": 0.8779080704834765,
"eval_loss": 1.8921126127243042,
"eval_runtime": 242.2787,
"eval_samples_per_second": 526.551,
"eval_steps_per_second": 16.456,
"step": 14000
},
{
"epoch": 0.8841788424155014,
"grad_norm": 0.0032478359062224627,
"learning_rate": 1.7313661621000193e-05,
"loss": 1.6538,
"step": 14100
},
{
"epoch": 0.8904496143475262,
"grad_norm": 169.41224670410156,
"learning_rate": 1.72788238777892e-05,
"loss": 1.8689,
"step": 14200
},
{
"epoch": 0.896720386279551,
"grad_norm": 91.79679107666016,
"learning_rate": 1.7243986134578202e-05,
"loss": 1.0831,
"step": 14300
},
{
"epoch": 0.9029911582115758,
"grad_norm": 1.378010869026184,
"learning_rate": 1.7209148391367208e-05,
"loss": 1.7765,
"step": 14400
},
{
"epoch": 0.9092619301436007,
"grad_norm": 86.2571792602539,
"learning_rate": 1.7174310648156215e-05,
"loss": 1.3548,
"step": 14500
},
{
"epoch": 0.9092619301436007,
"eval_loss": 1.668320894241333,
"eval_runtime": 242.597,
"eval_samples_per_second": 525.86,
"eval_steps_per_second": 16.435,
"step": 14500
},
{
"epoch": 0.9155327020756255,
"grad_norm": 42.63466262817383,
"learning_rate": 1.7139472904945218e-05,
"loss": 1.7792,
"step": 14600
},
{
"epoch": 0.9218034740076504,
"grad_norm": 31.874799728393555,
"learning_rate": 1.7104635161734224e-05,
"loss": 1.73,
"step": 14700
},
{
"epoch": 0.9280742459396751,
"grad_norm": 288.0302734375,
"learning_rate": 1.7069797418523227e-05,
"loss": 1.5979,
"step": 14800
},
{
"epoch": 0.9343450178717,
"grad_norm": 76.91877746582031,
"learning_rate": 1.7034959675312233e-05,
"loss": 1.3678,
"step": 14900
},
{
"epoch": 0.9406157898037248,
"grad_norm": 153.2476348876953,
"learning_rate": 1.700012193210124e-05,
"loss": 2.0664,
"step": 15000
},
{
"epoch": 0.9406157898037248,
"eval_loss": 1.5160768032073975,
"eval_runtime": 241.7632,
"eval_samples_per_second": 527.673,
"eval_steps_per_second": 16.491,
"step": 15000
},
{
"epoch": 0.9468865617357497,
"grad_norm": 204.87367248535156,
"learning_rate": 1.6965284188890246e-05,
"loss": 1.4472,
"step": 15100
},
{
"epoch": 0.9531573336677746,
"grad_norm": 107.19727325439453,
"learning_rate": 1.693044644567925e-05,
"loss": 1.447,
"step": 15200
},
{
"epoch": 0.9594281055997993,
"grad_norm": 0.9635588526725769,
"learning_rate": 1.6895608702468255e-05,
"loss": 1.7261,
"step": 15300
},
{
"epoch": 0.9656988775318242,
"grad_norm": 21.72879981994629,
"learning_rate": 1.686077095925726e-05,
"loss": 1.4881,
"step": 15400
},
{
"epoch": 0.971969649463849,
"grad_norm": 3.110539197921753,
"learning_rate": 1.6825933216046268e-05,
"loss": 1.313,
"step": 15500
},
{
"epoch": 0.971969649463849,
"eval_loss": 1.6226599216461182,
"eval_runtime": 241.336,
"eval_samples_per_second": 528.607,
"eval_steps_per_second": 16.521,
"step": 15500
},
{
"epoch": 0.9782404213958739,
"grad_norm": 4.804477691650391,
"learning_rate": 1.679109547283527e-05,
"loss": 1.4587,
"step": 15600
},
{
"epoch": 0.9845111933278987,
"grad_norm": 159.54579162597656,
"learning_rate": 1.6756257729624277e-05,
"loss": 2.0982,
"step": 15700
},
{
"epoch": 0.9907819652599235,
"grad_norm": 0.04496179521083832,
"learning_rate": 1.672141998641328e-05,
"loss": 1.4854,
"step": 15800
},
{
"epoch": 0.9970527371919483,
"grad_norm": 178.064453125,
"learning_rate": 1.6686582243202286e-05,
"loss": 1.343,
"step": 15900
},
{
"epoch": 1.0033235091239732,
"grad_norm": 60.21414566040039,
"learning_rate": 1.6651744499991293e-05,
"loss": 1.1795,
"step": 16000
},
{
"epoch": 1.0033235091239732,
"eval_loss": 1.5639160871505737,
"eval_runtime": 239.9545,
"eval_samples_per_second": 531.651,
"eval_steps_per_second": 16.616,
"step": 16000
},
{
"epoch": 1.009594281055998,
"grad_norm": 28.01744842529297,
"learning_rate": 1.66169067567803e-05,
"loss": 1.4001,
"step": 16100
},
{
"epoch": 1.0158650529880229,
"grad_norm": 0.9447069764137268,
"learning_rate": 1.6582069013569302e-05,
"loss": 1.3867,
"step": 16200
},
{
"epoch": 1.0221358249200476,
"grad_norm": 271.91583251953125,
"learning_rate": 1.654723127035831e-05,
"loss": 1.5191,
"step": 16300
},
{
"epoch": 1.0284065968520726,
"grad_norm": 50.53108596801758,
"learning_rate": 1.651239352714731e-05,
"loss": 1.4693,
"step": 16400
},
{
"epoch": 1.0346773687840973,
"grad_norm": 37.87648010253906,
"learning_rate": 1.6477555783936318e-05,
"loss": 1.628,
"step": 16500
},
{
"epoch": 1.0346773687840973,
"eval_loss": 1.4715627431869507,
"eval_runtime": 243.7121,
"eval_samples_per_second": 523.454,
"eval_steps_per_second": 16.359,
"step": 16500
},
{
"epoch": 1.040948140716122,
"grad_norm": 0.5571967363357544,
"learning_rate": 1.6442718040725324e-05,
"loss": 1.0041,
"step": 16600
},
{
"epoch": 1.047218912648147,
"grad_norm": 410.87158203125,
"learning_rate": 1.6408228674946437e-05,
"loss": 1.7728,
"step": 16700
},
{
"epoch": 1.0534896845801718,
"grad_norm": 0.04839416220784187,
"learning_rate": 1.6373390931735443e-05,
"loss": 1.5586,
"step": 16800
},
{
"epoch": 1.0597604565121967,
"grad_norm": 15.377680778503418,
"learning_rate": 1.633855318852445e-05,
"loss": 1.7229,
"step": 16900
},
{
"epoch": 1.0660312284442215,
"grad_norm": 156.4866943359375,
"learning_rate": 1.6304063822745562e-05,
"loss": 1.5556,
"step": 17000
},
{
"epoch": 1.0660312284442215,
"eval_loss": 1.467575192451477,
"eval_runtime": 244.9859,
"eval_samples_per_second": 520.732,
"eval_steps_per_second": 16.274,
"step": 17000
},
{
"epoch": 1.0723020003762462,
"grad_norm": 0.4987052083015442,
"learning_rate": 1.626922607953457e-05,
"loss": 1.2529,
"step": 17100
},
{
"epoch": 1.0785727723082712,
"grad_norm": 0.12283490598201752,
"learning_rate": 1.6234388336323575e-05,
"loss": 1.4787,
"step": 17200
},
{
"epoch": 1.084843544240296,
"grad_norm": 0.2928747534751892,
"learning_rate": 1.6199550593112578e-05,
"loss": 1.1947,
"step": 17300
},
{
"epoch": 1.091114316172321,
"grad_norm": 0.06402698904275894,
"learning_rate": 1.6164712849901584e-05,
"loss": 1.3014,
"step": 17400
},
{
"epoch": 1.0973850881043457,
"grad_norm": 38.54865646362305,
"learning_rate": 1.612987510669059e-05,
"loss": 1.3743,
"step": 17500
},
{
"epoch": 1.0973850881043457,
"eval_loss": 1.4624249935150146,
"eval_runtime": 242.5321,
"eval_samples_per_second": 526.0,
"eval_steps_per_second": 16.439,
"step": 17500
},
{
"epoch": 1.1036558600363704,
"grad_norm": 4.770035266876221,
"learning_rate": 1.6095037363479597e-05,
"loss": 1.3397,
"step": 17600
},
{
"epoch": 1.1099266319683954,
"grad_norm": 73.70013427734375,
"learning_rate": 1.60601996202686e-05,
"loss": 1.3062,
"step": 17700
},
{
"epoch": 1.11619740390042,
"grad_norm": 0.7905834317207336,
"learning_rate": 1.6025361877057606e-05,
"loss": 1.3288,
"step": 17800
},
{
"epoch": 1.122468175832445,
"grad_norm": 85.46574401855469,
"learning_rate": 1.599052413384661e-05,
"loss": 2.0002,
"step": 17900
},
{
"epoch": 1.1287389477644698,
"grad_norm": 101.38238525390625,
"learning_rate": 1.5955686390635616e-05,
"loss": 2.0294,
"step": 18000
},
{
"epoch": 1.1287389477644698,
"eval_loss": 1.4184610843658447,
"eval_runtime": 243.202,
"eval_samples_per_second": 524.552,
"eval_steps_per_second": 16.394,
"step": 18000
},
{
"epoch": 1.1350097196964946,
"grad_norm": 177.34451293945312,
"learning_rate": 1.5920848647424622e-05,
"loss": 1.5053,
"step": 18100
},
{
"epoch": 1.1412804916285195,
"grad_norm": 0.12398409098386765,
"learning_rate": 1.5886010904213628e-05,
"loss": 1.3657,
"step": 18200
},
{
"epoch": 1.1475512635605443,
"grad_norm": 1.1212390661239624,
"learning_rate": 1.585117316100263e-05,
"loss": 1.3877,
"step": 18300
},
{
"epoch": 1.1538220354925692,
"grad_norm": 132.34060668945312,
"learning_rate": 1.5816335417791638e-05,
"loss": 1.9034,
"step": 18400
},
{
"epoch": 1.160092807424594,
"grad_norm": 8.030499458312988,
"learning_rate": 1.578149767458064e-05,
"loss": 1.4001,
"step": 18500
},
{
"epoch": 1.160092807424594,
"eval_loss": 1.3812620639801025,
"eval_runtime": 242.7932,
"eval_samples_per_second": 525.435,
"eval_steps_per_second": 16.421,
"step": 18500
},
{
"epoch": 1.1663635793566187,
"grad_norm": 100.8308334350586,
"learning_rate": 1.5746659931369647e-05,
"loss": 1.7503,
"step": 18600
},
{
"epoch": 1.1726343512886437,
"grad_norm": 57.332176208496094,
"learning_rate": 1.5711822188158653e-05,
"loss": 1.1482,
"step": 18700
},
{
"epoch": 1.1789051232206684,
"grad_norm": 0.38618066906929016,
"learning_rate": 1.567698444494766e-05,
"loss": 1.0958,
"step": 18800
},
{
"epoch": 1.1851758951526934,
"grad_norm": 29.31690216064453,
"learning_rate": 1.5642146701736662e-05,
"loss": 1.2657,
"step": 18900
},
{
"epoch": 1.1914466670847181,
"grad_norm": 98.16004180908203,
"learning_rate": 1.560730895852567e-05,
"loss": 1.3721,
"step": 19000
},
{
"epoch": 1.1914466670847181,
"eval_loss": 1.4701639413833618,
"eval_runtime": 242.7602,
"eval_samples_per_second": 525.506,
"eval_steps_per_second": 16.424,
"step": 19000
},
{
"epoch": 1.1977174390167429,
"grad_norm": 18.174930572509766,
"learning_rate": 1.5572471215314672e-05,
"loss": 1.2361,
"step": 19100
},
{
"epoch": 1.2039882109487678,
"grad_norm": 18.77554702758789,
"learning_rate": 1.5537633472103678e-05,
"loss": 1.003,
"step": 19200
},
{
"epoch": 1.2102589828807926,
"grad_norm": 105.063720703125,
"learning_rate": 1.5502795728892684e-05,
"loss": 1.3677,
"step": 19300
},
{
"epoch": 1.2165297548128176,
"grad_norm": 65.42724609375,
"learning_rate": 1.546795798568169e-05,
"loss": 1.668,
"step": 19400
},
{
"epoch": 1.2228005267448423,
"grad_norm": 57.190792083740234,
"learning_rate": 1.5433120242470694e-05,
"loss": 1.2026,
"step": 19500
},
{
"epoch": 1.2228005267448423,
"eval_loss": 1.3641443252563477,
"eval_runtime": 244.6626,
"eval_samples_per_second": 521.42,
"eval_steps_per_second": 16.296,
"step": 19500
},
{
"epoch": 1.229071298676867,
"grad_norm": 291.0449523925781,
"learning_rate": 1.53982824992597e-05,
"loss": 1.1754,
"step": 19600
},
{
"epoch": 1.235342070608892,
"grad_norm": 0.6484419703483582,
"learning_rate": 1.5363444756048703e-05,
"loss": 1.3196,
"step": 19700
},
{
"epoch": 1.2416128425409168,
"grad_norm": 10.18918514251709,
"learning_rate": 1.532860701283771e-05,
"loss": 1.4766,
"step": 19800
},
{
"epoch": 1.2478836144729417,
"grad_norm": 0.408495157957077,
"learning_rate": 1.5293769269626716e-05,
"loss": 1.389,
"step": 19900
},
{
"epoch": 1.2541543864049665,
"grad_norm": 1.5292593240737915,
"learning_rate": 1.525893152641572e-05,
"loss": 1.6974,
"step": 20000
},
{
"epoch": 1.2541543864049665,
"eval_loss": 1.3344130516052246,
"eval_runtime": 243.5732,
"eval_samples_per_second": 523.752,
"eval_steps_per_second": 16.369,
"step": 20000
},
{
"epoch": 1.2604251583369912,
"grad_norm": 0.13364413380622864,
"learning_rate": 1.5224093783204725e-05,
"loss": 1.5036,
"step": 20100
},
{
"epoch": 1.2666959302690162,
"grad_norm": 68.8973617553711,
"learning_rate": 1.518925603999373e-05,
"loss": 1.1728,
"step": 20200
},
{
"epoch": 1.272966702201041,
"grad_norm": 2.0211031436920166,
"learning_rate": 1.5154418296782736e-05,
"loss": 1.6058,
"step": 20300
},
{
"epoch": 1.2792374741330659,
"grad_norm": 16.78483009338379,
"learning_rate": 1.511958055357174e-05,
"loss": 1.5191,
"step": 20400
},
{
"epoch": 1.2855082460650906,
"grad_norm": 36.06229019165039,
"learning_rate": 1.5084742810360747e-05,
"loss": 1.4516,
"step": 20500
},
{
"epoch": 1.2855082460650906,
"eval_loss": 1.320965051651001,
"eval_runtime": 237.9506,
"eval_samples_per_second": 536.128,
"eval_steps_per_second": 16.756,
"step": 20500
},
{
"epoch": 1.2917790179971154,
"grad_norm": 0.42136240005493164,
"learning_rate": 1.504990506714975e-05,
"loss": 1.3485,
"step": 20600
},
{
"epoch": 1.2980497899291403,
"grad_norm": 69.18399810791016,
"learning_rate": 1.5015067323938756e-05,
"loss": 1.2598,
"step": 20700
},
{
"epoch": 1.304320561861165,
"grad_norm": 0.9956406354904175,
"learning_rate": 1.4980229580727761e-05,
"loss": 1.5871,
"step": 20800
},
{
"epoch": 1.31059133379319,
"grad_norm": 296.9071044921875,
"learning_rate": 1.4945391837516767e-05,
"loss": 1.1965,
"step": 20900
},
{
"epoch": 1.3168621057252148,
"grad_norm": 135.63108825683594,
"learning_rate": 1.4910554094305772e-05,
"loss": 1.3983,
"step": 21000
},
{
"epoch": 1.3168621057252148,
"eval_loss": 1.2516661882400513,
"eval_runtime": 243.3539,
"eval_samples_per_second": 524.224,
"eval_steps_per_second": 16.384,
"step": 21000
},
{
"epoch": 1.3231328776572395,
"grad_norm": 0.29125073552131653,
"learning_rate": 1.4875716351094778e-05,
"loss": 1.2605,
"step": 21100
},
{
"epoch": 1.3294036495892645,
"grad_norm": 120.13431549072266,
"learning_rate": 1.4840878607883781e-05,
"loss": 1.5629,
"step": 21200
},
{
"epoch": 1.3356744215212892,
"grad_norm": 0.6574529409408569,
"learning_rate": 1.4806040864672787e-05,
"loss": 1.0668,
"step": 21300
},
{
"epoch": 1.3419451934533142,
"grad_norm": 0.08501740545034409,
"learning_rate": 1.4771203121461792e-05,
"loss": 1.1879,
"step": 21400
},
{
"epoch": 1.348215965385339,
"grad_norm": 0.06920505315065384,
"learning_rate": 1.4736365378250798e-05,
"loss": 1.132,
"step": 21500
},
{
"epoch": 1.348215965385339,
"eval_loss": 1.3881497383117676,
"eval_runtime": 239.3274,
"eval_samples_per_second": 533.044,
"eval_steps_per_second": 16.659,
"step": 21500
},
{
"epoch": 1.3544867373173637,
"grad_norm": 119.1258773803711,
"learning_rate": 1.4701876012471915e-05,
"loss": 1.7231,
"step": 21600
},
{
"epoch": 1.3607575092493887,
"grad_norm": 219.5289764404297,
"learning_rate": 1.4667038269260918e-05,
"loss": 1.7636,
"step": 21700
},
{
"epoch": 1.3670282811814134,
"grad_norm": 27.880413055419922,
"learning_rate": 1.4632548903482034e-05,
"loss": 1.1193,
"step": 21800
},
{
"epoch": 1.3732990531134384,
"grad_norm": 5.331712245941162,
"learning_rate": 1.459771116027104e-05,
"loss": 1.4662,
"step": 21900
},
{
"epoch": 1.379569825045463,
"grad_norm": 53.37089538574219,
"learning_rate": 1.4562873417060043e-05,
"loss": 2.0394,
"step": 22000
},
{
"epoch": 1.379569825045463,
"eval_loss": 1.1926569938659668,
"eval_runtime": 241.0069,
"eval_samples_per_second": 529.329,
"eval_steps_per_second": 16.543,
"step": 22000
},
{
"epoch": 1.3858405969774878,
"grad_norm": 22.226316452026367,
"learning_rate": 1.452803567384905e-05,
"loss": 1.1535,
"step": 22100
},
{
"epoch": 1.3921113689095128,
"grad_norm": 2.272599458694458,
"learning_rate": 1.4493197930638054e-05,
"loss": 1.4592,
"step": 22200
},
{
"epoch": 1.3983821408415376,
"grad_norm": 25.961870193481445,
"learning_rate": 1.445836018742706e-05,
"loss": 1.276,
"step": 22300
},
{
"epoch": 1.4046529127735625,
"grad_norm": 73.93904113769531,
"learning_rate": 1.4423522444216065e-05,
"loss": 1.2984,
"step": 22400
},
{
"epoch": 1.4109236847055873,
"grad_norm": 122.38665771484375,
"learning_rate": 1.438868470100507e-05,
"loss": 0.9741,
"step": 22500
},
{
"epoch": 1.4109236847055873,
"eval_loss": 1.2707290649414062,
"eval_runtime": 241.8021,
"eval_samples_per_second": 527.588,
"eval_steps_per_second": 16.489,
"step": 22500
},
{
"epoch": 1.417194456637612,
"grad_norm": 402.4999084472656,
"learning_rate": 1.4353846957794076e-05,
"loss": 1.4253,
"step": 22600
},
{
"epoch": 1.423465228569637,
"grad_norm": 0.6434441208839417,
"learning_rate": 1.4319009214583079e-05,
"loss": 1.0769,
"step": 22700
},
{
"epoch": 1.4297360005016617,
"grad_norm": 43.16348648071289,
"learning_rate": 1.4284171471372085e-05,
"loss": 0.8276,
"step": 22800
},
{
"epoch": 1.4360067724336867,
"grad_norm": 91.6303482055664,
"learning_rate": 1.424933372816109e-05,
"loss": 1.2689,
"step": 22900
},
{
"epoch": 1.4422775443657114,
"grad_norm": 123.81659698486328,
"learning_rate": 1.4214495984950096e-05,
"loss": 1.4817,
"step": 23000
},
{
"epoch": 1.4422775443657114,
"eval_loss": 1.2094941139221191,
"eval_runtime": 235.8497,
"eval_samples_per_second": 540.904,
"eval_steps_per_second": 16.905,
"step": 23000
},
{
"epoch": 1.4485483162977362,
"grad_norm": 33.1621208190918,
"learning_rate": 1.4179658241739101e-05,
"loss": 1.1522,
"step": 23100
},
{
"epoch": 1.4548190882297611,
"grad_norm": 0.6552605628967285,
"learning_rate": 1.4144820498528107e-05,
"loss": 0.8978,
"step": 23200
},
{
"epoch": 1.4610898601617859,
"grad_norm": 1.6054786443710327,
"learning_rate": 1.410998275531711e-05,
"loss": 1.015,
"step": 23300
},
{
"epoch": 1.4673606320938108,
"grad_norm": 30.116901397705078,
"learning_rate": 1.4075145012106117e-05,
"loss": 1.0351,
"step": 23400
},
{
"epoch": 1.4736314040258356,
"grad_norm": 74.74423217773438,
"learning_rate": 1.4040307268895121e-05,
"loss": 1.3959,
"step": 23500
},
{
"epoch": 1.4736314040258356,
"eval_loss": 1.1969189643859863,
"eval_runtime": 238.5287,
"eval_samples_per_second": 534.829,
"eval_steps_per_second": 16.715,
"step": 23500
},
{
"epoch": 1.4799021759578603,
"grad_norm": 4.446337699890137,
"learning_rate": 1.4005469525684128e-05,
"loss": 1.2879,
"step": 23600
},
{
"epoch": 1.4861729478898853,
"grad_norm": 132.61671447753906,
"learning_rate": 1.3970631782473132e-05,
"loss": 1.0651,
"step": 23700
},
{
"epoch": 1.49244371982191,
"grad_norm": 64.33197784423828,
"learning_rate": 1.3935794039262139e-05,
"loss": 1.1601,
"step": 23800
},
{
"epoch": 1.498714491753935,
"grad_norm": 0.5995836853981018,
"learning_rate": 1.3900956296051142e-05,
"loss": 1.0034,
"step": 23900
},
{
"epoch": 1.5049852636859598,
"grad_norm": 0.1931271255016327,
"learning_rate": 1.3866118552840148e-05,
"loss": 1.3386,
"step": 24000
},
{
"epoch": 1.5049852636859598,
"eval_loss": 1.1590368747711182,
"eval_runtime": 241.3271,
"eval_samples_per_second": 528.627,
"eval_steps_per_second": 16.521,
"step": 24000
},
{
"epoch": 1.5112560356179845,
"grad_norm": 38.5876579284668,
"learning_rate": 1.3831280809629153e-05,
"loss": 1.142,
"step": 24100
},
{
"epoch": 1.5175268075500095,
"grad_norm": 8.049750328063965,
"learning_rate": 1.3796443066418159e-05,
"loss": 1.3495,
"step": 24200
},
{
"epoch": 1.5237975794820342,
"grad_norm": 32.30927658081055,
"learning_rate": 1.3761605323207164e-05,
"loss": 0.9993,
"step": 24300
},
{
"epoch": 1.5300683514140592,
"grad_norm": 0.47087952494621277,
"learning_rate": 1.372676757999617e-05,
"loss": 0.9363,
"step": 24400
},
{
"epoch": 1.536339123346084,
"grad_norm": 160.12139892578125,
"learning_rate": 1.3691929836785175e-05,
"loss": 1.4402,
"step": 24500
},
{
"epoch": 1.536339123346084,
"eval_loss": 1.2178274393081665,
"eval_runtime": 241.2634,
"eval_samples_per_second": 528.767,
"eval_steps_per_second": 16.526,
"step": 24500
},
{
"epoch": 1.5426098952781087,
"grad_norm": 43.729827880859375,
"learning_rate": 1.3657092093574181e-05,
"loss": 1.0648,
"step": 24600
},
{
"epoch": 1.5488806672101336,
"grad_norm": 16.396068572998047,
"learning_rate": 1.3622254350363184e-05,
"loss": 1.5102,
"step": 24700
},
{
"epoch": 1.5551514391421584,
"grad_norm": 280.9241027832031,
"learning_rate": 1.358741660715219e-05,
"loss": 1.3415,
"step": 24800
},
{
"epoch": 1.5614222110741833,
"grad_norm": 0.3944130539894104,
"learning_rate": 1.3552578863941195e-05,
"loss": 0.7441,
"step": 24900
},
{
"epoch": 1.567692983006208,
"grad_norm": 242.84613037109375,
"learning_rate": 1.3517741120730201e-05,
"loss": 0.901,
"step": 25000
},
{
"epoch": 1.567692983006208,
"eval_loss": 1.1982382535934448,
"eval_runtime": 241.5382,
"eval_samples_per_second": 528.165,
"eval_steps_per_second": 16.507,
"step": 25000
},
{
"epoch": 1.5739637549382328,
"grad_norm": 61.62953567504883,
"learning_rate": 1.3482903377519206e-05,
"loss": 1.3147,
"step": 25100
},
{
"epoch": 1.5802345268702578,
"grad_norm": 2.465519905090332,
"learning_rate": 1.3448065634308212e-05,
"loss": 0.971,
"step": 25200
},
{
"epoch": 1.5865052988022825,
"grad_norm": 184.7733612060547,
"learning_rate": 1.3413227891097215e-05,
"loss": 0.9988,
"step": 25300
},
{
"epoch": 1.5927760707343075,
"grad_norm": 221.9571533203125,
"learning_rate": 1.337839014788622e-05,
"loss": 1.1445,
"step": 25400
},
{
"epoch": 1.5990468426663322,
"grad_norm": 14.548208236694336,
"learning_rate": 1.3343552404675226e-05,
"loss": 1.1018,
"step": 25500
},
{
"epoch": 1.5990468426663322,
"eval_loss": 1.142329454421997,
"eval_runtime": 238.9747,
"eval_samples_per_second": 533.831,
"eval_steps_per_second": 16.684,
"step": 25500
},
{
"epoch": 1.605317614598357,
"grad_norm": 0.4988707900047302,
"learning_rate": 1.330871466146423e-05,
"loss": 1.0902,
"step": 25600
},
{
"epoch": 1.611588386530382,
"grad_norm": 213.1658477783203,
"learning_rate": 1.3273876918253237e-05,
"loss": 1.2577,
"step": 25700
},
{
"epoch": 1.6178591584624067,
"grad_norm": 74.17716217041016,
"learning_rate": 1.3239039175042242e-05,
"loss": 1.2005,
"step": 25800
},
{
"epoch": 1.6241299303944317,
"grad_norm": 196.46742248535156,
"learning_rate": 1.3204201431831248e-05,
"loss": 1.2839,
"step": 25900
},
{
"epoch": 1.6304007023264564,
"grad_norm": 264.5187072753906,
"learning_rate": 1.3169363688620251e-05,
"loss": 1.4122,
"step": 26000
},
{
"epoch": 1.6304007023264564,
"eval_loss": 1.1125129461288452,
"eval_runtime": 238.2144,
"eval_samples_per_second": 535.534,
"eval_steps_per_second": 16.737,
"step": 26000
},
{
"epoch": 1.6366714742584811,
"grad_norm": 0.5429248213768005,
"learning_rate": 1.3134525945409257e-05,
"loss": 0.7832,
"step": 26100
},
{
"epoch": 1.642942246190506,
"grad_norm": 0.07243086397647858,
"learning_rate": 1.3099688202198262e-05,
"loss": 1.3278,
"step": 26200
},
{
"epoch": 1.6492130181225308,
"grad_norm": 176.74636840820312,
"learning_rate": 1.3064850458987268e-05,
"loss": 1.2055,
"step": 26300
},
{
"epoch": 1.6554837900545558,
"grad_norm": 1.1564711332321167,
"learning_rate": 1.3030012715776273e-05,
"loss": 1.5814,
"step": 26400
},
{
"epoch": 1.6617545619865806,
"grad_norm": 0.3095082640647888,
"learning_rate": 1.299517497256528e-05,
"loss": 1.0393,
"step": 26500
},
{
"epoch": 1.6617545619865806,
"eval_loss": 1.0945708751678467,
"eval_runtime": 240.297,
"eval_samples_per_second": 530.893,
"eval_steps_per_second": 16.592,
"step": 26500
},
{
"epoch": 1.6680253339186053,
"grad_norm": 0.8863621354103088,
"learning_rate": 1.2960337229354282e-05,
"loss": 1.4531,
"step": 26600
},
{
"epoch": 1.6742961058506303,
"grad_norm": 0.15211889147758484,
"learning_rate": 1.2925499486143289e-05,
"loss": 1.4162,
"step": 26700
},
{
"epoch": 1.680566877782655,
"grad_norm": 0.271015465259552,
"learning_rate": 1.2890661742932293e-05,
"loss": 0.8498,
"step": 26800
},
{
"epoch": 1.68683764971468,
"grad_norm": 1.462451457977295,
"learning_rate": 1.285617237715341e-05,
"loss": 1.1318,
"step": 26900
},
{
"epoch": 1.6931084216467047,
"grad_norm": 1.1144922971725464,
"learning_rate": 1.2821334633942416e-05,
"loss": 1.3287,
"step": 27000
},
{
"epoch": 1.6931084216467047,
"eval_loss": 1.0439221858978271,
"eval_runtime": 239.1496,
"eval_samples_per_second": 533.44,
"eval_steps_per_second": 16.672,
"step": 27000
},
{
"epoch": 1.6993791935787295,
"grad_norm": 1.3803671598434448,
"learning_rate": 1.2786496890731419e-05,
"loss": 1.0886,
"step": 27100
},
{
"epoch": 1.7056499655107544,
"grad_norm": 51.79226303100586,
"learning_rate": 1.2752007524952535e-05,
"loss": 0.8991,
"step": 27200
},
{
"epoch": 1.7119207374427792,
"grad_norm": 17.195894241333008,
"learning_rate": 1.2717169781741541e-05,
"loss": 0.7563,
"step": 27300
},
{
"epoch": 1.7181915093748041,
"grad_norm": 0.548939049243927,
"learning_rate": 1.2682332038530544e-05,
"loss": 0.9284,
"step": 27400
},
{
"epoch": 1.7244622813068289,
"grad_norm": 3.179530620574951,
"learning_rate": 1.264749429531955e-05,
"loss": 1.3388,
"step": 27500
},
{
"epoch": 1.7244622813068289,
"eval_loss": 1.0940054655075073,
"eval_runtime": 239.4702,
"eval_samples_per_second": 532.726,
"eval_steps_per_second": 16.649,
"step": 27500
},
{
"epoch": 1.7307330532388536,
"grad_norm": 0.8089356422424316,
"learning_rate": 1.2612656552108555e-05,
"loss": 1.2951,
"step": 27600
},
{
"epoch": 1.7370038251708786,
"grad_norm": 698.0848388671875,
"learning_rate": 1.2577818808897562e-05,
"loss": 0.9789,
"step": 27700
},
{
"epoch": 1.7432745971029033,
"grad_norm": 156.7066192626953,
"learning_rate": 1.2542981065686566e-05,
"loss": 1.2898,
"step": 27800
},
{
"epoch": 1.7495453690349283,
"grad_norm": 59.603519439697266,
"learning_rate": 1.2508143322475569e-05,
"loss": 0.9915,
"step": 27900
},
{
"epoch": 1.755816140966953,
"grad_norm": 5.36550760269165,
"learning_rate": 1.2473305579264575e-05,
"loss": 1.5349,
"step": 28000
},
{
"epoch": 1.755816140966953,
"eval_loss": 1.0266426801681519,
"eval_runtime": 240.254,
"eval_samples_per_second": 530.988,
"eval_steps_per_second": 16.595,
"step": 28000
},
{
"epoch": 1.7620869128989778,
"grad_norm": 3.0849006175994873,
"learning_rate": 1.243846783605358e-05,
"loss": 1.124,
"step": 28100
},
{
"epoch": 1.7683576848310028,
"grad_norm": 2.890775442123413,
"learning_rate": 1.2403630092842586e-05,
"loss": 0.809,
"step": 28200
},
{
"epoch": 1.7746284567630275,
"grad_norm": 0.6994801163673401,
"learning_rate": 1.2368792349631591e-05,
"loss": 0.9617,
"step": 28300
},
{
"epoch": 1.7808992286950525,
"grad_norm": 14.703944206237793,
"learning_rate": 1.2333954606420597e-05,
"loss": 1.3061,
"step": 28400
},
{
"epoch": 1.7871700006270772,
"grad_norm": 188.39633178710938,
"learning_rate": 1.2299116863209602e-05,
"loss": 1.1323,
"step": 28500
},
{
"epoch": 1.7871700006270772,
"eval_loss": 1.0488332509994507,
"eval_runtime": 240.6796,
"eval_samples_per_second": 530.049,
"eval_steps_per_second": 16.566,
"step": 28500
},
{
"epoch": 1.793440772559102,
"grad_norm": 12.853857040405273,
"learning_rate": 1.2264279119998608e-05,
"loss": 1.2991,
"step": 28600
},
{
"epoch": 1.7997115444911267,
"grad_norm": 17.315292358398438,
"learning_rate": 1.2229441376787611e-05,
"loss": 0.8708,
"step": 28700
},
{
"epoch": 1.8059823164231517,
"grad_norm": 24.514192581176758,
"learning_rate": 1.2194603633576618e-05,
"loss": 0.7493,
"step": 28800
},
{
"epoch": 1.8122530883551766,
"grad_norm": 17.776947021484375,
"learning_rate": 1.2159765890365622e-05,
"loss": 1.004,
"step": 28900
},
{
"epoch": 1.8185238602872014,
"grad_norm": 154.2757110595703,
"learning_rate": 1.2124928147154629e-05,
"loss": 1.1477,
"step": 29000
},
{
"epoch": 1.8185238602872014,
"eval_loss": 1.0206255912780762,
"eval_runtime": 238.9764,
"eval_samples_per_second": 533.827,
"eval_steps_per_second": 16.684,
"step": 29000
},
{
"epoch": 1.824794632219226,
"grad_norm": 174.9512939453125,
"learning_rate": 1.2090090403943633e-05,
"loss": 1.1826,
"step": 29100
},
{
"epoch": 1.8310654041512509,
"grad_norm": 251.60848999023438,
"learning_rate": 1.205525266073264e-05,
"loss": 1.0961,
"step": 29200
},
{
"epoch": 1.8373361760832758,
"grad_norm": 15.37478256225586,
"learning_rate": 1.2020414917521643e-05,
"loss": 1.4743,
"step": 29300
},
{
"epoch": 1.8436069480153008,
"grad_norm": 17.250076293945312,
"learning_rate": 1.1985577174310649e-05,
"loss": 0.8413,
"step": 29400
},
{
"epoch": 1.8498777199473255,
"grad_norm": 0.08943232893943787,
"learning_rate": 1.1950739431099654e-05,
"loss": 1.2623,
"step": 29500
},
{
"epoch": 1.8498777199473255,
"eval_loss": 1.004668951034546,
"eval_runtime": 241.047,
"eval_samples_per_second": 529.241,
"eval_steps_per_second": 16.54,
"step": 29500
},
{
"epoch": 1.8561484918793503,
"grad_norm": 66.96379089355469,
"learning_rate": 1.191590168788866e-05,
"loss": 0.8486,
"step": 29600
},
{
"epoch": 1.862419263811375,
"grad_norm": 62.850799560546875,
"learning_rate": 1.1881063944677665e-05,
"loss": 1.4481,
"step": 29700
},
{
"epoch": 1.8686900357434,
"grad_norm": 1.5179458856582642,
"learning_rate": 1.1846226201466671e-05,
"loss": 1.2704,
"step": 29800
},
{
"epoch": 1.874960807675425,
"grad_norm": 0.09656574577093124,
"learning_rate": 1.1811388458255676e-05,
"loss": 1.1913,
"step": 29900
},
{
"epoch": 1.8812315796074497,
"grad_norm": 0.12182077020406723,
"learning_rate": 1.1776550715044682e-05,
"loss": 0.9369,
"step": 30000
},
{
"epoch": 1.8812315796074497,
"eval_loss": 1.0277103185653687,
"eval_runtime": 240.7265,
"eval_samples_per_second": 529.946,
"eval_steps_per_second": 16.562,
"step": 30000
},
{
"epoch": 1.8875023515394744,
"grad_norm": 171.4630126953125,
"learning_rate": 1.1741712971833685e-05,
"loss": 1.2427,
"step": 30100
},
{
"epoch": 1.8937731234714992,
"grad_norm": 14.272507667541504,
"learning_rate": 1.1706875228622691e-05,
"loss": 1.0576,
"step": 30200
},
{
"epoch": 1.9000438954035241,
"grad_norm": 8.003202438354492,
"learning_rate": 1.1672037485411696e-05,
"loss": 0.9188,
"step": 30300
},
{
"epoch": 1.906314667335549,
"grad_norm": 72.2535629272461,
"learning_rate": 1.1637199742200702e-05,
"loss": 1.3227,
"step": 30400
},
{
"epoch": 1.9125854392675739,
"grad_norm": 60.970176696777344,
"learning_rate": 1.1602361998989707e-05,
"loss": 1.4614,
"step": 30500
},
{
"epoch": 1.9125854392675739,
"eval_loss": 1.0549676418304443,
"eval_runtime": 232.0087,
"eval_samples_per_second": 549.859,
"eval_steps_per_second": 17.185,
"step": 30500
},
{
"epoch": 1.9188562111995986,
"grad_norm": 113.54409790039062,
"learning_rate": 1.1567524255778713e-05,
"loss": 1.2316,
"step": 30600
},
{
"epoch": 1.9251269831316233,
"grad_norm": 1.6219086647033691,
"learning_rate": 1.1532686512567716e-05,
"loss": 0.9487,
"step": 30700
},
{
"epoch": 1.9313977550636483,
"grad_norm": 74.66547393798828,
"learning_rate": 1.1497848769356722e-05,
"loss": 1.1651,
"step": 30800
},
{
"epoch": 1.9376685269956733,
"grad_norm": 0.036245282739400864,
"learning_rate": 1.1463011026145727e-05,
"loss": 1.1622,
"step": 30900
},
{
"epoch": 1.943939298927698,
"grad_norm": 1.6117188930511475,
"learning_rate": 1.1428173282934732e-05,
"loss": 1.1801,
"step": 31000
},
{
"epoch": 1.943939298927698,
"eval_loss": 0.9981088042259216,
"eval_runtime": 241.2373,
"eval_samples_per_second": 528.824,
"eval_steps_per_second": 16.527,
"step": 31000
},
{
"epoch": 1.9502100708597228,
"grad_norm": 4.923341751098633,
"learning_rate": 1.1393335539723738e-05,
"loss": 0.8798,
"step": 31100
},
{
"epoch": 1.9564808427917475,
"grad_norm": 214.46116638183594,
"learning_rate": 1.1358497796512741e-05,
"loss": 0.7196,
"step": 31200
},
{
"epoch": 1.9627516147237725,
"grad_norm": 16.161603927612305,
"learning_rate": 1.1323660053301749e-05,
"loss": 1.2003,
"step": 31300
},
{
"epoch": 1.9690223866557974,
"grad_norm": 249.83189392089844,
"learning_rate": 1.1289170687522864e-05,
"loss": 1.1823,
"step": 31400
},
{
"epoch": 1.9752931585878222,
"grad_norm": 18.310449600219727,
"learning_rate": 1.1254332944311868e-05,
"loss": 1.1453,
"step": 31500
},
{
"epoch": 1.9752931585878222,
"eval_loss": 1.0320409536361694,
"eval_runtime": 237.3095,
"eval_samples_per_second": 537.576,
"eval_steps_per_second": 16.801,
"step": 31500
},
{
"epoch": 1.981563930519847,
"grad_norm": 221.7801513671875,
"learning_rate": 1.1219495201100875e-05,
"loss": 1.4751,
"step": 31600
},
{
"epoch": 1.9878347024518717,
"grad_norm": 20.95890235900879,
"learning_rate": 1.1184657457889878e-05,
"loss": 0.8502,
"step": 31700
},
{
"epoch": 1.9941054743838966,
"grad_norm": 2.7732744216918945,
"learning_rate": 1.1149819714678884e-05,
"loss": 0.8757,
"step": 31800
},
{
"epoch": 2.0003762463159216,
"grad_norm": 1.1170719861984253,
"learning_rate": 1.1114981971467889e-05,
"loss": 1.0489,
"step": 31900
},
{
"epoch": 2.0066470182479463,
"grad_norm": 31.308385848999023,
"learning_rate": 1.1080144228256895e-05,
"loss": 1.4672,
"step": 32000
},
{
"epoch": 2.0066470182479463,
"eval_loss": 1.0570933818817139,
"eval_runtime": 236.2248,
"eval_samples_per_second": 540.045,
"eval_steps_per_second": 16.878,
"step": 32000
},
{
"epoch": 2.012917790179971,
"grad_norm": 0.2743261754512787,
"learning_rate": 1.10453064850459e-05,
"loss": 0.9474,
"step": 32100
},
{
"epoch": 2.019188562111996,
"grad_norm": 2.2496840953826904,
"learning_rate": 1.1010468741834906e-05,
"loss": 0.8037,
"step": 32200
},
{
"epoch": 2.025459334044021,
"grad_norm": 32.999935150146484,
"learning_rate": 1.0975630998623909e-05,
"loss": 0.9782,
"step": 32300
},
{
"epoch": 2.0317301059760458,
"grad_norm": 19.94236183166504,
"learning_rate": 1.0940793255412915e-05,
"loss": 0.6943,
"step": 32400
},
{
"epoch": 2.0380008779080705,
"grad_norm": 0.7693130373954773,
"learning_rate": 1.090595551220192e-05,
"loss": 1.0097,
"step": 32500
},
{
"epoch": 2.0380008779080705,
"eval_loss": 0.9797225594520569,
"eval_runtime": 237.8696,
"eval_samples_per_second": 536.311,
"eval_steps_per_second": 16.761,
"step": 32500
},
{
"epoch": 2.0442716498400952,
"grad_norm": 156.60507202148438,
"learning_rate": 1.0871117768990926e-05,
"loss": 0.9067,
"step": 32600
},
{
"epoch": 2.05054242177212,
"grad_norm": 45.05233383178711,
"learning_rate": 1.083628002577993e-05,
"loss": 1.09,
"step": 32700
},
{
"epoch": 2.056813193704145,
"grad_norm": 0.9790059328079224,
"learning_rate": 1.0801442282568937e-05,
"loss": 0.8464,
"step": 32800
},
{
"epoch": 2.06308396563617,
"grad_norm": 311.8387145996094,
"learning_rate": 1.0766604539357942e-05,
"loss": 0.9359,
"step": 32900
},
{
"epoch": 2.0693547375681947,
"grad_norm": 2.4389493465423584,
"learning_rate": 1.0731766796146948e-05,
"loss": 0.813,
"step": 33000
},
{
"epoch": 2.0693547375681947,
"eval_loss": 0.990721583366394,
"eval_runtime": 239.867,
"eval_samples_per_second": 531.845,
"eval_steps_per_second": 16.622,
"step": 33000
},
{
"epoch": 2.0756255095002194,
"grad_norm": 40.27507781982422,
"learning_rate": 1.0696929052935951e-05,
"loss": 0.8738,
"step": 33100
},
{
"epoch": 2.081896281432244,
"grad_norm": 0.029316190630197525,
"learning_rate": 1.0662091309724957e-05,
"loss": 0.8178,
"step": 33200
},
{
"epoch": 2.0881670533642693,
"grad_norm": 0.06512907892465591,
"learning_rate": 1.0627253566513962e-05,
"loss": 1.1704,
"step": 33300
},
{
"epoch": 2.094437825296294,
"grad_norm": 14.495019912719727,
"learning_rate": 1.0592415823302968e-05,
"loss": 1.0073,
"step": 33400
},
{
"epoch": 2.100708597228319,
"grad_norm": 85.92517852783203,
"learning_rate": 1.0557578080091973e-05,
"loss": 1.1849,
"step": 33500
},
{
"epoch": 2.100708597228319,
"eval_loss": 0.9582126140594482,
"eval_runtime": 238.5255,
"eval_samples_per_second": 534.836,
"eval_steps_per_second": 16.715,
"step": 33500
},
{
"epoch": 2.1069793691603436,
"grad_norm": 0.8284154534339905,
"learning_rate": 1.0522740336880976e-05,
"loss": 0.7795,
"step": 33600
},
{
"epoch": 2.1132501410923683,
"grad_norm": 3.656404972076416,
"learning_rate": 1.0487902593669982e-05,
"loss": 0.7688,
"step": 33700
},
{
"epoch": 2.1195209130243935,
"grad_norm": 0.08456479012966156,
"learning_rate": 1.0453064850458987e-05,
"loss": 0.9465,
"step": 33800
},
{
"epoch": 2.1257916849564182,
"grad_norm": 27.962339401245117,
"learning_rate": 1.0418227107247993e-05,
"loss": 1.0883,
"step": 33900
},
{
"epoch": 2.132062456888443,
"grad_norm": 37.31398010253906,
"learning_rate": 1.0383389364036998e-05,
"loss": 0.7711,
"step": 34000
},
{
"epoch": 2.132062456888443,
"eval_loss": 0.955656886100769,
"eval_runtime": 237.3977,
"eval_samples_per_second": 537.377,
"eval_steps_per_second": 16.795,
"step": 34000
},
{
"epoch": 2.1383332288204677,
"grad_norm": 3.700526714324951,
"learning_rate": 1.0348551620826004e-05,
"loss": 0.9767,
"step": 34100
},
{
"epoch": 2.1446040007524925,
"grad_norm": 111.15718841552734,
"learning_rate": 1.0313713877615009e-05,
"loss": 0.6702,
"step": 34200
},
{
"epoch": 2.1508747726845177,
"grad_norm": 0.5821614861488342,
"learning_rate": 1.0278876134404015e-05,
"loss": 0.9444,
"step": 34300
},
{
"epoch": 2.1571455446165424,
"grad_norm": 20.9290771484375,
"learning_rate": 1.0244038391193018e-05,
"loss": 0.8741,
"step": 34400
},
{
"epoch": 2.163416316548567,
"grad_norm": 52.165771484375,
"learning_rate": 1.0209200647982025e-05,
"loss": 1.0717,
"step": 34500
},
{
"epoch": 2.163416316548567,
"eval_loss": 0.9526209831237793,
"eval_runtime": 235.6861,
"eval_samples_per_second": 541.279,
"eval_steps_per_second": 16.917,
"step": 34500
},
{
"epoch": 2.169687088480592,
"grad_norm": 0.01671871915459633,
"learning_rate": 1.017436290477103e-05,
"loss": 0.8584,
"step": 34600
},
{
"epoch": 2.1759578604126166,
"grad_norm": 12.125747680664062,
"learning_rate": 1.0139525161560035e-05,
"loss": 0.8926,
"step": 34700
},
{
"epoch": 2.182228632344642,
"grad_norm": 114.18839263916016,
"learning_rate": 1.010468741834904e-05,
"loss": 0.8567,
"step": 34800
},
{
"epoch": 2.1884994042766666,
"grad_norm": 0.2531642019748688,
"learning_rate": 1.0069849675138046e-05,
"loss": 0.71,
"step": 34900
},
{
"epoch": 2.1947701762086913,
"grad_norm": 160.1878662109375,
"learning_rate": 1.0035360309359161e-05,
"loss": 1.1285,
"step": 35000
},
{
"epoch": 2.1947701762086913,
"eval_loss": 0.958905816078186,
"eval_runtime": 235.6565,
"eval_samples_per_second": 541.347,
"eval_steps_per_second": 16.919,
"step": 35000
},
{
"epoch": 2.201040948140716,
"grad_norm": 42.54741287231445,
"learning_rate": 1.0000522566148166e-05,
"loss": 0.8999,
"step": 35100
},
{
"epoch": 2.207311720072741,
"grad_norm": 291.0119323730469,
"learning_rate": 9.96568482293717e-06,
"loss": 0.8459,
"step": 35200
},
{
"epoch": 2.213582492004766,
"grad_norm": 3.8935604095458984,
"learning_rate": 9.930847079726175e-06,
"loss": 1.0608,
"step": 35300
},
{
"epoch": 2.2198532639367907,
"grad_norm": 73.73111724853516,
"learning_rate": 9.896009336515181e-06,
"loss": 0.6115,
"step": 35400
},
{
"epoch": 2.2261240358688155,
"grad_norm": 137.14573669433594,
"learning_rate": 9.861171593304186e-06,
"loss": 1.2468,
"step": 35500
},
{
"epoch": 2.2261240358688155,
"eval_loss": 0.9768953323364258,
"eval_runtime": 237.6341,
"eval_samples_per_second": 536.842,
"eval_steps_per_second": 16.778,
"step": 35500
},
{
"epoch": 2.23239480780084,
"grad_norm": 72.25751495361328,
"learning_rate": 9.826333850093192e-06,
"loss": 0.9987,
"step": 35600
},
{
"epoch": 2.238665579732865,
"grad_norm": 310.7902526855469,
"learning_rate": 9.791496106882197e-06,
"loss": 0.9186,
"step": 35700
},
{
"epoch": 2.24493635166489,
"grad_norm": 0.11791533976793289,
"learning_rate": 9.756658363671202e-06,
"loss": 1.0505,
"step": 35800
},
{
"epoch": 2.251207123596915,
"grad_norm": 43.25834274291992,
"learning_rate": 9.721820620460208e-06,
"loss": 0.6253,
"step": 35900
},
{
"epoch": 2.2574778955289396,
"grad_norm": 29.648263931274414,
"learning_rate": 9.686982877249213e-06,
"loss": 0.6523,
"step": 36000
},
{
"epoch": 2.2574778955289396,
"eval_loss": 0.9501162171363831,
"eval_runtime": 238.1223,
"eval_samples_per_second": 535.742,
"eval_steps_per_second": 16.743,
"step": 36000
},
{
"epoch": 2.2637486674609644,
"grad_norm": 5.313396453857422,
"learning_rate": 9.652145134038217e-06,
"loss": 0.8252,
"step": 36100
},
{
"epoch": 2.270019439392989,
"grad_norm": 0.04373766854405403,
"learning_rate": 9.617307390827224e-06,
"loss": 0.9793,
"step": 36200
},
{
"epoch": 2.2762902113250143,
"grad_norm": 118.00153350830078,
"learning_rate": 9.582469647616228e-06,
"loss": 0.8845,
"step": 36300
},
{
"epoch": 2.282560983257039,
"grad_norm": 99.67394256591797,
"learning_rate": 9.547631904405233e-06,
"loss": 1.0121,
"step": 36400
},
{
"epoch": 2.288831755189064,
"grad_norm": 0.7632407546043396,
"learning_rate": 9.51279416119424e-06,
"loss": 0.9849,
"step": 36500
},
{
"epoch": 2.288831755189064,
"eval_loss": 0.9245060086250305,
"eval_runtime": 237.5388,
"eval_samples_per_second": 537.058,
"eval_steps_per_second": 16.785,
"step": 36500
},
{
"epoch": 2.2951025271210885,
"grad_norm": 0.21792149543762207,
"learning_rate": 9.477956417983244e-06,
"loss": 1.2937,
"step": 36600
},
{
"epoch": 2.3013732990531133,
"grad_norm": 161.54714965820312,
"learning_rate": 9.443118674772248e-06,
"loss": 1.0484,
"step": 36700
},
{
"epoch": 2.3076440709851385,
"grad_norm": 1.5865380764007568,
"learning_rate": 9.408280931561255e-06,
"loss": 0.8801,
"step": 36800
},
{
"epoch": 2.313914842917163,
"grad_norm": 52.73973846435547,
"learning_rate": 9.37344318835026e-06,
"loss": 0.7552,
"step": 36900
},
{
"epoch": 2.320185614849188,
"grad_norm": 72.2259750366211,
"learning_rate": 9.338605445139266e-06,
"loss": 0.7641,
"step": 37000
},
{
"epoch": 2.320185614849188,
"eval_loss": 0.9280443787574768,
"eval_runtime": 234.7895,
"eval_samples_per_second": 543.346,
"eval_steps_per_second": 16.981,
"step": 37000
},
{
"epoch": 2.3264563867812127,
"grad_norm": 161.67674255371094,
"learning_rate": 9.30376770192827e-06,
"loss": 0.883,
"step": 37100
},
{
"epoch": 2.3327271587132374,
"grad_norm": 0.07621905952692032,
"learning_rate": 9.269278336149385e-06,
"loss": 0.77,
"step": 37200
},
{
"epoch": 2.3389979306452626,
"grad_norm": 0.2586478292942047,
"learning_rate": 9.234440592938391e-06,
"loss": 1.2699,
"step": 37300
},
{
"epoch": 2.3452687025772874,
"grad_norm": 79.81159973144531,
"learning_rate": 9.199602849727396e-06,
"loss": 0.8766,
"step": 37400
},
{
"epoch": 2.351539474509312,
"grad_norm": 7.059108257293701,
"learning_rate": 9.1647651065164e-06,
"loss": 1.1154,
"step": 37500
},
{
"epoch": 2.351539474509312,
"eval_loss": 0.962340772151947,
"eval_runtime": 238.8795,
"eval_samples_per_second": 534.043,
"eval_steps_per_second": 16.69,
"step": 37500
},
{
"epoch": 2.357810246441337,
"grad_norm": 1.4081709384918213,
"learning_rate": 9.129927363305405e-06,
"loss": 1.0634,
"step": 37600
},
{
"epoch": 2.3640810183733616,
"grad_norm": 0.605450451374054,
"learning_rate": 9.09508962009441e-06,
"loss": 0.8822,
"step": 37700
},
{
"epoch": 2.370351790305387,
"grad_norm": 1.7804793119430542,
"learning_rate": 9.060251876883416e-06,
"loss": 0.839,
"step": 37800
},
{
"epoch": 2.3766225622374115,
"grad_norm": 0.285157710313797,
"learning_rate": 9.025414133672421e-06,
"loss": 0.684,
"step": 37900
},
{
"epoch": 2.3828933341694363,
"grad_norm": 1.6291695833206177,
"learning_rate": 8.990576390461425e-06,
"loss": 0.8051,
"step": 38000
},
{
"epoch": 2.3828933341694363,
"eval_loss": 0.9198396801948547,
"eval_runtime": 235.4699,
"eval_samples_per_second": 541.776,
"eval_steps_per_second": 16.932,
"step": 38000
},
{
"epoch": 2.389164106101461,
"grad_norm": 0.22198112308979034,
"learning_rate": 8.955738647250432e-06,
"loss": 0.9585,
"step": 38100
},
{
"epoch": 2.3954348780334858,
"grad_norm": 0.15497685968875885,
"learning_rate": 8.920900904039436e-06,
"loss": 0.7156,
"step": 38200
},
{
"epoch": 2.401705649965511,
"grad_norm": 3.716522216796875,
"learning_rate": 8.886063160828443e-06,
"loss": 0.5271,
"step": 38300
},
{
"epoch": 2.4079764218975357,
"grad_norm": 211.54660034179688,
"learning_rate": 8.851225417617447e-06,
"loss": 0.805,
"step": 38400
},
{
"epoch": 2.4142471938295604,
"grad_norm": 104.68868255615234,
"learning_rate": 8.816387674406452e-06,
"loss": 0.7898,
"step": 38500
},
{
"epoch": 2.4142471938295604,
"eval_loss": 0.8785400986671448,
"eval_runtime": 236.3653,
"eval_samples_per_second": 539.724,
"eval_steps_per_second": 16.868,
"step": 38500
},
{
"epoch": 2.420517965761585,
"grad_norm": 117.63562774658203,
"learning_rate": 8.781549931195458e-06,
"loss": 0.6935,
"step": 38600
},
{
"epoch": 2.42678873769361,
"grad_norm": 6.395357131958008,
"learning_rate": 8.746712187984463e-06,
"loss": 0.8011,
"step": 38700
},
{
"epoch": 2.433059509625635,
"grad_norm": 146.0078582763672,
"learning_rate": 8.711874444773468e-06,
"loss": 0.9812,
"step": 38800
},
{
"epoch": 2.43933028155766,
"grad_norm": 0.07249762117862701,
"learning_rate": 8.677036701562474e-06,
"loss": 0.4427,
"step": 38900
},
{
"epoch": 2.4456010534896846,
"grad_norm": 113.86747741699219,
"learning_rate": 8.642198958351479e-06,
"loss": 0.492,
"step": 39000
},
{
"epoch": 2.4456010534896846,
"eval_loss": 0.9312570095062256,
"eval_runtime": 235.7259,
"eval_samples_per_second": 541.188,
"eval_steps_per_second": 16.914,
"step": 39000
},
{
"epoch": 2.4518718254217093,
"grad_norm": 174.28895568847656,
"learning_rate": 8.607361215140483e-06,
"loss": 0.47,
"step": 39100
},
{
"epoch": 2.458142597353734,
"grad_norm": 1.0906648635864258,
"learning_rate": 8.5728718493616e-06,
"loss": 1.1876,
"step": 39200
},
{
"epoch": 2.4644133692857593,
"grad_norm": 1.2390027046203613,
"learning_rate": 8.538034106150604e-06,
"loss": 0.5778,
"step": 39300
},
{
"epoch": 2.470684141217784,
"grad_norm": 8.68694019317627,
"learning_rate": 8.503196362939609e-06,
"loss": 0.6763,
"step": 39400
},
{
"epoch": 2.4769549131498088,
"grad_norm": 0.0290305744856596,
"learning_rate": 8.468358619728615e-06,
"loss": 0.6896,
"step": 39500
},
{
"epoch": 2.4769549131498088,
"eval_loss": 0.8978257179260254,
"eval_runtime": 238.5786,
"eval_samples_per_second": 534.717,
"eval_steps_per_second": 16.711,
"step": 39500
},
{
"epoch": 2.4832256850818335,
"grad_norm": NaN,
"learning_rate": 8.43386925394973e-06,
"loss": 0.8905,
"step": 39600
},
{
"epoch": 2.4894964570138582,
"grad_norm": 0.6685202121734619,
"learning_rate": 8.399031510738736e-06,
"loss": 0.7845,
"step": 39700
},
{
"epoch": 2.4957672289458834,
"grad_norm": 0.6609179377555847,
"learning_rate": 8.36419376752774e-06,
"loss": 0.8691,
"step": 39800
},
{
"epoch": 2.502038000877908,
"grad_norm": 0.44005250930786133,
"learning_rate": 8.329356024316745e-06,
"loss": 0.55,
"step": 39900
},
{
"epoch": 2.508308772809933,
"grad_norm": 191.84471130371094,
"learning_rate": 8.294518281105752e-06,
"loss": 0.6978,
"step": 40000
},
{
"epoch": 2.508308772809933,
"eval_loss": 0.9054428935050964,
"eval_runtime": 236.0808,
"eval_samples_per_second": 540.374,
"eval_steps_per_second": 16.888,
"step": 40000
},
{
"epoch": 2.5145795447419577,
"grad_norm": 20.063995361328125,
"learning_rate": 8.259680537894755e-06,
"loss": 0.6378,
"step": 40100
},
{
"epoch": 2.5208503166739824,
"grad_norm": 1.4460866451263428,
"learning_rate": 8.224842794683761e-06,
"loss": 0.895,
"step": 40200
},
{
"epoch": 2.527121088606007,
"grad_norm": 0.06669195741415024,
"learning_rate": 8.190353428904876e-06,
"loss": 0.9683,
"step": 40300
},
{
"epoch": 2.5333918605380323,
"grad_norm": 80.40859985351562,
"learning_rate": 8.155515685693882e-06,
"loss": 0.9373,
"step": 40400
},
{
"epoch": 2.539662632470057,
"grad_norm": 0.014817653223872185,
"learning_rate": 8.120677942482887e-06,
"loss": 0.7406,
"step": 40500
},
{
"epoch": 2.539662632470057,
"eval_loss": 0.912805438041687,
"eval_runtime": 230.5789,
"eval_samples_per_second": 553.268,
"eval_steps_per_second": 17.291,
"step": 40500
},
{
"epoch": 2.545933404402082,
"grad_norm": 41.673622131347656,
"learning_rate": 8.085840199271891e-06,
"loss": 0.8917,
"step": 40600
},
{
"epoch": 2.5522041763341066,
"grad_norm": 213.1597900390625,
"learning_rate": 8.051002456060898e-06,
"loss": 1.0552,
"step": 40700
},
{
"epoch": 2.5584749482661318,
"grad_norm": 65.40398406982422,
"learning_rate": 8.016164712849902e-06,
"loss": 0.5281,
"step": 40800
},
{
"epoch": 2.5647457201981565,
"grad_norm": 4.673154830932617,
"learning_rate": 7.981326969638907e-06,
"loss": 0.9064,
"step": 40900
},
{
"epoch": 2.5710164921301812,
"grad_norm": 187.15573120117188,
"learning_rate": 7.946489226427913e-06,
"loss": 0.6886,
"step": 41000
},
{
"epoch": 2.5710164921301812,
"eval_loss": 0.9048876166343689,
"eval_runtime": 269.6795,
"eval_samples_per_second": 473.05,
"eval_steps_per_second": 14.784,
"step": 41000
},
{
"epoch": 2.577287264062206,
"grad_norm": 0.14457735419273376,
"learning_rate": 7.911651483216918e-06,
"loss": 0.7166,
"step": 41100
},
{
"epoch": 2.5835580359942307,
"grad_norm": 126.45314025878906,
"learning_rate": 7.876813740005922e-06,
"loss": 0.8343,
"step": 41200
},
{
"epoch": 2.5898288079262555,
"grad_norm": 0.15031389892101288,
"learning_rate": 7.841975996794929e-06,
"loss": 0.9468,
"step": 41300
},
{
"epoch": 2.5960995798582807,
"grad_norm": 0.14378446340560913,
"learning_rate": 7.807138253583933e-06,
"loss": 0.8529,
"step": 41400
},
{
"epoch": 2.6023703517903054,
"grad_norm": 0.031118595972657204,
"learning_rate": 7.772300510372938e-06,
"loss": 0.8092,
"step": 41500
},
{
"epoch": 2.6023703517903054,
"eval_loss": 0.8954480886459351,
"eval_runtime": 246.3441,
"eval_samples_per_second": 517.861,
"eval_steps_per_second": 16.185,
"step": 41500
},
{
"epoch": 2.60864112372233,
"grad_norm": 17.187223434448242,
"learning_rate": 7.737462767161944e-06,
"loss": 0.8501,
"step": 41600
},
{
"epoch": 2.614911895654355,
"grad_norm": 3.00113844871521,
"learning_rate": 7.702625023950949e-06,
"loss": 0.9877,
"step": 41700
},
{
"epoch": 2.62118266758638,
"grad_norm": 0.45281580090522766,
"learning_rate": 7.667787280739954e-06,
"loss": 0.8592,
"step": 41800
},
{
"epoch": 2.627453439518405,
"grad_norm": 79.49444580078125,
"learning_rate": 7.63294953752896e-06,
"loss": 0.8632,
"step": 41900
},
{
"epoch": 2.6337242114504296,
"grad_norm": 0.05600200593471527,
"learning_rate": 7.598111794317965e-06,
"loss": 0.6766,
"step": 42000
},
{
"epoch": 2.6337242114504296,
"eval_loss": 0.8706979751586914,
"eval_runtime": 245.6205,
"eval_samples_per_second": 519.387,
"eval_steps_per_second": 16.232,
"step": 42000
},
{
"epoch": 2.6399949833824543,
"grad_norm": 20.844148635864258,
"learning_rate": 7.56327405110697e-06,
"loss": 0.7587,
"step": 42100
},
{
"epoch": 2.646265755314479,
"grad_norm": 0.24995607137680054,
"learning_rate": 7.528436307895976e-06,
"loss": 0.8949,
"step": 42200
},
{
"epoch": 2.652536527246504,
"grad_norm": 80.21415710449219,
"learning_rate": 7.49359856468498e-06,
"loss": 0.4173,
"step": 42300
},
{
"epoch": 2.658807299178529,
"grad_norm": 24.900297164916992,
"learning_rate": 7.458760821473986e-06,
"loss": 0.5995,
"step": 42400
},
{
"epoch": 2.6650780711105537,
"grad_norm": 231.90145874023438,
"learning_rate": 7.423923078262991e-06,
"loss": 0.8157,
"step": 42500
},
{
"epoch": 2.6650780711105537,
"eval_loss": 0.8680915236473083,
"eval_runtime": 245.7882,
"eval_samples_per_second": 519.032,
"eval_steps_per_second": 16.221,
"step": 42500
},
{
"epoch": 2.6713488430425785,
"grad_norm": 0.030076002702116966,
"learning_rate": 7.389085335051997e-06,
"loss": 0.92,
"step": 42600
},
{
"epoch": 2.677619614974603,
"grad_norm": 391.046875,
"learning_rate": 7.354247591841001e-06,
"loss": 0.9118,
"step": 42700
},
{
"epoch": 2.6838903869066284,
"grad_norm": 0.29524192214012146,
"learning_rate": 7.319409848630006e-06,
"loss": 0.7446,
"step": 42800
},
{
"epoch": 2.690161158838653,
"grad_norm": 0.06050710007548332,
"learning_rate": 7.284572105419011e-06,
"loss": 0.6835,
"step": 42900
},
{
"epoch": 2.696431930770678,
"grad_norm": 0.3519326150417328,
"learning_rate": 7.249734362208016e-06,
"loss": 0.6157,
"step": 43000
},
{
"epoch": 2.696431930770678,
"eval_loss": 0.8691079020500183,
"eval_runtime": 245.1929,
"eval_samples_per_second": 520.292,
"eval_steps_per_second": 16.261,
"step": 43000
},
{
"epoch": 2.7027027027027026,
"grad_norm": 3.0073323249816895,
"learning_rate": 7.214896618997022e-06,
"loss": 0.5423,
"step": 43100
},
{
"epoch": 2.7089734746347274,
"grad_norm": 47.103782653808594,
"learning_rate": 7.180058875786027e-06,
"loss": 0.8098,
"step": 43200
},
{
"epoch": 2.715244246566752,
"grad_norm": 1.3290644884109497,
"learning_rate": 7.145221132575032e-06,
"loss": 0.8908,
"step": 43300
},
{
"epoch": 2.7215150184987773,
"grad_norm": 51.733924865722656,
"learning_rate": 7.110383389364037e-06,
"loss": 1.1275,
"step": 43400
},
{
"epoch": 2.727785790430802,
"grad_norm": 7.54064416885376,
"learning_rate": 7.075545646153043e-06,
"loss": 1.0345,
"step": 43500
},
{
"epoch": 2.727785790430802,
"eval_loss": 0.8884279131889343,
"eval_runtime": 250.7463,
"eval_samples_per_second": 508.769,
"eval_steps_per_second": 15.901,
"step": 43500
},
{
"epoch": 2.734056562362827,
"grad_norm": 0.2361198216676712,
"learning_rate": 7.0407079029420475e-06,
"loss": 0.6198,
"step": 43600
},
{
"epoch": 2.7403273342948515,
"grad_norm": 0.045945364981889725,
"learning_rate": 7.005870159731053e-06,
"loss": 0.8315,
"step": 43700
},
{
"epoch": 2.7465981062268767,
"grad_norm": 1.2798868417739868,
"learning_rate": 6.9710324165200584e-06,
"loss": 0.9317,
"step": 43800
},
{
"epoch": 2.7528688781589015,
"grad_norm": 0.2944384217262268,
"learning_rate": 6.936194673309063e-06,
"loss": 0.516,
"step": 43900
},
{
"epoch": 2.759139650090926,
"grad_norm": 0.38825371861457825,
"learning_rate": 6.9013569300980686e-06,
"loss": 0.8229,
"step": 44000
},
{
"epoch": 2.759139650090926,
"eval_loss": 0.8659059405326843,
"eval_runtime": 250.2562,
"eval_samples_per_second": 509.766,
"eval_steps_per_second": 15.932,
"step": 44000
},
{
"epoch": 2.765410422022951,
"grad_norm": 121.3291015625,
"learning_rate": 6.866519186887074e-06,
"loss": 0.7989,
"step": 44100
},
{
"epoch": 2.7716811939549757,
"grad_norm": 0.05258101224899292,
"learning_rate": 6.8316814436760795e-06,
"loss": 0.9291,
"step": 44200
},
{
"epoch": 2.7779519658870004,
"grad_norm": 13.635845184326172,
"learning_rate": 6.796843700465084e-06,
"loss": 0.5954,
"step": 44300
},
{
"epoch": 2.7842227378190256,
"grad_norm": 0.01324045192450285,
"learning_rate": 6.76200595725409e-06,
"loss": 0.8537,
"step": 44400
},
{
"epoch": 2.7904935097510504,
"grad_norm": 0.1794157326221466,
"learning_rate": 6.727168214043095e-06,
"loss": 0.9506,
"step": 44500
},
{
"epoch": 2.7904935097510504,
"eval_loss": 0.8657113909721375,
"eval_runtime": 251.0944,
"eval_samples_per_second": 508.064,
"eval_steps_per_second": 15.878,
"step": 44500
},
{
"epoch": 2.796764281683075,
"grad_norm": 1.5337361097335815,
"learning_rate": 6.6923304708321e-06,
"loss": 0.5789,
"step": 44600
},
{
"epoch": 2.8030350536151,
"grad_norm": 67.04114532470703,
"learning_rate": 6.657492727621105e-06,
"loss": 0.4861,
"step": 44700
},
{
"epoch": 2.809305825547125,
"grad_norm": 0.7064642310142517,
"learning_rate": 6.622654984410111e-06,
"loss": 0.9614,
"step": 44800
},
{
"epoch": 2.81557659747915,
"grad_norm": 182.1068572998047,
"learning_rate": 6.587817241199116e-06,
"loss": 1.0069,
"step": 44900
},
{
"epoch": 2.8218473694111745,
"grad_norm": 11.14926528930664,
"learning_rate": 6.552979497988121e-06,
"loss": 0.5599,
"step": 45000
},
{
"epoch": 2.8218473694111745,
"eval_loss": 0.8618975281715393,
"eval_runtime": 253.2257,
"eval_samples_per_second": 503.788,
"eval_steps_per_second": 15.745,
"step": 45000
},
{
"epoch": 2.8281181413431993,
"grad_norm": 3.852113723754883,
"learning_rate": 6.5181417547771264e-06,
"loss": 1.3747,
"step": 45100
},
{
"epoch": 2.834388913275224,
"grad_norm": 0.024370471015572548,
"learning_rate": 6.483304011566132e-06,
"loss": 0.5638,
"step": 45200
},
{
"epoch": 2.8406596852072488,
"grad_norm": 30.42238998413086,
"learning_rate": 6.4484662683551366e-06,
"loss": 1.2095,
"step": 45300
},
{
"epoch": 2.846930457139274,
"grad_norm": 54.890380859375,
"learning_rate": 6.413628525144142e-06,
"loss": 0.7364,
"step": 45400
},
{
"epoch": 2.8532012290712987,
"grad_norm": 0.05865807831287384,
"learning_rate": 6.3787907819331475e-06,
"loss": 0.5692,
"step": 45500
},
{
"epoch": 2.8532012290712987,
"eval_loss": 0.8817957043647766,
"eval_runtime": 250.5213,
"eval_samples_per_second": 509.226,
"eval_steps_per_second": 15.915,
"step": 45500
},
{
"epoch": 2.8594720010033234,
"grad_norm": 0.23342262208461761,
"learning_rate": 6.343953038722153e-06,
"loss": 0.8848,
"step": 45600
},
{
"epoch": 2.865742772935348,
"grad_norm": 0.24238887429237366,
"learning_rate": 6.309115295511157e-06,
"loss": 0.9063,
"step": 45700
},
{
"epoch": 2.8720135448673734,
"grad_norm": 303.49761962890625,
"learning_rate": 6.274277552300162e-06,
"loss": 0.8675,
"step": 45800
},
{
"epoch": 2.878284316799398,
"grad_norm": 27.475610733032227,
"learning_rate": 6.239439809089167e-06,
"loss": 0.9703,
"step": 45900
},
{
"epoch": 2.884555088731423,
"grad_norm": 0.12018956989049911,
"learning_rate": 6.2046020658781725e-06,
"loss": 0.6657,
"step": 46000
},
{
"epoch": 2.884555088731423,
"eval_loss": 0.842439591884613,
"eval_runtime": 250.5638,
"eval_samples_per_second": 509.14,
"eval_steps_per_second": 15.912,
"step": 46000
},
{
"epoch": 2.8908258606634476,
"grad_norm": 36.39583969116211,
"learning_rate": 6.169764322667178e-06,
"loss": 0.6564,
"step": 46100
},
{
"epoch": 2.8970966325954723,
"grad_norm": 6.755324840545654,
"learning_rate": 6.135274956888293e-06,
"loss": 0.7945,
"step": 46200
},
{
"epoch": 2.903367404527497,
"grad_norm": 0.24825870990753174,
"learning_rate": 6.100437213677298e-06,
"loss": 0.6341,
"step": 46300
},
{
"epoch": 2.9096381764595223,
"grad_norm": 0.06013401225209236,
"learning_rate": 6.0655994704663035e-06,
"loss": 1.042,
"step": 46400
},
{
"epoch": 2.915908948391547,
"grad_norm": 14.515037536621094,
"learning_rate": 6.030761727255309e-06,
"loss": 1.0812,
"step": 46500
},
{
"epoch": 2.915908948391547,
"eval_loss": 0.8509716987609863,
"eval_runtime": 247.7079,
"eval_samples_per_second": 515.01,
"eval_steps_per_second": 16.096,
"step": 46500
},
{
"epoch": 2.9221797203235718,
"grad_norm": 0.9338593482971191,
"learning_rate": 5.995923984044314e-06,
"loss": 0.9787,
"step": 46600
},
{
"epoch": 2.9284504922555965,
"grad_norm": 241.10589599609375,
"learning_rate": 5.961086240833319e-06,
"loss": 0.8732,
"step": 46700
},
{
"epoch": 2.9347212641876217,
"grad_norm": 119.96747589111328,
"learning_rate": 5.926248497622325e-06,
"loss": 1.1872,
"step": 46800
},
{
"epoch": 2.9409920361196464,
"grad_norm": 28.35833740234375,
"learning_rate": 5.89141075441133e-06,
"loss": 0.989,
"step": 46900
},
{
"epoch": 2.947262808051671,
"grad_norm": 0.007068769074976444,
"learning_rate": 5.856573011200335e-06,
"loss": 0.874,
"step": 47000
},
{
"epoch": 2.947262808051671,
"eval_loss": 0.8214829564094543,
"eval_runtime": 243.5295,
"eval_samples_per_second": 523.846,
"eval_steps_per_second": 16.372,
"step": 47000
},
{
"epoch": 2.953533579983696,
"grad_norm": 0.0442727729678154,
"learning_rate": 5.82173526798934e-06,
"loss": 1.0229,
"step": 47100
},
{
"epoch": 2.9598043519157207,
"grad_norm": 0.031402587890625,
"learning_rate": 5.786897524778346e-06,
"loss": 0.9888,
"step": 47200
},
{
"epoch": 2.9660751238477454,
"grad_norm": 2.0282115936279297,
"learning_rate": 5.75205978156735e-06,
"loss": 0.4883,
"step": 47300
},
{
"epoch": 2.9723458957797706,
"grad_norm": 7.441370487213135,
"learning_rate": 5.717222038356356e-06,
"loss": 0.7474,
"step": 47400
},
{
"epoch": 2.9786166677117953,
"grad_norm": 20.524629592895508,
"learning_rate": 5.682384295145361e-06,
"loss": 0.7615,
"step": 47500
},
{
"epoch": 2.9786166677117953,
"eval_loss": 0.8217635750770569,
"eval_runtime": 249.4571,
"eval_samples_per_second": 511.399,
"eval_steps_per_second": 15.983,
"step": 47500
},
{
"epoch": 2.98488743964382,
"grad_norm": 0.4798177182674408,
"learning_rate": 5.647546551934367e-06,
"loss": 0.6208,
"step": 47600
},
{
"epoch": 2.991158211575845,
"grad_norm": 112.3564224243164,
"learning_rate": 5.6127088087233715e-06,
"loss": 0.8332,
"step": 47700
},
{
"epoch": 2.99742898350787,
"grad_norm": 52.40660095214844,
"learning_rate": 5.577871065512377e-06,
"loss": 0.6734,
"step": 47800
},
{
"epoch": 3.0036997554398948,
"grad_norm": 0.9568219184875488,
"learning_rate": 5.5430333223013825e-06,
"loss": 0.5095,
"step": 47900
},
{
"epoch": 3.0099705273719195,
"grad_norm": 0.40387988090515137,
"learning_rate": 5.508195579090387e-06,
"loss": 0.7709,
"step": 48000
},
{
"epoch": 3.0099705273719195,
"eval_loss": 0.8220009803771973,
"eval_runtime": 248.6927,
"eval_samples_per_second": 512.97,
"eval_steps_per_second": 16.032,
"step": 48000
},
{
"epoch": 3.0162412993039442,
"grad_norm": 192.66201782226562,
"learning_rate": 5.473357835879393e-06,
"loss": 0.5449,
"step": 48100
},
{
"epoch": 3.022512071235969,
"grad_norm": 0.026696085929870605,
"learning_rate": 5.438520092668398e-06,
"loss": 0.772,
"step": 48200
},
{
"epoch": 3.028782843167994,
"grad_norm": 7.1632232666015625,
"learning_rate": 5.403682349457403e-06,
"loss": 0.8582,
"step": 48300
},
{
"epoch": 3.035053615100019,
"grad_norm": 4.1231584548950195,
"learning_rate": 5.369192983678517e-06,
"loss": 0.5742,
"step": 48400
},
{
"epoch": 3.0413243870320437,
"grad_norm": 0.08916144073009491,
"learning_rate": 5.334355240467523e-06,
"loss": 0.5584,
"step": 48500
},
{
"epoch": 3.0413243870320437,
"eval_loss": 0.8492663502693176,
"eval_runtime": 247.0758,
"eval_samples_per_second": 516.327,
"eval_steps_per_second": 16.137,
"step": 48500
},
{
"epoch": 3.0475951589640684,
"grad_norm": 15.887138366699219,
"learning_rate": 5.299517497256527e-06,
"loss": 0.9766,
"step": 48600
},
{
"epoch": 3.053865930896093,
"grad_norm": 3.1666200160980225,
"learning_rate": 5.264679754045533e-06,
"loss": 0.6473,
"step": 48700
},
{
"epoch": 3.0601367028281183,
"grad_norm": 4.730705261230469,
"learning_rate": 5.229842010834538e-06,
"loss": 0.5861,
"step": 48800
},
{
"epoch": 3.066407474760143,
"grad_norm": 0.19111567735671997,
"learning_rate": 5.195004267623544e-06,
"loss": 0.6377,
"step": 48900
},
{
"epoch": 3.072678246692168,
"grad_norm": 0.17477057874202728,
"learning_rate": 5.1601665244125485e-06,
"loss": 0.8393,
"step": 49000
},
{
"epoch": 3.072678246692168,
"eval_loss": 0.8429604768753052,
"eval_runtime": 246.6172,
"eval_samples_per_second": 517.288,
"eval_steps_per_second": 16.167,
"step": 49000
},
{
"epoch": 3.0789490186241926,
"grad_norm": 0.47240251302719116,
"learning_rate": 5.125328781201554e-06,
"loss": 0.8385,
"step": 49100
},
{
"epoch": 3.0852197905562173,
"grad_norm": 1.6392873525619507,
"learning_rate": 5.0904910379905595e-06,
"loss": 0.5523,
"step": 49200
},
{
"epoch": 3.0914905624882425,
"grad_norm": 0.08180980384349823,
"learning_rate": 5.055653294779564e-06,
"loss": 0.6217,
"step": 49300
},
{
"epoch": 3.0977613344202672,
"grad_norm": 10.683464050292969,
"learning_rate": 5.02081555156857e-06,
"loss": 0.5515,
"step": 49400
},
{
"epoch": 3.104032106352292,
"grad_norm": 154.55838012695312,
"learning_rate": 4.985977808357575e-06,
"loss": 0.851,
"step": 49500
},
{
"epoch": 3.104032106352292,
"eval_loss": 0.8000255227088928,
"eval_runtime": 245.3365,
"eval_samples_per_second": 519.988,
"eval_steps_per_second": 16.251,
"step": 49500
},
{
"epoch": 3.1103028782843167,
"grad_norm": 0.106838159263134,
"learning_rate": 4.95114006514658e-06,
"loss": 0.9247,
"step": 49600
},
{
"epoch": 3.1165736502163415,
"grad_norm": 0.03634607046842575,
"learning_rate": 4.916302321935585e-06,
"loss": 0.655,
"step": 49700
},
{
"epoch": 3.1228444221483667,
"grad_norm": 118.1080322265625,
"learning_rate": 4.881464578724591e-06,
"loss": 0.4979,
"step": 49800
},
{
"epoch": 3.1291151940803914,
"grad_norm": 0.2726267874240875,
"learning_rate": 4.846626835513596e-06,
"loss": 0.7521,
"step": 49900
},
{
"epoch": 3.135385966012416,
"grad_norm": 0.031166499480605125,
"learning_rate": 4.811789092302601e-06,
"loss": 0.53,
"step": 50000
},
{
"epoch": 3.135385966012416,
"eval_loss": 0.8105431795120239,
"eval_runtime": 248.1106,
"eval_samples_per_second": 514.174,
"eval_steps_per_second": 16.069,
"step": 50000
},
{
"epoch": 3.141656737944441,
"grad_norm": 88.85710144042969,
"learning_rate": 4.776951349091606e-06,
"loss": 0.5943,
"step": 50100
},
{
"epoch": 3.1479275098764656,
"grad_norm": 11.926735877990723,
"learning_rate": 4.742113605880612e-06,
"loss": 0.4659,
"step": 50200
},
{
"epoch": 3.154198281808491,
"grad_norm": 17.817556381225586,
"learning_rate": 4.7072758626696165e-06,
"loss": 0.4843,
"step": 50300
},
{
"epoch": 3.1604690537405156,
"grad_norm": 95.25701904296875,
"learning_rate": 4.672438119458621e-06,
"loss": 0.7577,
"step": 50400
},
{
"epoch": 3.1667398256725403,
"grad_norm": 0.007618566509336233,
"learning_rate": 4.637600376247627e-06,
"loss": 0.3448,
"step": 50500
},
{
"epoch": 3.1667398256725403,
"eval_loss": 0.8055439591407776,
"eval_runtime": 246.7777,
"eval_samples_per_second": 516.951,
"eval_steps_per_second": 16.156,
"step": 50500
},
{
"epoch": 3.173010597604565,
"grad_norm": 45.008056640625,
"learning_rate": 4.602762633036632e-06,
"loss": 0.8392,
"step": 50600
},
{
"epoch": 3.17928136953659,
"grad_norm": 0.11749571561813354,
"learning_rate": 4.567924889825638e-06,
"loss": 0.75,
"step": 50700
},
{
"epoch": 3.185552141468615,
"grad_norm": 0.04399213567376137,
"learning_rate": 4.533087146614642e-06,
"loss": 0.5195,
"step": 50800
},
{
"epoch": 3.1918229134006397,
"grad_norm": 0.1250951737165451,
"learning_rate": 4.498249403403648e-06,
"loss": 0.617,
"step": 50900
},
{
"epoch": 3.1980936853326645,
"grad_norm": 213.82589721679688,
"learning_rate": 4.463411660192653e-06,
"loss": 0.6892,
"step": 51000
},
{
"epoch": 3.1980936853326645,
"eval_loss": 0.8293086290359497,
"eval_runtime": 244.3828,
"eval_samples_per_second": 522.017,
"eval_steps_per_second": 16.315,
"step": 51000
},
{
"epoch": 3.204364457264689,
"grad_norm": 12.81237506866455,
"learning_rate": 4.428573916981658e-06,
"loss": 0.497,
"step": 51100
},
{
"epoch": 3.210635229196714,
"grad_norm": 0.06836537271738052,
"learning_rate": 4.393736173770663e-06,
"loss": 0.6793,
"step": 51200
},
{
"epoch": 3.216906001128739,
"grad_norm": 0.30741751194000244,
"learning_rate": 4.358898430559669e-06,
"loss": 0.7251,
"step": 51300
},
{
"epoch": 3.223176773060764,
"grad_norm": 299.8288269042969,
"learning_rate": 4.324060687348674e-06,
"loss": 0.6471,
"step": 51400
},
{
"epoch": 3.2294475449927886,
"grad_norm": 39.92329406738281,
"learning_rate": 4.289222944137679e-06,
"loss": 0.775,
"step": 51500
},
{
"epoch": 3.2294475449927886,
"eval_loss": 0.8012564778327942,
"eval_runtime": 245.7294,
"eval_samples_per_second": 519.157,
"eval_steps_per_second": 16.225,
"step": 51500
},
{
"epoch": 3.2357183169248134,
"grad_norm": 7.386813640594482,
"learning_rate": 4.2543852009266845e-06,
"loss": 0.7289,
"step": 51600
},
{
"epoch": 3.241989088856838,
"grad_norm": 0.8339570760726929,
"learning_rate": 4.21954745771569e-06,
"loss": 0.6894,
"step": 51700
},
{
"epoch": 3.2482598607888633,
"grad_norm": 103.696533203125,
"learning_rate": 4.184709714504695e-06,
"loss": 0.5677,
"step": 51800
},
{
"epoch": 3.254530632720888,
"grad_norm": 157.94912719726562,
"learning_rate": 4.149871971293699e-06,
"loss": 0.317,
"step": 51900
},
{
"epoch": 3.260801404652913,
"grad_norm": 0.6201029419898987,
"learning_rate": 4.115034228082705e-06,
"loss": 0.5376,
"step": 52000
},
{
"epoch": 3.260801404652913,
"eval_loss": 0.785252034664154,
"eval_runtime": 247.2797,
"eval_samples_per_second": 515.902,
"eval_steps_per_second": 16.123,
"step": 52000
},
{
"epoch": 3.2670721765849375,
"grad_norm": 5.9356913566589355,
"learning_rate": 4.08019648487171e-06,
"loss": 0.4582,
"step": 52100
},
{
"epoch": 3.2733429485169623,
"grad_norm": 99.29075622558594,
"learning_rate": 4.045358741660716e-06,
"loss": 0.8505,
"step": 52200
},
{
"epoch": 3.279613720448987,
"grad_norm": 7.142418384552002,
"learning_rate": 4.01052099844972e-06,
"loss": 0.6236,
"step": 52300
},
{
"epoch": 3.285884492381012,
"grad_norm": 0.18595051765441895,
"learning_rate": 3.975683255238726e-06,
"loss": 0.7388,
"step": 52400
},
{
"epoch": 3.292155264313037,
"grad_norm": 0.26398783922195435,
"learning_rate": 3.940845512027731e-06,
"loss": 0.7061,
"step": 52500
},
{
"epoch": 3.292155264313037,
"eval_loss": 0.786342203617096,
"eval_runtime": 246.6236,
"eval_samples_per_second": 517.274,
"eval_steps_per_second": 16.166,
"step": 52500
},
{
"epoch": 3.2984260362450617,
"grad_norm": 0.15353605151176453,
"learning_rate": 3.906007768816736e-06,
"loss": 0.5411,
"step": 52600
},
{
"epoch": 3.3046968081770864,
"grad_norm": 0.6905626654624939,
"learning_rate": 3.8711700256057415e-06,
"loss": 0.9511,
"step": 52700
},
{
"epoch": 3.3109675801091116,
"grad_norm": 120.66680145263672,
"learning_rate": 3.836332282394747e-06,
"loss": 0.5364,
"step": 52800
},
{
"epoch": 3.3172383520411364,
"grad_norm": 22.492393493652344,
"learning_rate": 3.801494539183752e-06,
"loss": 0.5795,
"step": 52900
},
{
"epoch": 3.323509123973161,
"grad_norm": 11.335774421691895,
"learning_rate": 3.766656795972757e-06,
"loss": 0.5305,
"step": 53000
},
{
"epoch": 3.323509123973161,
"eval_loss": 0.787602961063385,
"eval_runtime": 250.8532,
"eval_samples_per_second": 508.552,
"eval_steps_per_second": 15.894,
"step": 53000
},
{
"epoch": 3.329779895905186,
"grad_norm": 0.07559686154127121,
"learning_rate": 3.7318190527617626e-06,
"loss": 0.8051,
"step": 53100
},
{
"epoch": 3.3360506678372106,
"grad_norm": 0.06827156990766525,
"learning_rate": 3.6969813095507677e-06,
"loss": 0.5342,
"step": 53200
},
{
"epoch": 3.3423214397692353,
"grad_norm": 1.358184576034546,
"learning_rate": 3.662143566339773e-06,
"loss": 0.4567,
"step": 53300
},
{
"epoch": 3.3485922117012605,
"grad_norm": 58.48233413696289,
"learning_rate": 3.627305823128778e-06,
"loss": 0.9751,
"step": 53400
},
{
"epoch": 3.3548629836332853,
"grad_norm": 0.13244691491127014,
"learning_rate": 3.592468079917783e-06,
"loss": 0.4413,
"step": 53500
},
{
"epoch": 3.3548629836332853,
"eval_loss": 0.8008161783218384,
"eval_runtime": 248.8641,
"eval_samples_per_second": 512.617,
"eval_steps_per_second": 16.021,
"step": 53500
},
{
"epoch": 3.36113375556531,
"grad_norm": 5.010788917541504,
"learning_rate": 3.5576303367067884e-06,
"loss": 0.6011,
"step": 53600
},
{
"epoch": 3.3674045274973348,
"grad_norm": 0.032868873327970505,
"learning_rate": 3.5227925934957935e-06,
"loss": 0.4708,
"step": 53700
},
{
"epoch": 3.37367529942936,
"grad_norm": 2.3022570610046387,
"learning_rate": 3.488303227716909e-06,
"loss": 0.6167,
"step": 53800
},
{
"epoch": 3.3799460713613847,
"grad_norm": 0.7494950890541077,
"learning_rate": 3.453465484505914e-06,
"loss": 0.7653,
"step": 53900
},
{
"epoch": 3.3862168432934094,
"grad_norm": 1.9640907049179077,
"learning_rate": 3.4186277412949194e-06,
"loss": 0.7781,
"step": 54000
},
{
"epoch": 3.3862168432934094,
"eval_loss": 0.7897498607635498,
"eval_runtime": 250.7968,
"eval_samples_per_second": 508.667,
"eval_steps_per_second": 15.897,
"step": 54000
},
{
"epoch": 3.392487615225434,
"grad_norm": 95.03298950195312,
"learning_rate": 3.3837899980839245e-06,
"loss": 0.9323,
"step": 54100
},
{
"epoch": 3.398758387157459,
"grad_norm": 1.3489042520523071,
"learning_rate": 3.3489522548729296e-06,
"loss": 0.6003,
"step": 54200
},
{
"epoch": 3.4050291590894837,
"grad_norm": 1.4920170307159424,
"learning_rate": 3.314114511661935e-06,
"loss": 0.5268,
"step": 54300
},
{
"epoch": 3.411299931021509,
"grad_norm": 71.25545501708984,
"learning_rate": 3.27927676845094e-06,
"loss": 0.6639,
"step": 54400
},
{
"epoch": 3.4175707029535336,
"grad_norm": 1.5343536138534546,
"learning_rate": 3.2444390252399456e-06,
"loss": 0.388,
"step": 54500
},
{
"epoch": 3.4175707029535336,
"eval_loss": 0.7854874730110168,
"eval_runtime": 247.7117,
"eval_samples_per_second": 515.002,
"eval_steps_per_second": 16.095,
"step": 54500
},
{
"epoch": 3.4238414748855583,
"grad_norm": 0.22106263041496277,
"learning_rate": 3.2096012820289502e-06,
"loss": 0.7258,
"step": 54600
},
{
"epoch": 3.430112246817583,
"grad_norm": 0.10803945362567902,
"learning_rate": 3.1747635388179553e-06,
"loss": 0.6475,
"step": 54700
},
{
"epoch": 3.4363830187496083,
"grad_norm": 89.42733764648438,
"learning_rate": 3.139925795606961e-06,
"loss": 0.795,
"step": 54800
},
{
"epoch": 3.442653790681633,
"grad_norm": 0.15668845176696777,
"learning_rate": 3.105088052395966e-06,
"loss": 0.4978,
"step": 54900
},
{
"epoch": 3.4489245626136578,
"grad_norm": 60.56550216674805,
"learning_rate": 3.070250309184971e-06,
"loss": 0.6259,
"step": 55000
},
{
"epoch": 3.4489245626136578,
"eval_loss": 0.7704712748527527,
"eval_runtime": 250.1048,
"eval_samples_per_second": 510.074,
"eval_steps_per_second": 15.941,
"step": 55000
},
{
"epoch": 3.4551953345456825,
"grad_norm": 0.28135305643081665,
"learning_rate": 3.0354125659739764e-06,
"loss": 0.791,
"step": 55100
},
{
"epoch": 3.4614661064777072,
"grad_norm": 120.33629608154297,
"learning_rate": 3.0005748227629815e-06,
"loss": 0.7602,
"step": 55200
},
{
"epoch": 3.467736878409732,
"grad_norm": 0.6213288903236389,
"learning_rate": 2.965737079551987e-06,
"loss": 0.2236,
"step": 55300
},
{
"epoch": 3.474007650341757,
"grad_norm": 0.051405176520347595,
"learning_rate": 2.930899336340992e-06,
"loss": 0.5577,
"step": 55400
},
{
"epoch": 3.480278422273782,
"grad_norm": 6.140790939331055,
"learning_rate": 2.8960615931299975e-06,
"loss": 0.4214,
"step": 55500
},
{
"epoch": 3.480278422273782,
"eval_loss": 0.768252432346344,
"eval_runtime": 248.6626,
"eval_samples_per_second": 513.032,
"eval_steps_per_second": 16.034,
"step": 55500
},
{
"epoch": 3.4865491942058067,
"grad_norm": 0.051673661917448044,
"learning_rate": 2.8612238499190026e-06,
"loss": 0.7335,
"step": 55600
},
{
"epoch": 3.4928199661378314,
"grad_norm": 5.123118877410889,
"learning_rate": 2.8263861067080077e-06,
"loss": 0.7536,
"step": 55700
},
{
"epoch": 3.4990907380698566,
"grad_norm": 0.7104228734970093,
"learning_rate": 2.791548363497013e-06,
"loss": 0.4577,
"step": 55800
},
{
"epoch": 3.5053615100018813,
"grad_norm": 49.410400390625,
"learning_rate": 2.7567106202860182e-06,
"loss": 0.5869,
"step": 55900
},
{
"epoch": 3.511632281933906,
"grad_norm": 0.0593554824590683,
"learning_rate": 2.7218728770750237e-06,
"loss": 0.8563,
"step": 56000
},
{
"epoch": 3.511632281933906,
"eval_loss": 0.7587498426437378,
"eval_runtime": 247.0433,
"eval_samples_per_second": 516.395,
"eval_steps_per_second": 16.139,
"step": 56000
},
{
"epoch": 3.517903053865931,
"grad_norm": 8.727328300476074,
"learning_rate": 2.6870351338640284e-06,
"loss": 0.9291,
"step": 56100
},
{
"epoch": 3.5241738257979556,
"grad_norm": 0.023664651438593864,
"learning_rate": 2.6521973906530334e-06,
"loss": 0.4387,
"step": 56200
},
{
"epoch": 3.5304445977299803,
"grad_norm": 2.834498405456543,
"learning_rate": 2.617359647442039e-06,
"loss": 0.4491,
"step": 56300
},
{
"epoch": 3.5367153696620055,
"grad_norm": 1.9824761152267456,
"learning_rate": 2.582870281663154e-06,
"loss": 0.506,
"step": 56400
},
{
"epoch": 3.5429861415940302,
"grad_norm": 0.7142437100410461,
"learning_rate": 2.5480325384521594e-06,
"loss": 0.6626,
"step": 56500
},
{
"epoch": 3.5429861415940302,
"eval_loss": 0.7634491920471191,
"eval_runtime": 247.6797,
"eval_samples_per_second": 515.068,
"eval_steps_per_second": 16.097,
"step": 56500
},
{
"epoch": 3.549256913526055,
"grad_norm": 0.030130065977573395,
"learning_rate": 2.5131947952411645e-06,
"loss": 0.8654,
"step": 56600
},
{
"epoch": 3.5555276854580797,
"grad_norm": 0.757265031337738,
"learning_rate": 2.4783570520301695e-06,
"loss": 0.4455,
"step": 56700
},
{
"epoch": 3.561798457390105,
"grad_norm": 130.99807739257812,
"learning_rate": 2.443519308819175e-06,
"loss": 0.4593,
"step": 56800
},
{
"epoch": 3.5680692293221297,
"grad_norm": 86.36803436279297,
"learning_rate": 2.40868156560818e-06,
"loss": 0.878,
"step": 56900
},
{
"epoch": 3.5743400012541544,
"grad_norm": 0.8545703887939453,
"learning_rate": 2.373843822397185e-06,
"loss": 0.3737,
"step": 57000
},
{
"epoch": 3.5743400012541544,
"eval_loss": 0.7617383599281311,
"eval_runtime": 246.3862,
"eval_samples_per_second": 517.773,
"eval_steps_per_second": 16.182,
"step": 57000
},
{
"epoch": 3.580610773186179,
"grad_norm": 100.52796173095703,
"learning_rate": 2.3390060791861902e-06,
"loss": 0.377,
"step": 57100
},
{
"epoch": 3.586881545118204,
"grad_norm": 31.44060516357422,
"learning_rate": 2.3041683359751957e-06,
"loss": 0.6894,
"step": 57200
},
{
"epoch": 3.5931523170502286,
"grad_norm": 0.2915436625480652,
"learning_rate": 2.2693305927642008e-06,
"loss": 0.6635,
"step": 57300
},
{
"epoch": 3.599423088982254,
"grad_norm": 0.009617321193218231,
"learning_rate": 2.2344928495532063e-06,
"loss": 0.9224,
"step": 57400
},
{
"epoch": 3.6056938609142786,
"grad_norm": 0.21305809915065765,
"learning_rate": 2.1996551063422113e-06,
"loss": 0.635,
"step": 57500
},
{
"epoch": 3.6056938609142786,
"eval_loss": 0.7668555974960327,
"eval_runtime": 248.1105,
"eval_samples_per_second": 514.174,
"eval_steps_per_second": 16.069,
"step": 57500
},
{
"epoch": 3.6119646328463033,
"grad_norm": 11.654231071472168,
"learning_rate": 2.164817363131217e-06,
"loss": 0.6797,
"step": 57600
},
{
"epoch": 3.618235404778328,
"grad_norm": 1.0893511772155762,
"learning_rate": 2.1299796199202215e-06,
"loss": 0.9814,
"step": 57700
},
{
"epoch": 3.6245061767103532,
"grad_norm": 0.3305797278881073,
"learning_rate": 2.095141876709227e-06,
"loss": 0.9893,
"step": 57800
},
{
"epoch": 3.630776948642378,
"grad_norm": 0.10635466873645782,
"learning_rate": 2.060304133498232e-06,
"loss": 0.6753,
"step": 57900
},
{
"epoch": 3.6370477205744027,
"grad_norm": 0.09898664057254791,
"learning_rate": 2.0254663902872375e-06,
"loss": 0.8349,
"step": 58000
},
{
"epoch": 3.6370477205744027,
"eval_loss": 0.7500940561294556,
"eval_runtime": 252.1601,
"eval_samples_per_second": 505.917,
"eval_steps_per_second": 15.811,
"step": 58000
},
{
"epoch": 3.6433184925064275,
"grad_norm": 0.05218241736292839,
"learning_rate": 1.9906286470762426e-06,
"loss": 0.8523,
"step": 58100
},
{
"epoch": 3.649589264438452,
"grad_norm": 0.648098886013031,
"learning_rate": 1.9557909038652477e-06,
"loss": 0.2962,
"step": 58200
},
{
"epoch": 3.655860036370477,
"grad_norm": 19.993263244628906,
"learning_rate": 1.920953160654253e-06,
"loss": 0.6585,
"step": 58300
},
{
"epoch": 3.662130808302502,
"grad_norm": 549.2650146484375,
"learning_rate": 1.886115417443258e-06,
"loss": 1.0247,
"step": 58400
},
{
"epoch": 3.668401580234527,
"grad_norm": 53.44794464111328,
"learning_rate": 1.8516260516643734e-06,
"loss": 0.8638,
"step": 58500
},
{
"epoch": 3.668401580234527,
"eval_loss": 0.7576786279678345,
"eval_runtime": 251.8307,
"eval_samples_per_second": 506.578,
"eval_steps_per_second": 15.832,
"step": 58500
},
{
"epoch": 3.6746723521665516,
"grad_norm": 0.11638414114713669,
"learning_rate": 1.8167883084533785e-06,
"loss": 0.9456,
"step": 58600
},
{
"epoch": 3.6809431240985764,
"grad_norm": 0.16805872321128845,
"learning_rate": 1.7819505652423837e-06,
"loss": 0.5401,
"step": 58700
},
{
"epoch": 3.6872138960306016,
"grad_norm": 161.84934997558594,
"learning_rate": 1.747112822031389e-06,
"loss": 0.6602,
"step": 58800
},
{
"epoch": 3.6934846679626263,
"grad_norm": 0.16537758708000183,
"learning_rate": 1.7122750788203943e-06,
"loss": 0.7543,
"step": 58900
},
{
"epoch": 3.699755439894651,
"grad_norm": 82.06924438476562,
"learning_rate": 1.6774373356093992e-06,
"loss": 0.7893,
"step": 59000
},
{
"epoch": 3.699755439894651,
"eval_loss": 0.7599766850471497,
"eval_runtime": 255.6784,
"eval_samples_per_second": 498.955,
"eval_steps_per_second": 15.594,
"step": 59000
},
{
"epoch": 3.706026211826676,
"grad_norm": 123.94532012939453,
"learning_rate": 1.6425995923984044e-06,
"loss": 0.7746,
"step": 59100
},
{
"epoch": 3.7122969837587005,
"grad_norm": 0.06561436504125595,
"learning_rate": 1.6077618491874097e-06,
"loss": 0.6539,
"step": 59200
},
{
"epoch": 3.7185677556907253,
"grad_norm": 243.56668090820312,
"learning_rate": 1.572924105976415e-06,
"loss": 0.8083,
"step": 59300
},
{
"epoch": 3.7248385276227505,
"grad_norm": 0.13773566484451294,
"learning_rate": 1.5380863627654203e-06,
"loss": 0.3429,
"step": 59400
},
{
"epoch": 3.731109299554775,
"grad_norm": 412.2792053222656,
"learning_rate": 1.5032486195544256e-06,
"loss": 0.5005,
"step": 59500
},
{
"epoch": 3.731109299554775,
"eval_loss": 0.7445316314697266,
"eval_runtime": 251.276,
"eval_samples_per_second": 507.697,
"eval_steps_per_second": 15.867,
"step": 59500
},
{
"epoch": 3.7373800714868,
"grad_norm": 10.323953628540039,
"learning_rate": 1.4684108763434306e-06,
"loss": 0.6238,
"step": 59600
},
{
"epoch": 3.7436508434188247,
"grad_norm": 34.32875061035156,
"learning_rate": 1.4335731331324357e-06,
"loss": 0.4343,
"step": 59700
},
{
"epoch": 3.74992161535085,
"grad_norm": 0.08429472893476486,
"learning_rate": 1.398735389921441e-06,
"loss": 0.8189,
"step": 59800
},
{
"epoch": 3.7561923872828746,
"grad_norm": 68.88423156738281,
"learning_rate": 1.3638976467104463e-06,
"loss": 0.6272,
"step": 59900
},
{
"epoch": 3.7624631592148994,
"grad_norm": 0.1870589703321457,
"learning_rate": 1.3290599034994513e-06,
"loss": 0.2982,
"step": 60000
},
{
"epoch": 3.7624631592148994,
"eval_loss": 0.7597461342811584,
"eval_runtime": 254.4738,
"eval_samples_per_second": 501.317,
"eval_steps_per_second": 15.668,
"step": 60000
},
{
"epoch": 3.768733931146924,
"grad_norm": 0.051242515444755554,
"learning_rate": 1.2942221602884566e-06,
"loss": 0.7028,
"step": 60100
},
{
"epoch": 3.775004703078949,
"grad_norm": 187.53872680664062,
"learning_rate": 1.2593844170774619e-06,
"loss": 0.9447,
"step": 60200
},
{
"epoch": 3.7812754750109736,
"grad_norm": 64.70340728759766,
"learning_rate": 1.224546673866467e-06,
"loss": 0.6175,
"step": 60300
},
{
"epoch": 3.787546246942999,
"grad_norm": 0.8817376494407654,
"learning_rate": 1.1897089306554722e-06,
"loss": 0.5856,
"step": 60400
},
{
"epoch": 3.7938170188750235,
"grad_norm": 88.64114379882812,
"learning_rate": 1.1548711874444775e-06,
"loss": 0.8249,
"step": 60500
},
{
"epoch": 3.7938170188750235,
"eval_loss": 0.750523030757904,
"eval_runtime": 252.8744,
"eval_samples_per_second": 504.488,
"eval_steps_per_second": 15.767,
"step": 60500
},
{
"epoch": 3.8000877908070483,
"grad_norm": 0.041767679154872894,
"learning_rate": 1.1203818216655927e-06,
"loss": 0.6617,
"step": 60600
},
{
"epoch": 3.806358562739073,
"grad_norm": 74.78905487060547,
"learning_rate": 1.085544078454598e-06,
"loss": 0.5767,
"step": 60700
},
{
"epoch": 3.812629334671098,
"grad_norm": 0.11142675578594208,
"learning_rate": 1.050706335243603e-06,
"loss": 1.0094,
"step": 60800
},
{
"epoch": 3.818900106603123,
"grad_norm": 92.60441589355469,
"learning_rate": 1.0158685920326083e-06,
"loss": 0.471,
"step": 60900
},
{
"epoch": 3.8251708785351477,
"grad_norm": 42.58308410644531,
"learning_rate": 9.810308488216134e-07,
"loss": 0.6313,
"step": 61000
},
{
"epoch": 3.8251708785351477,
"eval_loss": 0.7488948702812195,
"eval_runtime": 251.0082,
"eval_samples_per_second": 508.238,
"eval_steps_per_second": 15.884,
"step": 61000
},
{
"epoch": 3.8314416504671724,
"grad_norm": 46.6805305480957,
"learning_rate": 9.461931056106186e-07,
"loss": 0.6545,
"step": 61100
},
{
"epoch": 3.837712422399197,
"grad_norm": 0.06978940218687057,
"learning_rate": 9.113553623996238e-07,
"loss": 0.699,
"step": 61200
},
{
"epoch": 3.843983194331222,
"grad_norm": 0.933862030506134,
"learning_rate": 8.76517619188629e-07,
"loss": 0.6272,
"step": 61300
},
{
"epoch": 3.850253966263247,
"grad_norm": 44.13498306274414,
"learning_rate": 8.416798759776343e-07,
"loss": 0.7375,
"step": 61400
},
{
"epoch": 3.856524738195272,
"grad_norm": 3.0953245162963867,
"learning_rate": 8.068421327666394e-07,
"loss": 0.4213,
"step": 61500
},
{
"epoch": 3.856524738195272,
"eval_loss": 0.7490043640136719,
"eval_runtime": 251.9621,
"eval_samples_per_second": 506.314,
"eval_steps_per_second": 15.824,
"step": 61500
},
{
"epoch": 3.8627955101272966,
"grad_norm": 15.084046363830566,
"learning_rate": 7.720043895556446e-07,
"loss": 0.6631,
"step": 61600
},
{
"epoch": 3.8690662820593213,
"grad_norm": 34.3710823059082,
"learning_rate": 7.371666463446499e-07,
"loss": 0.552,
"step": 61700
},
{
"epoch": 3.8753370539913465,
"grad_norm": 0.2596281170845032,
"learning_rate": 7.023289031336551e-07,
"loss": 0.7041,
"step": 61800
},
{
"epoch": 3.8816078259233713,
"grad_norm": 0.04028361290693283,
"learning_rate": 6.674911599226603e-07,
"loss": 0.8457,
"step": 61900
},
{
"epoch": 3.887878597855396,
"grad_norm": 0.2941274344921112,
"learning_rate": 6.326534167116654e-07,
"loss": 0.8104,
"step": 62000
},
{
"epoch": 3.887878597855396,
"eval_loss": 0.7476946115493774,
"eval_runtime": 276.1611,
"eval_samples_per_second": 461.948,
"eval_steps_per_second": 14.437,
"step": 62000
},
{
"epoch": 3.8941493697874208,
"grad_norm": 51.24428939819336,
"learning_rate": 5.978156735006706e-07,
"loss": 0.4494,
"step": 62100
},
{
"epoch": 3.9004201417194455,
"grad_norm": 89.3067855834961,
"learning_rate": 5.629779302896759e-07,
"loss": 0.6947,
"step": 62200
},
{
"epoch": 3.9066909136514703,
"grad_norm": 0.06883756071329117,
"learning_rate": 5.281401870786811e-07,
"loss": 0.8061,
"step": 62300
},
{
"epoch": 3.9129616855834954,
"grad_norm": 0.8000829219818115,
"learning_rate": 4.933024438676863e-07,
"loss": 0.416,
"step": 62400
},
{
"epoch": 3.91923245751552,
"grad_norm": 119.61589813232422,
"learning_rate": 4.5846470065669146e-07,
"loss": 0.7359,
"step": 62500
},
{
"epoch": 3.91923245751552,
"eval_loss": 0.7468039989471436,
"eval_runtime": 257.4303,
"eval_samples_per_second": 495.559,
"eval_steps_per_second": 15.488,
"step": 62500
},
{
"epoch": 3.925503229447545,
"grad_norm": 0.29899609088897705,
"learning_rate": 4.2362695744569673e-07,
"loss": 0.7408,
"step": 62600
},
{
"epoch": 3.9317740013795697,
"grad_norm": 112.43661499023438,
"learning_rate": 3.887892142347019e-07,
"loss": 0.6255,
"step": 62700
},
{
"epoch": 3.938044773311595,
"grad_norm": 474.4875793457031,
"learning_rate": 3.5395147102370713e-07,
"loss": 0.7865,
"step": 62800
},
{
"epoch": 3.9443155452436196,
"grad_norm": 0.3225669860839844,
"learning_rate": 3.191137278127123e-07,
"loss": 0.4879,
"step": 62900
},
{
"epoch": 3.9505863171756443,
"grad_norm": 7.089817047119141,
"learning_rate": 2.8427598460171753e-07,
"loss": 0.5196,
"step": 63000
},
{
"epoch": 3.9505863171756443,
"eval_loss": 0.7484961748123169,
"eval_runtime": 258.5356,
"eval_samples_per_second": 493.441,
"eval_steps_per_second": 15.421,
"step": 63000
},
{
"epoch": 3.956857089107669,
"grad_norm": 0.09167669713497162,
"learning_rate": 2.494382413907227e-07,
"loss": 0.5683,
"step": 63100
},
{
"epoch": 3.963127861039694,
"grad_norm": 12.482440948486328,
"learning_rate": 2.1460049817972793e-07,
"loss": 0.5141,
"step": 63200
},
{
"epoch": 3.9693986329717186,
"grad_norm": 8.954193115234375,
"learning_rate": 1.8011113240084312e-07,
"loss": 0.6068,
"step": 63300
},
{
"epoch": 3.9756694049037438,
"grad_norm": 219.27337646484375,
"learning_rate": 1.452733891898483e-07,
"loss": 0.5929,
"step": 63400
},
{
"epoch": 3.9819401768357685,
"grad_norm": 1.6949673891067505,
"learning_rate": 1.104356459788535e-07,
"loss": 0.7513,
"step": 63500
},
{
"epoch": 3.9819401768357685,
"eval_loss": 0.7482015490531921,
"eval_runtime": 257.9174,
"eval_samples_per_second": 494.623,
"eval_steps_per_second": 15.458,
"step": 63500
},
{
"epoch": 3.9882109487677933,
"grad_norm": 0.34383705258369446,
"learning_rate": 7.55979027678587e-08,
"loss": 0.5053,
"step": 63600
},
{
"epoch": 3.994481720699818,
"grad_norm": 0.20212756097316742,
"learning_rate": 4.0760159556863914e-08,
"loss": 0.5707,
"step": 63700
}
],
"logging_steps": 100,
"max_steps": 63788,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}