{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 63788, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006270771932024832, "grad_norm": 92.5528793334961, "learning_rate": 3.041229032763756e-07, "loss": 11.9622, "step": 100 }, { "epoch": 0.012541543864049664, "grad_norm": 85.98809814453125, "learning_rate": 6.176516695406804e-07, "loss": 11.265, "step": 200 }, { "epoch": 0.018812315796074498, "grad_norm": 82.95439910888672, "learning_rate": 9.311804358049851e-07, "loss": 10.5195, "step": 300 }, { "epoch": 0.025083087728099328, "grad_norm": 74.84368896484375, "learning_rate": 1.2447092020692899e-06, "loss": 9.4744, "step": 400 }, { "epoch": 0.03135385966012416, "grad_norm": 71.96393585205078, "learning_rate": 1.5582379683335947e-06, "loss": 8.4815, "step": 500 }, { "epoch": 0.03135385966012416, "eval_loss": 8.621713638305664, "eval_runtime": 232.9735, "eval_samples_per_second": 547.582, "eval_steps_per_second": 17.114, "step": 500 }, { "epoch": 0.037624631592148995, "grad_norm": 82.08424377441406, "learning_rate": 1.8717667345978996e-06, "loss": 7.6105, "step": 600 }, { "epoch": 0.04389540352417383, "grad_norm": 82.3794937133789, "learning_rate": 2.1852955008622044e-06, "loss": 6.8023, "step": 700 }, { "epoch": 0.050166175456198656, "grad_norm": 42.44660186767578, "learning_rate": 2.4988242671265088e-06, "loss": 6.1258, "step": 800 }, { "epoch": 0.05643694738822349, "grad_norm": 42.748497009277344, "learning_rate": 2.812353033390814e-06, "loss": 5.5032, "step": 900 }, { "epoch": 0.06270771932024832, "grad_norm": 49.40472412109375, "learning_rate": 3.1258817996551187e-06, "loss": 5.0397, "step": 1000 }, { "epoch": 0.06270771932024832, "eval_loss": 5.194935321807861, "eval_runtime": 248.6107, "eval_samples_per_second": 513.14, "eval_steps_per_second": 16.037, "step": 1000 }, { "epoch": 0.06897849125227315, "grad_norm": 54.32060623168945, 
"learning_rate": 3.439410565919423e-06, "loss": 4.6909, "step": 1100 }, { "epoch": 0.07524926318429799, "grad_norm": 50.955474853515625, "learning_rate": 3.7529393321837283e-06, "loss": 4.5716, "step": 1200 }, { "epoch": 0.08152003511632282, "grad_norm": 42.99276351928711, "learning_rate": 4.066468098448033e-06, "loss": 4.3983, "step": 1300 }, { "epoch": 0.08779080704834766, "grad_norm": 56.20285415649414, "learning_rate": 4.379996864712338e-06, "loss": 4.2073, "step": 1400 }, { "epoch": 0.09406157898037248, "grad_norm": 47.72187805175781, "learning_rate": 4.693525630976643e-06, "loss": 4.2164, "step": 1500 }, { "epoch": 0.09406157898037248, "eval_loss": 4.142153263092041, "eval_runtime": 255.6642, "eval_samples_per_second": 498.983, "eval_steps_per_second": 15.595, "step": 1500 }, { "epoch": 0.10033235091239731, "grad_norm": 53.83956527709961, "learning_rate": 5.0070543972409465e-06, "loss": 4.0921, "step": 1600 }, { "epoch": 0.10660312284442215, "grad_norm": 35.98961639404297, "learning_rate": 5.320583163505252e-06, "loss": 4.1785, "step": 1700 }, { "epoch": 0.11287389477644698, "grad_norm": 90.68247985839844, "learning_rate": 5.634111929769557e-06, "loss": 4.0503, "step": 1800 }, { "epoch": 0.11914466670847182, "grad_norm": 64.07307434082031, "learning_rate": 5.947640696033862e-06, "loss": 3.8969, "step": 1900 }, { "epoch": 0.12541543864049665, "grad_norm": 58.8675537109375, "learning_rate": 6.2611694622981665e-06, "loss": 3.8538, "step": 2000 }, { "epoch": 0.12541543864049665, "eval_loss": 3.910861015319824, "eval_runtime": 257.3903, "eval_samples_per_second": 495.636, "eval_steps_per_second": 15.49, "step": 2000 }, { "epoch": 0.13168621057252147, "grad_norm": 58.32042694091797, "learning_rate": 6.574698228562471e-06, "loss": 3.872, "step": 2100 }, { "epoch": 0.1379569825045463, "grad_norm": 38.084102630615234, "learning_rate": 6.885091707164133e-06, "loss": 3.851, "step": 2200 }, { "epoch": 0.14422775443657115, "grad_norm": 62.679237365722656, "learning_rate": 
7.198620473428438e-06, "loss": 3.6301, "step": 2300 }, { "epoch": 0.15049852636859598, "grad_norm": 60.0799446105957, "learning_rate": 7.512149239692742e-06, "loss": 3.5202, "step": 2400 }, { "epoch": 0.1567692983006208, "grad_norm": 30.11835289001465, "learning_rate": 7.825678005957047e-06, "loss": 3.6759, "step": 2500 }, { "epoch": 0.1567692983006208, "eval_loss": 3.638855218887329, "eval_runtime": 259.3655, "eval_samples_per_second": 491.862, "eval_steps_per_second": 15.372, "step": 2500 }, { "epoch": 0.16304007023264563, "grad_norm": 72.76881408691406, "learning_rate": 8.139206772221352e-06, "loss": 3.4106, "step": 2600 }, { "epoch": 0.16931084216467046, "grad_norm": 89.71743774414062, "learning_rate": 8.452735538485657e-06, "loss": 3.69, "step": 2700 }, { "epoch": 0.17558161409669532, "grad_norm": 81.42703247070312, "learning_rate": 8.766264304749962e-06, "loss": 3.6336, "step": 2800 }, { "epoch": 0.18185238602872014, "grad_norm": 84.91463470458984, "learning_rate": 9.079793071014266e-06, "loss": 3.4715, "step": 2900 }, { "epoch": 0.18812315796074497, "grad_norm": 50.145713806152344, "learning_rate": 9.393321837278571e-06, "loss": 3.2166, "step": 3000 }, { "epoch": 0.18812315796074497, "eval_loss": 3.273871898651123, "eval_runtime": 257.8101, "eval_samples_per_second": 494.829, "eval_steps_per_second": 15.465, "step": 3000 }, { "epoch": 0.1943939298927698, "grad_norm": 62.13496780395508, "learning_rate": 9.706850603542876e-06, "loss": 3.3844, "step": 3100 }, { "epoch": 0.20066470182479462, "grad_norm": 74.26377868652344, "learning_rate": 1.002037936980718e-05, "loss": 3.4449, "step": 3200 }, { "epoch": 0.20693547375681948, "grad_norm": 112.08758544921875, "learning_rate": 1.0333908136071484e-05, "loss": 3.0811, "step": 3300 }, { "epoch": 0.2132062456888443, "grad_norm": 40.37177276611328, "learning_rate": 1.0644301614673146e-05, "loss": 3.2777, "step": 3400 }, { "epoch": 0.21947701762086913, "grad_norm": 65.6463623046875, "learning_rate": 
1.0957830380937451e-05, "loss": 2.9505, "step": 3500 }, { "epoch": 0.21947701762086913, "eval_loss": 3.0864851474761963, "eval_runtime": 252.3303, "eval_samples_per_second": 505.575, "eval_steps_per_second": 15.801, "step": 3500 }, { "epoch": 0.22574778955289396, "grad_norm": 103.7303466796875, "learning_rate": 1.1271359147201758e-05, "loss": 3.1534, "step": 3600 }, { "epoch": 0.23201856148491878, "grad_norm": 145.92767333984375, "learning_rate": 1.158488791346606e-05, "loss": 2.9669, "step": 3700 }, { "epoch": 0.23828933341694364, "grad_norm": 78.69353485107422, "learning_rate": 1.1898416679730367e-05, "loss": 2.9416, "step": 3800 }, { "epoch": 0.24456010534896847, "grad_norm": 55.99378204345703, "learning_rate": 1.221194544599467e-05, "loss": 2.9637, "step": 3900 }, { "epoch": 0.2508308772809933, "grad_norm": 104.32599639892578, "learning_rate": 1.2525474212258977e-05, "loss": 2.9322, "step": 4000 }, { "epoch": 0.2508308772809933, "eval_loss": 2.844682455062866, "eval_runtime": 252.142, "eval_samples_per_second": 505.953, "eval_steps_per_second": 15.813, "step": 4000 }, { "epoch": 0.25710164921301815, "grad_norm": 68.12931823730469, "learning_rate": 1.283900297852328e-05, "loss": 2.6926, "step": 4100 }, { "epoch": 0.26337242114504295, "grad_norm": 208.65591430664062, "learning_rate": 1.3152531744787585e-05, "loss": 2.9353, "step": 4200 }, { "epoch": 0.2696431930770678, "grad_norm": 91.44706726074219, "learning_rate": 1.3466060511051891e-05, "loss": 2.635, "step": 4300 }, { "epoch": 0.2759139650090926, "grad_norm": 92.03852081298828, "learning_rate": 1.3779589277316194e-05, "loss": 2.5692, "step": 4400 }, { "epoch": 0.28218473694111745, "grad_norm": 138.34088134765625, "learning_rate": 1.4089982755917857e-05, "loss": 3.0283, "step": 4500 }, { "epoch": 0.28218473694111745, "eval_loss": 2.9033422470092773, "eval_runtime": 248.9921, "eval_samples_per_second": 512.354, "eval_steps_per_second": 16.013, "step": 4500 }, { "epoch": 0.2884555088731423, "grad_norm": 
57.657936096191406, "learning_rate": 1.4403511522182162e-05, "loss": 2.5804, "step": 4600 }, { "epoch": 0.2947262808051671, "grad_norm": 52.86611557006836, "learning_rate": 1.4717040288446466e-05, "loss": 3.1374, "step": 4700 }, { "epoch": 0.30099705273719196, "grad_norm": 80.26192474365234, "learning_rate": 1.5030569054710771e-05, "loss": 2.8479, "step": 4800 }, { "epoch": 0.30726782466921676, "grad_norm": 7.4570465087890625, "learning_rate": 1.5344097820975076e-05, "loss": 2.6809, "step": 4900 }, { "epoch": 0.3135385966012416, "grad_norm": 4.543123722076416, "learning_rate": 1.5657626587239382e-05, "loss": 2.8267, "step": 5000 }, { "epoch": 0.3135385966012416, "eval_loss": 2.6946306228637695, "eval_runtime": 249.1581, "eval_samples_per_second": 512.012, "eval_steps_per_second": 16.002, "step": 5000 }, { "epoch": 0.31980936853326647, "grad_norm": 29.848108291625977, "learning_rate": 1.5971155353503685e-05, "loss": 2.7341, "step": 5100 }, { "epoch": 0.32608014046529127, "grad_norm": 139.14234924316406, "learning_rate": 1.6284684119767992e-05, "loss": 2.8157, "step": 5200 }, { "epoch": 0.3323509123973161, "grad_norm": 65.15583038330078, "learning_rate": 1.659507759836965e-05, "loss": 2.5867, "step": 5300 }, { "epoch": 0.3386216843293409, "grad_norm": 76.32029724121094, "learning_rate": 1.6908606364633958e-05, "loss": 2.8622, "step": 5400 }, { "epoch": 0.3448924562613658, "grad_norm": 85.2268295288086, "learning_rate": 1.722213513089826e-05, "loss": 2.9063, "step": 5500 }, { "epoch": 0.3448924562613658, "eval_loss": 2.6115071773529053, "eval_runtime": 249.5988, "eval_samples_per_second": 511.108, "eval_steps_per_second": 15.974, "step": 5500 }, { "epoch": 0.35116322819339063, "grad_norm": 21.968101501464844, "learning_rate": 1.7535663897162567e-05, "loss": 2.1514, "step": 5600 }, { "epoch": 0.35743400012541543, "grad_norm": 159.9650421142578, "learning_rate": 1.784919266342687e-05, "loss": 2.3755, "step": 5700 }, { "epoch": 0.3637047720574403, "grad_norm": 
53.702919006347656, "learning_rate": 1.8162721429691173e-05, "loss": 2.5055, "step": 5800 }, { "epoch": 0.3699755439894651, "grad_norm": 16.580570220947266, "learning_rate": 1.847625019595548e-05, "loss": 3.3237, "step": 5900 }, { "epoch": 0.37624631592148994, "grad_norm": 77.9209213256836, "learning_rate": 1.8789778962219786e-05, "loss": 2.561, "step": 6000 }, { "epoch": 0.37624631592148994, "eval_loss": 2.7512075901031494, "eval_runtime": 252.3158, "eval_samples_per_second": 505.605, "eval_steps_per_second": 15.802, "step": 6000 }, { "epoch": 0.3825170878535148, "grad_norm": 181.68307495117188, "learning_rate": 1.910330772848409e-05, "loss": 2.4351, "step": 6100 }, { "epoch": 0.3887878597855396, "grad_norm": 405.9890441894531, "learning_rate": 1.9416836494748396e-05, "loss": 2.8472, "step": 6200 }, { "epoch": 0.39505863171756445, "grad_norm": 65.42109680175781, "learning_rate": 1.9730365261012702e-05, "loss": 2.76, "step": 6300 }, { "epoch": 0.40132940364958924, "grad_norm": 1.5880606174468994, "learning_rate": 1.999512271595046e-05, "loss": 2.1947, "step": 6400 }, { "epoch": 0.4076001755816141, "grad_norm": 147.59877014160156, "learning_rate": 1.9960284972739466e-05, "loss": 2.6409, "step": 6500 }, { "epoch": 0.4076001755816141, "eval_loss": 2.536679267883301, "eval_runtime": 247.9993, "eval_samples_per_second": 514.405, "eval_steps_per_second": 16.077, "step": 6500 }, { "epoch": 0.41387094751363895, "grad_norm": 147.17579650878906, "learning_rate": 1.9925447229528472e-05, "loss": 2.7262, "step": 6600 }, { "epoch": 0.42014171944566375, "grad_norm": 42.11772155761719, "learning_rate": 1.989060948631748e-05, "loss": 2.7781, "step": 6700 }, { "epoch": 0.4264124913776886, "grad_norm": 194.7137451171875, "learning_rate": 1.985577174310648e-05, "loss": 2.4718, "step": 6800 }, { "epoch": 0.4326832633097134, "grad_norm": 63.2336540222168, "learning_rate": 1.9820933999895488e-05, "loss": 2.567, "step": 6900 }, { "epoch": 0.43895403524173826, "grad_norm": 
122.4419174194336, "learning_rate": 1.9786096256684494e-05, "loss": 2.4215, "step": 7000 }, { "epoch": 0.43895403524173826, "eval_loss": 2.340890407562256, "eval_runtime": 249.0613, "eval_samples_per_second": 512.211, "eval_steps_per_second": 16.008, "step": 7000 }, { "epoch": 0.4452248071737631, "grad_norm": 173.85031127929688, "learning_rate": 1.97512585134735e-05, "loss": 1.9308, "step": 7100 }, { "epoch": 0.4514955791057879, "grad_norm": 200.34971618652344, "learning_rate": 1.9716420770262504e-05, "loss": 2.1232, "step": 7200 }, { "epoch": 0.45776635103781277, "grad_norm": 208.45030212402344, "learning_rate": 1.968158302705151e-05, "loss": 2.421, "step": 7300 }, { "epoch": 0.46403712296983757, "grad_norm": 148.36253356933594, "learning_rate": 1.9646745283840513e-05, "loss": 2.3232, "step": 7400 }, { "epoch": 0.4703078949018624, "grad_norm": 24.392248153686523, "learning_rate": 1.961190754062952e-05, "loss": 2.8543, "step": 7500 }, { "epoch": 0.4703078949018624, "eval_loss": 2.3705639839172363, "eval_runtime": 247.4495, "eval_samples_per_second": 515.548, "eval_steps_per_second": 16.112, "step": 7500 }, { "epoch": 0.4765786668338873, "grad_norm": 217.60328674316406, "learning_rate": 1.9577069797418526e-05, "loss": 2.4276, "step": 7600 }, { "epoch": 0.4828494387659121, "grad_norm": 91.55315399169922, "learning_rate": 1.9542232054207532e-05, "loss": 2.4507, "step": 7700 }, { "epoch": 0.48912021069793693, "grad_norm": 246.22488403320312, "learning_rate": 1.9507394310996535e-05, "loss": 2.1963, "step": 7800 }, { "epoch": 0.49539098262996173, "grad_norm": 76.7205810546875, "learning_rate": 1.947255656778554e-05, "loss": 2.4247, "step": 7900 }, { "epoch": 0.5016617545619866, "grad_norm": 0.794611394405365, "learning_rate": 1.9437718824574544e-05, "loss": 2.1948, "step": 8000 }, { "epoch": 0.5016617545619866, "eval_loss": 2.5728752613067627, "eval_runtime": 248.1179, "eval_samples_per_second": 514.159, "eval_steps_per_second": 16.069, "step": 8000 }, { "epoch": 
0.5079325264940114, "grad_norm": 75.7978744506836, "learning_rate": 1.940288108136355e-05, "loss": 2.4069, "step": 8100 }, { "epoch": 0.5142032984260363, "grad_norm": 81.46521759033203, "learning_rate": 1.9368043338152557e-05, "loss": 2.4328, "step": 8200 }, { "epoch": 0.520474070358061, "grad_norm": 153.08226013183594, "learning_rate": 1.9333205594941563e-05, "loss": 2.2198, "step": 8300 }, { "epoch": 0.5267448422900859, "grad_norm": 1.353060245513916, "learning_rate": 1.9298367851730566e-05, "loss": 2.1746, "step": 8400 }, { "epoch": 0.5330156142221107, "grad_norm": 298.2365417480469, "learning_rate": 1.9263878485951682e-05, "loss": 2.2618, "step": 8500 }, { "epoch": 0.5330156142221107, "eval_loss": 2.345949411392212, "eval_runtime": 249.317, "eval_samples_per_second": 511.686, "eval_steps_per_second": 15.992, "step": 8500 }, { "epoch": 0.5392863861541356, "grad_norm": 409.5244140625, "learning_rate": 1.922904074274069e-05, "loss": 2.3909, "step": 8600 }, { "epoch": 0.5455571580861605, "grad_norm": 0.6597223877906799, "learning_rate": 1.919420299952969e-05, "loss": 2.035, "step": 8700 }, { "epoch": 0.5518279300181852, "grad_norm": 365.05914306640625, "learning_rate": 1.9159365256318698e-05, "loss": 2.2626, "step": 8800 }, { "epoch": 0.55809870195021, "grad_norm": 103.37579345703125, "learning_rate": 1.91245275131077e-05, "loss": 2.1541, "step": 8900 }, { "epoch": 0.5643694738822349, "grad_norm": 4.599234104156494, "learning_rate": 1.9089689769896707e-05, "loss": 1.9424, "step": 9000 }, { "epoch": 0.5643694738822349, "eval_loss": 2.1624536514282227, "eval_runtime": 248.0391, "eval_samples_per_second": 514.322, "eval_steps_per_second": 16.074, "step": 9000 }, { "epoch": 0.5706402458142598, "grad_norm": 0.6885708570480347, "learning_rate": 1.9054852026685714e-05, "loss": 2.5152, "step": 9100 }, { "epoch": 0.5769110177462846, "grad_norm": 103.164794921875, "learning_rate": 1.9020014283474716e-05, "loss": 2.0462, "step": 9200 }, { "epoch": 0.5831817896783094, 
"grad_norm": 0.7507800459861755, "learning_rate": 1.8985176540263723e-05, "loss": 1.6124, "step": 9300 }, { "epoch": 0.5894525616103342, "grad_norm": 38.5381965637207, "learning_rate": 1.895033879705273e-05, "loss": 2.2236, "step": 9400 }, { "epoch": 0.5957233335423591, "grad_norm": 374.18011474609375, "learning_rate": 1.8915501053841735e-05, "loss": 2.4706, "step": 9500 }, { "epoch": 0.5957233335423591, "eval_loss": 2.0568950176239014, "eval_runtime": 250.2023, "eval_samples_per_second": 509.875, "eval_steps_per_second": 15.935, "step": 9500 }, { "epoch": 0.6019941054743839, "grad_norm": 115.00419616699219, "learning_rate": 1.888066331063074e-05, "loss": 2.4612, "step": 9600 }, { "epoch": 0.6082648774064088, "grad_norm": 302.7066955566406, "learning_rate": 1.8845825567419745e-05, "loss": 2.2784, "step": 9700 }, { "epoch": 0.6145356493384335, "grad_norm": 0.18385061621665955, "learning_rate": 1.8810987824208748e-05, "loss": 1.9335, "step": 9800 }, { "epoch": 0.6208064212704584, "grad_norm": 9.742902755737305, "learning_rate": 1.8776150080997754e-05, "loss": 2.3779, "step": 9900 }, { "epoch": 0.6270771932024832, "grad_norm": 12.202372550964355, "learning_rate": 1.874131233778676e-05, "loss": 1.6778, "step": 10000 }, { "epoch": 0.6270771932024832, "eval_loss": 2.112342596054077, "eval_runtime": 247.5759, "eval_samples_per_second": 515.284, "eval_steps_per_second": 16.104, "step": 10000 }, { "epoch": 0.6333479651345081, "grad_norm": 47.51719284057617, "learning_rate": 1.8706474594575767e-05, "loss": 2.4721, "step": 10100 }, { "epoch": 0.6396187370665329, "grad_norm": 330.02703857421875, "learning_rate": 1.867163685136477e-05, "loss": 1.7822, "step": 10200 }, { "epoch": 0.6458895089985577, "grad_norm": 110.14346313476562, "learning_rate": 1.8636799108153776e-05, "loss": 2.077, "step": 10300 }, { "epoch": 0.6521602809305825, "grad_norm": 28.561458587646484, "learning_rate": 1.860196136494278e-05, "loss": 1.9223, "step": 10400 }, { "epoch": 0.6584310528626074, 
"grad_norm": 14.915325164794922, "learning_rate": 1.8567123621731785e-05, "loss": 2.3513, "step": 10500 }, { "epoch": 0.6584310528626074, "eval_loss": 1.8402663469314575, "eval_runtime": 246.8028, "eval_samples_per_second": 516.899, "eval_steps_per_second": 16.155, "step": 10500 }, { "epoch": 0.6647018247946322, "grad_norm": 142.3553009033203, "learning_rate": 1.853228587852079e-05, "loss": 2.1387, "step": 10600 }, { "epoch": 0.6709725967266571, "grad_norm": 2.4230360984802246, "learning_rate": 1.8497448135309798e-05, "loss": 2.1853, "step": 10700 }, { "epoch": 0.6772433686586818, "grad_norm": 85.05690002441406, "learning_rate": 1.84626103920988e-05, "loss": 1.8715, "step": 10800 }, { "epoch": 0.6835141405907067, "grad_norm": 88.9746322631836, "learning_rate": 1.8427772648887807e-05, "loss": 1.8581, "step": 10900 }, { "epoch": 0.6897849125227316, "grad_norm": 191.67779541015625, "learning_rate": 1.839293490567681e-05, "loss": 2.0076, "step": 11000 }, { "epoch": 0.6897849125227316, "eval_loss": 2.00632643699646, "eval_runtime": 246.7856, "eval_samples_per_second": 516.934, "eval_steps_per_second": 16.156, "step": 11000 }, { "epoch": 0.6960556844547564, "grad_norm": 157.76986694335938, "learning_rate": 1.8358097162465817e-05, "loss": 2.3144, "step": 11100 }, { "epoch": 0.7023264563867813, "grad_norm": 52.53676223754883, "learning_rate": 1.8323259419254823e-05, "loss": 2.0942, "step": 11200 }, { "epoch": 0.708597228318806, "grad_norm": 61.30582046508789, "learning_rate": 1.828842167604383e-05, "loss": 1.9117, "step": 11300 }, { "epoch": 0.7148680002508309, "grad_norm": 146.37437438964844, "learning_rate": 1.8253583932832832e-05, "loss": 2.2214, "step": 11400 }, { "epoch": 0.7211387721828557, "grad_norm": 214.81398010253906, "learning_rate": 1.821874618962184e-05, "loss": 1.9678, "step": 11500 }, { "epoch": 0.7211387721828557, "eval_loss": 1.9028793573379517, "eval_runtime": 244.7222, "eval_samples_per_second": 521.293, "eval_steps_per_second": 16.292, "step": 11500 }, 
{ "epoch": 0.7274095441148806, "grad_norm": 5.435591220855713, "learning_rate": 1.818390844641084e-05, "loss": 1.7459, "step": 11600 }, { "epoch": 0.7336803160469054, "grad_norm": 107.97034454345703, "learning_rate": 1.8149070703199848e-05, "loss": 2.0616, "step": 11700 }, { "epoch": 0.7399510879789302, "grad_norm": 63.21007537841797, "learning_rate": 1.8114232959988854e-05, "loss": 1.6169, "step": 11800 }, { "epoch": 0.746221859910955, "grad_norm": 113.56210327148438, "learning_rate": 1.8079743594209967e-05, "loss": 1.5674, "step": 11900 }, { "epoch": 0.7524926318429799, "grad_norm": 107.1183090209961, "learning_rate": 1.8044905850998973e-05, "loss": 1.4956, "step": 12000 }, { "epoch": 0.7524926318429799, "eval_loss": 1.8266816139221191, "eval_runtime": 244.3373, "eval_samples_per_second": 522.114, "eval_steps_per_second": 16.318, "step": 12000 }, { "epoch": 0.7587634037750047, "grad_norm": 151.79904174804688, "learning_rate": 1.801006810778798e-05, "loss": 2.3816, "step": 12100 }, { "epoch": 0.7650341757070296, "grad_norm": 323.1309814453125, "learning_rate": 1.7975230364576983e-05, "loss": 2.2387, "step": 12200 }, { "epoch": 0.7713049476390543, "grad_norm": 4.0979743003845215, "learning_rate": 1.794039262136599e-05, "loss": 1.4625, "step": 12300 }, { "epoch": 0.7775757195710792, "grad_norm": 126.16666412353516, "learning_rate": 1.7905554878154995e-05, "loss": 2.028, "step": 12400 }, { "epoch": 0.783846491503104, "grad_norm": 42.80760955810547, "learning_rate": 1.7870717134944e-05, "loss": 2.151, "step": 12500 }, { "epoch": 0.783846491503104, "eval_loss": 1.7581337690353394, "eval_runtime": 244.6209, "eval_samples_per_second": 521.509, "eval_steps_per_second": 16.299, "step": 12500 }, { "epoch": 0.7901172634351289, "grad_norm": 0.3076690435409546, "learning_rate": 1.7835879391733005e-05, "loss": 1.6896, "step": 12600 }, { "epoch": 0.7963880353671537, "grad_norm": 1.4938758611679077, "learning_rate": 1.780104164852201e-05, "loss": 1.8526, "step": 12700 }, { 
"epoch": 0.8026588072991785, "grad_norm": 208.20004272460938, "learning_rate": 1.7766203905311014e-05, "loss": 1.9745, "step": 12800 }, { "epoch": 0.8089295792312033, "grad_norm": 14.515748023986816, "learning_rate": 1.773136616210002e-05, "loss": 2.1042, "step": 12900 }, { "epoch": 0.8152003511632282, "grad_norm": 170.497314453125, "learning_rate": 1.7696528418889027e-05, "loss": 1.83, "step": 13000 }, { "epoch": 0.8152003511632282, "eval_loss": 1.5666632652282715, "eval_runtime": 244.9634, "eval_samples_per_second": 520.78, "eval_steps_per_second": 16.276, "step": 13000 }, { "epoch": 0.821471123095253, "grad_norm": 37.14794158935547, "learning_rate": 1.7661690675678033e-05, "loss": 1.7451, "step": 13100 }, { "epoch": 0.8277418950272779, "grad_norm": 97.6008529663086, "learning_rate": 1.7626852932467036e-05, "loss": 1.568, "step": 13200 }, { "epoch": 0.8340126669593027, "grad_norm": 1.4752888679504395, "learning_rate": 1.7592015189256042e-05, "loss": 1.4432, "step": 13300 }, { "epoch": 0.8402834388913275, "grad_norm": 100.85454559326172, "learning_rate": 1.7557177446045045e-05, "loss": 1.9172, "step": 13400 }, { "epoch": 0.8465542108233524, "grad_norm": 169.63970947265625, "learning_rate": 1.752233970283405e-05, "loss": 1.9438, "step": 13500 }, { "epoch": 0.8465542108233524, "eval_loss": 1.6055145263671875, "eval_runtime": 239.684, "eval_samples_per_second": 532.251, "eval_steps_per_second": 16.634, "step": 13500 }, { "epoch": 0.8528249827553772, "grad_norm": 145.4659881591797, "learning_rate": 1.7487501959623058e-05, "loss": 1.6488, "step": 13600 }, { "epoch": 0.8590957546874021, "grad_norm": 9.112565994262695, "learning_rate": 1.7452664216412064e-05, "loss": 1.8166, "step": 13700 }, { "epoch": 0.8653665266194268, "grad_norm": 122.40379333496094, "learning_rate": 1.7417826473201067e-05, "loss": 1.5929, "step": 13800 }, { "epoch": 0.8716372985514517, "grad_norm": 1.4977953433990479, "learning_rate": 1.7382988729990073e-05, "loss": 1.2476, "step": 13900 }, { 
"epoch": 0.8779080704834765, "grad_norm": 264.9580078125, "learning_rate": 1.7348150986779076e-05, "loss": 1.5236, "step": 14000 }, { "epoch": 0.8779080704834765, "eval_loss": 1.8921126127243042, "eval_runtime": 242.2787, "eval_samples_per_second": 526.551, "eval_steps_per_second": 16.456, "step": 14000 }, { "epoch": 0.8841788424155014, "grad_norm": 0.0032478359062224627, "learning_rate": 1.7313661621000193e-05, "loss": 1.6538, "step": 14100 }, { "epoch": 0.8904496143475262, "grad_norm": 169.41224670410156, "learning_rate": 1.72788238777892e-05, "loss": 1.8689, "step": 14200 }, { "epoch": 0.896720386279551, "grad_norm": 91.79679107666016, "learning_rate": 1.7243986134578202e-05, "loss": 1.0831, "step": 14300 }, { "epoch": 0.9029911582115758, "grad_norm": 1.378010869026184, "learning_rate": 1.7209148391367208e-05, "loss": 1.7765, "step": 14400 }, { "epoch": 0.9092619301436007, "grad_norm": 86.2571792602539, "learning_rate": 1.7174310648156215e-05, "loss": 1.3548, "step": 14500 }, { "epoch": 0.9092619301436007, "eval_loss": 1.668320894241333, "eval_runtime": 242.597, "eval_samples_per_second": 525.86, "eval_steps_per_second": 16.435, "step": 14500 }, { "epoch": 0.9155327020756255, "grad_norm": 42.63466262817383, "learning_rate": 1.7139472904945218e-05, "loss": 1.7792, "step": 14600 }, { "epoch": 0.9218034740076504, "grad_norm": 31.874799728393555, "learning_rate": 1.7104635161734224e-05, "loss": 1.73, "step": 14700 }, { "epoch": 0.9280742459396751, "grad_norm": 288.0302734375, "learning_rate": 1.7069797418523227e-05, "loss": 1.5979, "step": 14800 }, { "epoch": 0.9343450178717, "grad_norm": 76.91877746582031, "learning_rate": 1.7034959675312233e-05, "loss": 1.3678, "step": 14900 }, { "epoch": 0.9406157898037248, "grad_norm": 153.2476348876953, "learning_rate": 1.700012193210124e-05, "loss": 2.0664, "step": 15000 }, { "epoch": 0.9406157898037248, "eval_loss": 1.5160768032073975, "eval_runtime": 241.7632, "eval_samples_per_second": 527.673, "eval_steps_per_second": 
16.491, "step": 15000 }, { "epoch": 0.9468865617357497, "grad_norm": 204.87367248535156, "learning_rate": 1.6965284188890246e-05, "loss": 1.4472, "step": 15100 }, { "epoch": 0.9531573336677746, "grad_norm": 107.19727325439453, "learning_rate": 1.693044644567925e-05, "loss": 1.447, "step": 15200 }, { "epoch": 0.9594281055997993, "grad_norm": 0.9635588526725769, "learning_rate": 1.6895608702468255e-05, "loss": 1.7261, "step": 15300 }, { "epoch": 0.9656988775318242, "grad_norm": 21.72879981994629, "learning_rate": 1.686077095925726e-05, "loss": 1.4881, "step": 15400 }, { "epoch": 0.971969649463849, "grad_norm": 3.110539197921753, "learning_rate": 1.6825933216046268e-05, "loss": 1.313, "step": 15500 }, { "epoch": 0.971969649463849, "eval_loss": 1.6226599216461182, "eval_runtime": 241.336, "eval_samples_per_second": 528.607, "eval_steps_per_second": 16.521, "step": 15500 }, { "epoch": 0.9782404213958739, "grad_norm": 4.804477691650391, "learning_rate": 1.679109547283527e-05, "loss": 1.4587, "step": 15600 }, { "epoch": 0.9845111933278987, "grad_norm": 159.54579162597656, "learning_rate": 1.6756257729624277e-05, "loss": 2.0982, "step": 15700 }, { "epoch": 0.9907819652599235, "grad_norm": 0.04496179521083832, "learning_rate": 1.672141998641328e-05, "loss": 1.4854, "step": 15800 }, { "epoch": 0.9970527371919483, "grad_norm": 178.064453125, "learning_rate": 1.6686582243202286e-05, "loss": 1.343, "step": 15900 }, { "epoch": 1.0033235091239732, "grad_norm": 60.21414566040039, "learning_rate": 1.6651744499991293e-05, "loss": 1.1795, "step": 16000 }, { "epoch": 1.0033235091239732, "eval_loss": 1.5639160871505737, "eval_runtime": 239.9545, "eval_samples_per_second": 531.651, "eval_steps_per_second": 16.616, "step": 16000 }, { "epoch": 1.009594281055998, "grad_norm": 28.01744842529297, "learning_rate": 1.66169067567803e-05, "loss": 1.4001, "step": 16100 }, { "epoch": 1.0158650529880229, "grad_norm": 0.9447069764137268, "learning_rate": 1.6582069013569302e-05, "loss": 1.3867, 
"step": 16200 }, { "epoch": 1.0221358249200476, "grad_norm": 271.91583251953125, "learning_rate": 1.654723127035831e-05, "loss": 1.5191, "step": 16300 }, { "epoch": 1.0284065968520726, "grad_norm": 50.53108596801758, "learning_rate": 1.651239352714731e-05, "loss": 1.4693, "step": 16400 }, { "epoch": 1.0346773687840973, "grad_norm": 37.87648010253906, "learning_rate": 1.6477555783936318e-05, "loss": 1.628, "step": 16500 }, { "epoch": 1.0346773687840973, "eval_loss": 1.4715627431869507, "eval_runtime": 243.7121, "eval_samples_per_second": 523.454, "eval_steps_per_second": 16.359, "step": 16500 }, { "epoch": 1.040948140716122, "grad_norm": 0.5571967363357544, "learning_rate": 1.6442718040725324e-05, "loss": 1.0041, "step": 16600 }, { "epoch": 1.047218912648147, "grad_norm": 410.87158203125, "learning_rate": 1.6408228674946437e-05, "loss": 1.7728, "step": 16700 }, { "epoch": 1.0534896845801718, "grad_norm": 0.04839416220784187, "learning_rate": 1.6373390931735443e-05, "loss": 1.5586, "step": 16800 }, { "epoch": 1.0597604565121967, "grad_norm": 15.377680778503418, "learning_rate": 1.633855318852445e-05, "loss": 1.7229, "step": 16900 }, { "epoch": 1.0660312284442215, "grad_norm": 156.4866943359375, "learning_rate": 1.6304063822745562e-05, "loss": 1.5556, "step": 17000 }, { "epoch": 1.0660312284442215, "eval_loss": 1.467575192451477, "eval_runtime": 244.9859, "eval_samples_per_second": 520.732, "eval_steps_per_second": 16.274, "step": 17000 }, { "epoch": 1.0723020003762462, "grad_norm": 0.4987052083015442, "learning_rate": 1.626922607953457e-05, "loss": 1.2529, "step": 17100 }, { "epoch": 1.0785727723082712, "grad_norm": 0.12283490598201752, "learning_rate": 1.6234388336323575e-05, "loss": 1.4787, "step": 17200 }, { "epoch": 1.084843544240296, "grad_norm": 0.2928747534751892, "learning_rate": 1.6199550593112578e-05, "loss": 1.1947, "step": 17300 }, { "epoch": 1.091114316172321, "grad_norm": 0.06402698904275894, "learning_rate": 1.6164712849901584e-05, "loss": 1.3014, 
"step": 17400 }, { "epoch": 1.0973850881043457, "grad_norm": 38.54865646362305, "learning_rate": 1.612987510669059e-05, "loss": 1.3743, "step": 17500 }, { "epoch": 1.0973850881043457, "eval_loss": 1.4624249935150146, "eval_runtime": 242.5321, "eval_samples_per_second": 526.0, "eval_steps_per_second": 16.439, "step": 17500 }, { "epoch": 1.1036558600363704, "grad_norm": 4.770035266876221, "learning_rate": 1.6095037363479597e-05, "loss": 1.3397, "step": 17600 }, { "epoch": 1.1099266319683954, "grad_norm": 73.70013427734375, "learning_rate": 1.60601996202686e-05, "loss": 1.3062, "step": 17700 }, { "epoch": 1.11619740390042, "grad_norm": 0.7905834317207336, "learning_rate": 1.6025361877057606e-05, "loss": 1.3288, "step": 17800 }, { "epoch": 1.122468175832445, "grad_norm": 85.46574401855469, "learning_rate": 1.599052413384661e-05, "loss": 2.0002, "step": 17900 }, { "epoch": 1.1287389477644698, "grad_norm": 101.38238525390625, "learning_rate": 1.5955686390635616e-05, "loss": 2.0294, "step": 18000 }, { "epoch": 1.1287389477644698, "eval_loss": 1.4184610843658447, "eval_runtime": 243.202, "eval_samples_per_second": 524.552, "eval_steps_per_second": 16.394, "step": 18000 }, { "epoch": 1.1350097196964946, "grad_norm": 177.34451293945312, "learning_rate": 1.5920848647424622e-05, "loss": 1.5053, "step": 18100 }, { "epoch": 1.1412804916285195, "grad_norm": 0.12398409098386765, "learning_rate": 1.5886010904213628e-05, "loss": 1.3657, "step": 18200 }, { "epoch": 1.1475512635605443, "grad_norm": 1.1212390661239624, "learning_rate": 1.585117316100263e-05, "loss": 1.3877, "step": 18300 }, { "epoch": 1.1538220354925692, "grad_norm": 132.34060668945312, "learning_rate": 1.5816335417791638e-05, "loss": 1.9034, "step": 18400 }, { "epoch": 1.160092807424594, "grad_norm": 8.030499458312988, "learning_rate": 1.578149767458064e-05, "loss": 1.4001, "step": 18500 }, { "epoch": 1.160092807424594, "eval_loss": 1.3812620639801025, "eval_runtime": 242.7932, "eval_samples_per_second": 525.435, 
"eval_steps_per_second": 16.421, "step": 18500 }, { "epoch": 1.1663635793566187, "grad_norm": 100.8308334350586, "learning_rate": 1.5746659931369647e-05, "loss": 1.7503, "step": 18600 }, { "epoch": 1.1726343512886437, "grad_norm": 57.332176208496094, "learning_rate": 1.5711822188158653e-05, "loss": 1.1482, "step": 18700 }, { "epoch": 1.1789051232206684, "grad_norm": 0.38618066906929016, "learning_rate": 1.567698444494766e-05, "loss": 1.0958, "step": 18800 }, { "epoch": 1.1851758951526934, "grad_norm": 29.31690216064453, "learning_rate": 1.5642146701736662e-05, "loss": 1.2657, "step": 18900 }, { "epoch": 1.1914466670847181, "grad_norm": 98.16004180908203, "learning_rate": 1.560730895852567e-05, "loss": 1.3721, "step": 19000 }, { "epoch": 1.1914466670847181, "eval_loss": 1.4701639413833618, "eval_runtime": 242.7602, "eval_samples_per_second": 525.506, "eval_steps_per_second": 16.424, "step": 19000 }, { "epoch": 1.1977174390167429, "grad_norm": 18.174930572509766, "learning_rate": 1.5572471215314672e-05, "loss": 1.2361, "step": 19100 }, { "epoch": 1.2039882109487678, "grad_norm": 18.77554702758789, "learning_rate": 1.5537633472103678e-05, "loss": 1.003, "step": 19200 }, { "epoch": 1.2102589828807926, "grad_norm": 105.063720703125, "learning_rate": 1.5502795728892684e-05, "loss": 1.3677, "step": 19300 }, { "epoch": 1.2165297548128176, "grad_norm": 65.42724609375, "learning_rate": 1.546795798568169e-05, "loss": 1.668, "step": 19400 }, { "epoch": 1.2228005267448423, "grad_norm": 57.190792083740234, "learning_rate": 1.5433120242470694e-05, "loss": 1.2026, "step": 19500 }, { "epoch": 1.2228005267448423, "eval_loss": 1.3641443252563477, "eval_runtime": 244.6626, "eval_samples_per_second": 521.42, "eval_steps_per_second": 16.296, "step": 19500 }, { "epoch": 1.229071298676867, "grad_norm": 291.0449523925781, "learning_rate": 1.53982824992597e-05, "loss": 1.1754, "step": 19600 }, { "epoch": 1.235342070608892, "grad_norm": 0.6484419703483582, "learning_rate": 
1.5363444756048703e-05, "loss": 1.3196, "step": 19700 }, { "epoch": 1.2416128425409168, "grad_norm": 10.18918514251709, "learning_rate": 1.532860701283771e-05, "loss": 1.4766, "step": 19800 }, { "epoch": 1.2478836144729417, "grad_norm": 0.408495157957077, "learning_rate": 1.5293769269626716e-05, "loss": 1.389, "step": 19900 }, { "epoch": 1.2541543864049665, "grad_norm": 1.5292593240737915, "learning_rate": 1.525893152641572e-05, "loss": 1.6974, "step": 20000 }, { "epoch": 1.2541543864049665, "eval_loss": 1.3344130516052246, "eval_runtime": 243.5732, "eval_samples_per_second": 523.752, "eval_steps_per_second": 16.369, "step": 20000 }, { "epoch": 1.2604251583369912, "grad_norm": 0.13364413380622864, "learning_rate": 1.5224093783204725e-05, "loss": 1.5036, "step": 20100 }, { "epoch": 1.2666959302690162, "grad_norm": 68.8973617553711, "learning_rate": 1.518925603999373e-05, "loss": 1.1728, "step": 20200 }, { "epoch": 1.272966702201041, "grad_norm": 2.0211031436920166, "learning_rate": 1.5154418296782736e-05, "loss": 1.6058, "step": 20300 }, { "epoch": 1.2792374741330659, "grad_norm": 16.78483009338379, "learning_rate": 1.511958055357174e-05, "loss": 1.5191, "step": 20400 }, { "epoch": 1.2855082460650906, "grad_norm": 36.06229019165039, "learning_rate": 1.5084742810360747e-05, "loss": 1.4516, "step": 20500 }, { "epoch": 1.2855082460650906, "eval_loss": 1.320965051651001, "eval_runtime": 237.9506, "eval_samples_per_second": 536.128, "eval_steps_per_second": 16.756, "step": 20500 }, { "epoch": 1.2917790179971154, "grad_norm": 0.42136240005493164, "learning_rate": 1.504990506714975e-05, "loss": 1.3485, "step": 20600 }, { "epoch": 1.2980497899291403, "grad_norm": 69.18399810791016, "learning_rate": 1.5015067323938756e-05, "loss": 1.2598, "step": 20700 }, { "epoch": 1.304320561861165, "grad_norm": 0.9956406354904175, "learning_rate": 1.4980229580727761e-05, "loss": 1.5871, "step": 20800 }, { "epoch": 1.31059133379319, "grad_norm": 296.9071044921875, "learning_rate": 
1.4945391837516767e-05, "loss": 1.1965, "step": 20900 }, { "epoch": 1.3168621057252148, "grad_norm": 135.63108825683594, "learning_rate": 1.4910554094305772e-05, "loss": 1.3983, "step": 21000 }, { "epoch": 1.3168621057252148, "eval_loss": 1.2516661882400513, "eval_runtime": 243.3539, "eval_samples_per_second": 524.224, "eval_steps_per_second": 16.384, "step": 21000 }, { "epoch": 1.3231328776572395, "grad_norm": 0.29125073552131653, "learning_rate": 1.4875716351094778e-05, "loss": 1.2605, "step": 21100 }, { "epoch": 1.3294036495892645, "grad_norm": 120.13431549072266, "learning_rate": 1.4840878607883781e-05, "loss": 1.5629, "step": 21200 }, { "epoch": 1.3356744215212892, "grad_norm": 0.6574529409408569, "learning_rate": 1.4806040864672787e-05, "loss": 1.0668, "step": 21300 }, { "epoch": 1.3419451934533142, "grad_norm": 0.08501740545034409, "learning_rate": 1.4771203121461792e-05, "loss": 1.1879, "step": 21400 }, { "epoch": 1.348215965385339, "grad_norm": 0.06920505315065384, "learning_rate": 1.4736365378250798e-05, "loss": 1.132, "step": 21500 }, { "epoch": 1.348215965385339, "eval_loss": 1.3881497383117676, "eval_runtime": 239.3274, "eval_samples_per_second": 533.044, "eval_steps_per_second": 16.659, "step": 21500 }, { "epoch": 1.3544867373173637, "grad_norm": 119.1258773803711, "learning_rate": 1.4701876012471915e-05, "loss": 1.7231, "step": 21600 }, { "epoch": 1.3607575092493887, "grad_norm": 219.5289764404297, "learning_rate": 1.4667038269260918e-05, "loss": 1.7636, "step": 21700 }, { "epoch": 1.3670282811814134, "grad_norm": 27.880413055419922, "learning_rate": 1.4632548903482034e-05, "loss": 1.1193, "step": 21800 }, { "epoch": 1.3732990531134384, "grad_norm": 5.331712245941162, "learning_rate": 1.459771116027104e-05, "loss": 1.4662, "step": 21900 }, { "epoch": 1.379569825045463, "grad_norm": 53.37089538574219, "learning_rate": 1.4562873417060043e-05, "loss": 2.0394, "step": 22000 }, { "epoch": 1.379569825045463, "eval_loss": 1.1926569938659668, "eval_runtime": 
241.0069, "eval_samples_per_second": 529.329, "eval_steps_per_second": 16.543, "step": 22000 }, { "epoch": 1.3858405969774878, "grad_norm": 22.226316452026367, "learning_rate": 1.452803567384905e-05, "loss": 1.1535, "step": 22100 }, { "epoch": 1.3921113689095128, "grad_norm": 2.272599458694458, "learning_rate": 1.4493197930638054e-05, "loss": 1.4592, "step": 22200 }, { "epoch": 1.3983821408415376, "grad_norm": 25.961870193481445, "learning_rate": 1.445836018742706e-05, "loss": 1.276, "step": 22300 }, { "epoch": 1.4046529127735625, "grad_norm": 73.93904113769531, "learning_rate": 1.4423522444216065e-05, "loss": 1.2984, "step": 22400 }, { "epoch": 1.4109236847055873, "grad_norm": 122.38665771484375, "learning_rate": 1.438868470100507e-05, "loss": 0.9741, "step": 22500 }, { "epoch": 1.4109236847055873, "eval_loss": 1.2707290649414062, "eval_runtime": 241.8021, "eval_samples_per_second": 527.588, "eval_steps_per_second": 16.489, "step": 22500 }, { "epoch": 1.417194456637612, "grad_norm": 402.4999084472656, "learning_rate": 1.4353846957794076e-05, "loss": 1.4253, "step": 22600 }, { "epoch": 1.423465228569637, "grad_norm": 0.6434441208839417, "learning_rate": 1.4319009214583079e-05, "loss": 1.0769, "step": 22700 }, { "epoch": 1.4297360005016617, "grad_norm": 43.16348648071289, "learning_rate": 1.4284171471372085e-05, "loss": 0.8276, "step": 22800 }, { "epoch": 1.4360067724336867, "grad_norm": 91.6303482055664, "learning_rate": 1.424933372816109e-05, "loss": 1.2689, "step": 22900 }, { "epoch": 1.4422775443657114, "grad_norm": 123.81659698486328, "learning_rate": 1.4214495984950096e-05, "loss": 1.4817, "step": 23000 }, { "epoch": 1.4422775443657114, "eval_loss": 1.2094941139221191, "eval_runtime": 235.8497, "eval_samples_per_second": 540.904, "eval_steps_per_second": 16.905, "step": 23000 }, { "epoch": 1.4485483162977362, "grad_norm": 33.1621208190918, "learning_rate": 1.4179658241739101e-05, "loss": 1.1522, "step": 23100 }, { "epoch": 1.4548190882297611, "grad_norm": 
0.6552605628967285, "learning_rate": 1.4144820498528107e-05, "loss": 0.8978, "step": 23200 }, { "epoch": 1.4610898601617859, "grad_norm": 1.6054786443710327, "learning_rate": 1.410998275531711e-05, "loss": 1.015, "step": 23300 }, { "epoch": 1.4673606320938108, "grad_norm": 30.116901397705078, "learning_rate": 1.4075145012106117e-05, "loss": 1.0351, "step": 23400 }, { "epoch": 1.4736314040258356, "grad_norm": 74.74423217773438, "learning_rate": 1.4040307268895121e-05, "loss": 1.3959, "step": 23500 }, { "epoch": 1.4736314040258356, "eval_loss": 1.1969189643859863, "eval_runtime": 238.5287, "eval_samples_per_second": 534.829, "eval_steps_per_second": 16.715, "step": 23500 }, { "epoch": 1.4799021759578603, "grad_norm": 4.446337699890137, "learning_rate": 1.4005469525684128e-05, "loss": 1.2879, "step": 23600 }, { "epoch": 1.4861729478898853, "grad_norm": 132.61671447753906, "learning_rate": 1.3970631782473132e-05, "loss": 1.0651, "step": 23700 }, { "epoch": 1.49244371982191, "grad_norm": 64.33197784423828, "learning_rate": 1.3935794039262139e-05, "loss": 1.1601, "step": 23800 }, { "epoch": 1.498714491753935, "grad_norm": 0.5995836853981018, "learning_rate": 1.3900956296051142e-05, "loss": 1.0034, "step": 23900 }, { "epoch": 1.5049852636859598, "grad_norm": 0.1931271255016327, "learning_rate": 1.3866118552840148e-05, "loss": 1.3386, "step": 24000 }, { "epoch": 1.5049852636859598, "eval_loss": 1.1590368747711182, "eval_runtime": 241.3271, "eval_samples_per_second": 528.627, "eval_steps_per_second": 16.521, "step": 24000 }, { "epoch": 1.5112560356179845, "grad_norm": 38.5876579284668, "learning_rate": 1.3831280809629153e-05, "loss": 1.142, "step": 24100 }, { "epoch": 1.5175268075500095, "grad_norm": 8.049750328063965, "learning_rate": 1.3796443066418159e-05, "loss": 1.3495, "step": 24200 }, { "epoch": 1.5237975794820342, "grad_norm": 32.30927658081055, "learning_rate": 1.3761605323207164e-05, "loss": 0.9993, "step": 24300 }, { "epoch": 1.5300683514140592, "grad_norm": 
0.47087952494621277, "learning_rate": 1.372676757999617e-05, "loss": 0.9363, "step": 24400 }, { "epoch": 1.536339123346084, "grad_norm": 160.12139892578125, "learning_rate": 1.3691929836785175e-05, "loss": 1.4402, "step": 24500 }, { "epoch": 1.536339123346084, "eval_loss": 1.2178274393081665, "eval_runtime": 241.2634, "eval_samples_per_second": 528.767, "eval_steps_per_second": 16.526, "step": 24500 }, { "epoch": 1.5426098952781087, "grad_norm": 43.729827880859375, "learning_rate": 1.3657092093574181e-05, "loss": 1.0648, "step": 24600 }, { "epoch": 1.5488806672101336, "grad_norm": 16.396068572998047, "learning_rate": 1.3622254350363184e-05, "loss": 1.5102, "step": 24700 }, { "epoch": 1.5551514391421584, "grad_norm": 280.9241027832031, "learning_rate": 1.358741660715219e-05, "loss": 1.3415, "step": 24800 }, { "epoch": 1.5614222110741833, "grad_norm": 0.3944130539894104, "learning_rate": 1.3552578863941195e-05, "loss": 0.7441, "step": 24900 }, { "epoch": 1.567692983006208, "grad_norm": 242.84613037109375, "learning_rate": 1.3517741120730201e-05, "loss": 0.901, "step": 25000 }, { "epoch": 1.567692983006208, "eval_loss": 1.1982382535934448, "eval_runtime": 241.5382, "eval_samples_per_second": 528.165, "eval_steps_per_second": 16.507, "step": 25000 }, { "epoch": 1.5739637549382328, "grad_norm": 61.62953567504883, "learning_rate": 1.3482903377519206e-05, "loss": 1.3147, "step": 25100 }, { "epoch": 1.5802345268702578, "grad_norm": 2.465519905090332, "learning_rate": 1.3448065634308212e-05, "loss": 0.971, "step": 25200 }, { "epoch": 1.5865052988022825, "grad_norm": 184.7733612060547, "learning_rate": 1.3413227891097215e-05, "loss": 0.9988, "step": 25300 }, { "epoch": 1.5927760707343075, "grad_norm": 221.9571533203125, "learning_rate": 1.337839014788622e-05, "loss": 1.1445, "step": 25400 }, { "epoch": 1.5990468426663322, "grad_norm": 14.548208236694336, "learning_rate": 1.3343552404675226e-05, "loss": 1.1018, "step": 25500 }, { "epoch": 1.5990468426663322, "eval_loss": 
1.142329454421997, "eval_runtime": 238.9747, "eval_samples_per_second": 533.831, "eval_steps_per_second": 16.684, "step": 25500 }, { "epoch": 1.605317614598357, "grad_norm": 0.4988707900047302, "learning_rate": 1.330871466146423e-05, "loss": 1.0902, "step": 25600 }, { "epoch": 1.611588386530382, "grad_norm": 213.1658477783203, "learning_rate": 1.3273876918253237e-05, "loss": 1.2577, "step": 25700 }, { "epoch": 1.6178591584624067, "grad_norm": 74.17716217041016, "learning_rate": 1.3239039175042242e-05, "loss": 1.2005, "step": 25800 }, { "epoch": 1.6241299303944317, "grad_norm": 196.46742248535156, "learning_rate": 1.3204201431831248e-05, "loss": 1.2839, "step": 25900 }, { "epoch": 1.6304007023264564, "grad_norm": 264.5187072753906, "learning_rate": 1.3169363688620251e-05, "loss": 1.4122, "step": 26000 }, { "epoch": 1.6304007023264564, "eval_loss": 1.1125129461288452, "eval_runtime": 238.2144, "eval_samples_per_second": 535.534, "eval_steps_per_second": 16.737, "step": 26000 }, { "epoch": 1.6366714742584811, "grad_norm": 0.5429248213768005, "learning_rate": 1.3134525945409257e-05, "loss": 0.7832, "step": 26100 }, { "epoch": 1.642942246190506, "grad_norm": 0.07243086397647858, "learning_rate": 1.3099688202198262e-05, "loss": 1.3278, "step": 26200 }, { "epoch": 1.6492130181225308, "grad_norm": 176.74636840820312, "learning_rate": 1.3064850458987268e-05, "loss": 1.2055, "step": 26300 }, { "epoch": 1.6554837900545558, "grad_norm": 1.1564711332321167, "learning_rate": 1.3030012715776273e-05, "loss": 1.5814, "step": 26400 }, { "epoch": 1.6617545619865806, "grad_norm": 0.3095082640647888, "learning_rate": 1.299517497256528e-05, "loss": 1.0393, "step": 26500 }, { "epoch": 1.6617545619865806, "eval_loss": 1.0945708751678467, "eval_runtime": 240.297, "eval_samples_per_second": 530.893, "eval_steps_per_second": 16.592, "step": 26500 }, { "epoch": 1.6680253339186053, "grad_norm": 0.8863621354103088, "learning_rate": 1.2960337229354282e-05, "loss": 1.4531, "step": 26600 }, { 
"epoch": 1.6742961058506303, "grad_norm": 0.15211889147758484, "learning_rate": 1.2925499486143289e-05, "loss": 1.4162, "step": 26700 }, { "epoch": 1.680566877782655, "grad_norm": 0.271015465259552, "learning_rate": 1.2890661742932293e-05, "loss": 0.8498, "step": 26800 }, { "epoch": 1.68683764971468, "grad_norm": 1.462451457977295, "learning_rate": 1.285617237715341e-05, "loss": 1.1318, "step": 26900 }, { "epoch": 1.6931084216467047, "grad_norm": 1.1144922971725464, "learning_rate": 1.2821334633942416e-05, "loss": 1.3287, "step": 27000 }, { "epoch": 1.6931084216467047, "eval_loss": 1.0439221858978271, "eval_runtime": 239.1496, "eval_samples_per_second": 533.44, "eval_steps_per_second": 16.672, "step": 27000 }, { "epoch": 1.6993791935787295, "grad_norm": 1.3803671598434448, "learning_rate": 1.2786496890731419e-05, "loss": 1.0886, "step": 27100 }, { "epoch": 1.7056499655107544, "grad_norm": 51.79226303100586, "learning_rate": 1.2752007524952535e-05, "loss": 0.8991, "step": 27200 }, { "epoch": 1.7119207374427792, "grad_norm": 17.195894241333008, "learning_rate": 1.2717169781741541e-05, "loss": 0.7563, "step": 27300 }, { "epoch": 1.7181915093748041, "grad_norm": 0.548939049243927, "learning_rate": 1.2682332038530544e-05, "loss": 0.9284, "step": 27400 }, { "epoch": 1.7244622813068289, "grad_norm": 3.179530620574951, "learning_rate": 1.264749429531955e-05, "loss": 1.3388, "step": 27500 }, { "epoch": 1.7244622813068289, "eval_loss": 1.0940054655075073, "eval_runtime": 239.4702, "eval_samples_per_second": 532.726, "eval_steps_per_second": 16.649, "step": 27500 }, { "epoch": 1.7307330532388536, "grad_norm": 0.8089356422424316, "learning_rate": 1.2612656552108555e-05, "loss": 1.2951, "step": 27600 }, { "epoch": 1.7370038251708786, "grad_norm": 698.0848388671875, "learning_rate": 1.2577818808897562e-05, "loss": 0.9789, "step": 27700 }, { "epoch": 1.7432745971029033, "grad_norm": 156.7066192626953, "learning_rate": 1.2542981065686566e-05, "loss": 1.2898, "step": 27800 }, { 
"epoch": 1.7495453690349283, "grad_norm": 59.603519439697266, "learning_rate": 1.2508143322475569e-05, "loss": 0.9915, "step": 27900 }, { "epoch": 1.755816140966953, "grad_norm": 5.36550760269165, "learning_rate": 1.2473305579264575e-05, "loss": 1.5349, "step": 28000 }, { "epoch": 1.755816140966953, "eval_loss": 1.0266426801681519, "eval_runtime": 240.254, "eval_samples_per_second": 530.988, "eval_steps_per_second": 16.595, "step": 28000 }, { "epoch": 1.7620869128989778, "grad_norm": 3.0849006175994873, "learning_rate": 1.243846783605358e-05, "loss": 1.124, "step": 28100 }, { "epoch": 1.7683576848310028, "grad_norm": 2.890775442123413, "learning_rate": 1.2403630092842586e-05, "loss": 0.809, "step": 28200 }, { "epoch": 1.7746284567630275, "grad_norm": 0.6994801163673401, "learning_rate": 1.2368792349631591e-05, "loss": 0.9617, "step": 28300 }, { "epoch": 1.7808992286950525, "grad_norm": 14.703944206237793, "learning_rate": 1.2333954606420597e-05, "loss": 1.3061, "step": 28400 }, { "epoch": 1.7871700006270772, "grad_norm": 188.39633178710938, "learning_rate": 1.2299116863209602e-05, "loss": 1.1323, "step": 28500 }, { "epoch": 1.7871700006270772, "eval_loss": 1.0488332509994507, "eval_runtime": 240.6796, "eval_samples_per_second": 530.049, "eval_steps_per_second": 16.566, "step": 28500 }, { "epoch": 1.793440772559102, "grad_norm": 12.853857040405273, "learning_rate": 1.2264279119998608e-05, "loss": 1.2991, "step": 28600 }, { "epoch": 1.7997115444911267, "grad_norm": 17.315292358398438, "learning_rate": 1.2229441376787611e-05, "loss": 0.8708, "step": 28700 }, { "epoch": 1.8059823164231517, "grad_norm": 24.514192581176758, "learning_rate": 1.2194603633576618e-05, "loss": 0.7493, "step": 28800 }, { "epoch": 1.8122530883551766, "grad_norm": 17.776947021484375, "learning_rate": 1.2159765890365622e-05, "loss": 1.004, "step": 28900 }, { "epoch": 1.8185238602872014, "grad_norm": 154.2757110595703, "learning_rate": 1.2124928147154629e-05, "loss": 1.1477, "step": 29000 }, { 
"epoch": 1.8185238602872014, "eval_loss": 1.0206255912780762, "eval_runtime": 238.9764, "eval_samples_per_second": 533.827, "eval_steps_per_second": 16.684, "step": 29000 }, { "epoch": 1.824794632219226, "grad_norm": 174.9512939453125, "learning_rate": 1.2090090403943633e-05, "loss": 1.1826, "step": 29100 }, { "epoch": 1.8310654041512509, "grad_norm": 251.60848999023438, "learning_rate": 1.205525266073264e-05, "loss": 1.0961, "step": 29200 }, { "epoch": 1.8373361760832758, "grad_norm": 15.37478256225586, "learning_rate": 1.2020414917521643e-05, "loss": 1.4743, "step": 29300 }, { "epoch": 1.8436069480153008, "grad_norm": 17.250076293945312, "learning_rate": 1.1985577174310649e-05, "loss": 0.8413, "step": 29400 }, { "epoch": 1.8498777199473255, "grad_norm": 0.08943232893943787, "learning_rate": 1.1950739431099654e-05, "loss": 1.2623, "step": 29500 }, { "epoch": 1.8498777199473255, "eval_loss": 1.004668951034546, "eval_runtime": 241.047, "eval_samples_per_second": 529.241, "eval_steps_per_second": 16.54, "step": 29500 }, { "epoch": 1.8561484918793503, "grad_norm": 66.96379089355469, "learning_rate": 1.191590168788866e-05, "loss": 0.8486, "step": 29600 }, { "epoch": 1.862419263811375, "grad_norm": 62.850799560546875, "learning_rate": 1.1881063944677665e-05, "loss": 1.4481, "step": 29700 }, { "epoch": 1.8686900357434, "grad_norm": 1.5179458856582642, "learning_rate": 1.1846226201466671e-05, "loss": 1.2704, "step": 29800 }, { "epoch": 1.874960807675425, "grad_norm": 0.09656574577093124, "learning_rate": 1.1811388458255676e-05, "loss": 1.1913, "step": 29900 }, { "epoch": 1.8812315796074497, "grad_norm": 0.12182077020406723, "learning_rate": 1.1776550715044682e-05, "loss": 0.9369, "step": 30000 }, { "epoch": 1.8812315796074497, "eval_loss": 1.0277103185653687, "eval_runtime": 240.7265, "eval_samples_per_second": 529.946, "eval_steps_per_second": 16.562, "step": 30000 }, { "epoch": 1.8875023515394744, "grad_norm": 171.4630126953125, "learning_rate": 1.1741712971833685e-05, 
"loss": 1.2427, "step": 30100 }, { "epoch": 1.8937731234714992, "grad_norm": 14.272507667541504, "learning_rate": 1.1706875228622691e-05, "loss": 1.0576, "step": 30200 }, { "epoch": 1.9000438954035241, "grad_norm": 8.003202438354492, "learning_rate": 1.1672037485411696e-05, "loss": 0.9188, "step": 30300 }, { "epoch": 1.906314667335549, "grad_norm": 72.2535629272461, "learning_rate": 1.1637199742200702e-05, "loss": 1.3227, "step": 30400 }, { "epoch": 1.9125854392675739, "grad_norm": 60.970176696777344, "learning_rate": 1.1602361998989707e-05, "loss": 1.4614, "step": 30500 }, { "epoch": 1.9125854392675739, "eval_loss": 1.0549676418304443, "eval_runtime": 232.0087, "eval_samples_per_second": 549.859, "eval_steps_per_second": 17.185, "step": 30500 }, { "epoch": 1.9188562111995986, "grad_norm": 113.54409790039062, "learning_rate": 1.1567524255778713e-05, "loss": 1.2316, "step": 30600 }, { "epoch": 1.9251269831316233, "grad_norm": 1.6219086647033691, "learning_rate": 1.1532686512567716e-05, "loss": 0.9487, "step": 30700 }, { "epoch": 1.9313977550636483, "grad_norm": 74.66547393798828, "learning_rate": 1.1497848769356722e-05, "loss": 1.1651, "step": 30800 }, { "epoch": 1.9376685269956733, "grad_norm": 0.036245282739400864, "learning_rate": 1.1463011026145727e-05, "loss": 1.1622, "step": 30900 }, { "epoch": 1.943939298927698, "grad_norm": 1.6117188930511475, "learning_rate": 1.1428173282934732e-05, "loss": 1.1801, "step": 31000 }, { "epoch": 1.943939298927698, "eval_loss": 0.9981088042259216, "eval_runtime": 241.2373, "eval_samples_per_second": 528.824, "eval_steps_per_second": 16.527, "step": 31000 }, { "epoch": 1.9502100708597228, "grad_norm": 4.923341751098633, "learning_rate": 1.1393335539723738e-05, "loss": 0.8798, "step": 31100 }, { "epoch": 1.9564808427917475, "grad_norm": 214.46116638183594, "learning_rate": 1.1358497796512741e-05, "loss": 0.7196, "step": 31200 }, { "epoch": 1.9627516147237725, "grad_norm": 16.161603927612305, "learning_rate": 
1.1323660053301749e-05, "loss": 1.2003, "step": 31300 }, { "epoch": 1.9690223866557974, "grad_norm": 249.83189392089844, "learning_rate": 1.1289170687522864e-05, "loss": 1.1823, "step": 31400 }, { "epoch": 1.9752931585878222, "grad_norm": 18.310449600219727, "learning_rate": 1.1254332944311868e-05, "loss": 1.1453, "step": 31500 }, { "epoch": 1.9752931585878222, "eval_loss": 1.0320409536361694, "eval_runtime": 237.3095, "eval_samples_per_second": 537.576, "eval_steps_per_second": 16.801, "step": 31500 }, { "epoch": 1.981563930519847, "grad_norm": 221.7801513671875, "learning_rate": 1.1219495201100875e-05, "loss": 1.4751, "step": 31600 }, { "epoch": 1.9878347024518717, "grad_norm": 20.95890235900879, "learning_rate": 1.1184657457889878e-05, "loss": 0.8502, "step": 31700 }, { "epoch": 1.9941054743838966, "grad_norm": 2.7732744216918945, "learning_rate": 1.1149819714678884e-05, "loss": 0.8757, "step": 31800 }, { "epoch": 2.0003762463159216, "grad_norm": 1.1170719861984253, "learning_rate": 1.1114981971467889e-05, "loss": 1.0489, "step": 31900 }, { "epoch": 2.0066470182479463, "grad_norm": 31.308385848999023, "learning_rate": 1.1080144228256895e-05, "loss": 1.4672, "step": 32000 }, { "epoch": 2.0066470182479463, "eval_loss": 1.0570933818817139, "eval_runtime": 236.2248, "eval_samples_per_second": 540.045, "eval_steps_per_second": 16.878, "step": 32000 }, { "epoch": 2.012917790179971, "grad_norm": 0.2743261754512787, "learning_rate": 1.10453064850459e-05, "loss": 0.9474, "step": 32100 }, { "epoch": 2.019188562111996, "grad_norm": 2.2496840953826904, "learning_rate": 1.1010468741834906e-05, "loss": 0.8037, "step": 32200 }, { "epoch": 2.025459334044021, "grad_norm": 32.999935150146484, "learning_rate": 1.0975630998623909e-05, "loss": 0.9782, "step": 32300 }, { "epoch": 2.0317301059760458, "grad_norm": 19.94236183166504, "learning_rate": 1.0940793255412915e-05, "loss": 0.6943, "step": 32400 }, { "epoch": 2.0380008779080705, "grad_norm": 0.7693130373954773, "learning_rate": 
1.090595551220192e-05, "loss": 1.0097, "step": 32500 }, { "epoch": 2.0380008779080705, "eval_loss": 0.9797225594520569, "eval_runtime": 237.8696, "eval_samples_per_second": 536.311, "eval_steps_per_second": 16.761, "step": 32500 }, { "epoch": 2.0442716498400952, "grad_norm": 156.60507202148438, "learning_rate": 1.0871117768990926e-05, "loss": 0.9067, "step": 32600 }, { "epoch": 2.05054242177212, "grad_norm": 45.05233383178711, "learning_rate": 1.083628002577993e-05, "loss": 1.09, "step": 32700 }, { "epoch": 2.056813193704145, "grad_norm": 0.9790059328079224, "learning_rate": 1.0801442282568937e-05, "loss": 0.8464, "step": 32800 }, { "epoch": 2.06308396563617, "grad_norm": 311.8387145996094, "learning_rate": 1.0766604539357942e-05, "loss": 0.9359, "step": 32900 }, { "epoch": 2.0693547375681947, "grad_norm": 2.4389493465423584, "learning_rate": 1.0731766796146948e-05, "loss": 0.813, "step": 33000 }, { "epoch": 2.0693547375681947, "eval_loss": 0.990721583366394, "eval_runtime": 239.867, "eval_samples_per_second": 531.845, "eval_steps_per_second": 16.622, "step": 33000 }, { "epoch": 2.0756255095002194, "grad_norm": 40.27507781982422, "learning_rate": 1.0696929052935951e-05, "loss": 0.8738, "step": 33100 }, { "epoch": 2.081896281432244, "grad_norm": 0.029316190630197525, "learning_rate": 1.0662091309724957e-05, "loss": 0.8178, "step": 33200 }, { "epoch": 2.0881670533642693, "grad_norm": 0.06512907892465591, "learning_rate": 1.0627253566513962e-05, "loss": 1.1704, "step": 33300 }, { "epoch": 2.094437825296294, "grad_norm": 14.495019912719727, "learning_rate": 1.0592415823302968e-05, "loss": 1.0073, "step": 33400 }, { "epoch": 2.100708597228319, "grad_norm": 85.92517852783203, "learning_rate": 1.0557578080091973e-05, "loss": 1.1849, "step": 33500 }, { "epoch": 2.100708597228319, "eval_loss": 0.9582126140594482, "eval_runtime": 238.5255, "eval_samples_per_second": 534.836, "eval_steps_per_second": 16.715, "step": 33500 }, { "epoch": 2.1069793691603436, "grad_norm": 
0.8284154534339905, "learning_rate": 1.0522740336880976e-05, "loss": 0.7795, "step": 33600 }, { "epoch": 2.1132501410923683, "grad_norm": 3.656404972076416, "learning_rate": 1.0487902593669982e-05, "loss": 0.7688, "step": 33700 }, { "epoch": 2.1195209130243935, "grad_norm": 0.08456479012966156, "learning_rate": 1.0453064850458987e-05, "loss": 0.9465, "step": 33800 }, { "epoch": 2.1257916849564182, "grad_norm": 27.962339401245117, "learning_rate": 1.0418227107247993e-05, "loss": 1.0883, "step": 33900 }, { "epoch": 2.132062456888443, "grad_norm": 37.31398010253906, "learning_rate": 1.0383389364036998e-05, "loss": 0.7711, "step": 34000 }, { "epoch": 2.132062456888443, "eval_loss": 0.955656886100769, "eval_runtime": 237.3977, "eval_samples_per_second": 537.377, "eval_steps_per_second": 16.795, "step": 34000 }, { "epoch": 2.1383332288204677, "grad_norm": 3.700526714324951, "learning_rate": 1.0348551620826004e-05, "loss": 0.9767, "step": 34100 }, { "epoch": 2.1446040007524925, "grad_norm": 111.15718841552734, "learning_rate": 1.0313713877615009e-05, "loss": 0.6702, "step": 34200 }, { "epoch": 2.1508747726845177, "grad_norm": 0.5821614861488342, "learning_rate": 1.0278876134404015e-05, "loss": 0.9444, "step": 34300 }, { "epoch": 2.1571455446165424, "grad_norm": 20.9290771484375, "learning_rate": 1.0244038391193018e-05, "loss": 0.8741, "step": 34400 }, { "epoch": 2.163416316548567, "grad_norm": 52.165771484375, "learning_rate": 1.0209200647982025e-05, "loss": 1.0717, "step": 34500 }, { "epoch": 2.163416316548567, "eval_loss": 0.9526209831237793, "eval_runtime": 235.6861, "eval_samples_per_second": 541.279, "eval_steps_per_second": 16.917, "step": 34500 }, { "epoch": 2.169687088480592, "grad_norm": 0.01671871915459633, "learning_rate": 1.017436290477103e-05, "loss": 0.8584, "step": 34600 }, { "epoch": 2.1759578604126166, "grad_norm": 12.125747680664062, "learning_rate": 1.0139525161560035e-05, "loss": 0.8926, "step": 34700 }, { "epoch": 2.182228632344642, "grad_norm": 
114.18839263916016, "learning_rate": 1.010468741834904e-05, "loss": 0.8567, "step": 34800 }, { "epoch": 2.1884994042766666, "grad_norm": 0.2531642019748688, "learning_rate": 1.0069849675138046e-05, "loss": 0.71, "step": 34900 }, { "epoch": 2.1947701762086913, "grad_norm": 160.1878662109375, "learning_rate": 1.0035360309359161e-05, "loss": 1.1285, "step": 35000 }, { "epoch": 2.1947701762086913, "eval_loss": 0.958905816078186, "eval_runtime": 235.6565, "eval_samples_per_second": 541.347, "eval_steps_per_second": 16.919, "step": 35000 }, { "epoch": 2.201040948140716, "grad_norm": 42.54741287231445, "learning_rate": 1.0000522566148166e-05, "loss": 0.8999, "step": 35100 }, { "epoch": 2.207311720072741, "grad_norm": 291.0119323730469, "learning_rate": 9.96568482293717e-06, "loss": 0.8459, "step": 35200 }, { "epoch": 2.213582492004766, "grad_norm": 3.8935604095458984, "learning_rate": 9.930847079726175e-06, "loss": 1.0608, "step": 35300 }, { "epoch": 2.2198532639367907, "grad_norm": 73.73111724853516, "learning_rate": 9.896009336515181e-06, "loss": 0.6115, "step": 35400 }, { "epoch": 2.2261240358688155, "grad_norm": 137.14573669433594, "learning_rate": 9.861171593304186e-06, "loss": 1.2468, "step": 35500 }, { "epoch": 2.2261240358688155, "eval_loss": 0.9768953323364258, "eval_runtime": 237.6341, "eval_samples_per_second": 536.842, "eval_steps_per_second": 16.778, "step": 35500 }, { "epoch": 2.23239480780084, "grad_norm": 72.25751495361328, "learning_rate": 9.826333850093192e-06, "loss": 0.9987, "step": 35600 }, { "epoch": 2.238665579732865, "grad_norm": 310.7902526855469, "learning_rate": 9.791496106882197e-06, "loss": 0.9186, "step": 35700 }, { "epoch": 2.24493635166489, "grad_norm": 0.11791533976793289, "learning_rate": 9.756658363671202e-06, "loss": 1.0505, "step": 35800 }, { "epoch": 2.251207123596915, "grad_norm": 43.25834274291992, "learning_rate": 9.721820620460208e-06, "loss": 0.6253, "step": 35900 }, { "epoch": 2.2574778955289396, "grad_norm": 29.648263931274414, 
"learning_rate": 9.686982877249213e-06, "loss": 0.6523, "step": 36000 }, { "epoch": 2.2574778955289396, "eval_loss": 0.9501162171363831, "eval_runtime": 238.1223, "eval_samples_per_second": 535.742, "eval_steps_per_second": 16.743, "step": 36000 }, { "epoch": 2.2637486674609644, "grad_norm": 5.313396453857422, "learning_rate": 9.652145134038217e-06, "loss": 0.8252, "step": 36100 }, { "epoch": 2.270019439392989, "grad_norm": 0.04373766854405403, "learning_rate": 9.617307390827224e-06, "loss": 0.9793, "step": 36200 }, { "epoch": 2.2762902113250143, "grad_norm": 118.00153350830078, "learning_rate": 9.582469647616228e-06, "loss": 0.8845, "step": 36300 }, { "epoch": 2.282560983257039, "grad_norm": 99.67394256591797, "learning_rate": 9.547631904405233e-06, "loss": 1.0121, "step": 36400 }, { "epoch": 2.288831755189064, "grad_norm": 0.7632407546043396, "learning_rate": 9.51279416119424e-06, "loss": 0.9849, "step": 36500 }, { "epoch": 2.288831755189064, "eval_loss": 0.9245060086250305, "eval_runtime": 237.5388, "eval_samples_per_second": 537.058, "eval_steps_per_second": 16.785, "step": 36500 }, { "epoch": 2.2951025271210885, "grad_norm": 0.21792149543762207, "learning_rate": 9.477956417983244e-06, "loss": 1.2937, "step": 36600 }, { "epoch": 2.3013732990531133, "grad_norm": 161.54714965820312, "learning_rate": 9.443118674772248e-06, "loss": 1.0484, "step": 36700 }, { "epoch": 2.3076440709851385, "grad_norm": 1.5865380764007568, "learning_rate": 9.408280931561255e-06, "loss": 0.8801, "step": 36800 }, { "epoch": 2.313914842917163, "grad_norm": 52.73973846435547, "learning_rate": 9.37344318835026e-06, "loss": 0.7552, "step": 36900 }, { "epoch": 2.320185614849188, "grad_norm": 72.2259750366211, "learning_rate": 9.338605445139266e-06, "loss": 0.7641, "step": 37000 }, { "epoch": 2.320185614849188, "eval_loss": 0.9280443787574768, "eval_runtime": 234.7895, "eval_samples_per_second": 543.346, "eval_steps_per_second": 16.981, "step": 37000 }, { "epoch": 2.3264563867812127, 
"grad_norm": 161.67674255371094, "learning_rate": 9.30376770192827e-06, "loss": 0.883, "step": 37100 }, { "epoch": 2.3327271587132374, "grad_norm": 0.07621905952692032, "learning_rate": 9.269278336149385e-06, "loss": 0.77, "step": 37200 }, { "epoch": 2.3389979306452626, "grad_norm": 0.2586478292942047, "learning_rate": 9.234440592938391e-06, "loss": 1.2699, "step": 37300 }, { "epoch": 2.3452687025772874, "grad_norm": 79.81159973144531, "learning_rate": 9.199602849727396e-06, "loss": 0.8766, "step": 37400 }, { "epoch": 2.351539474509312, "grad_norm": 7.059108257293701, "learning_rate": 9.1647651065164e-06, "loss": 1.1154, "step": 37500 }, { "epoch": 2.351539474509312, "eval_loss": 0.962340772151947, "eval_runtime": 238.8795, "eval_samples_per_second": 534.043, "eval_steps_per_second": 16.69, "step": 37500 }, { "epoch": 2.357810246441337, "grad_norm": 1.4081709384918213, "learning_rate": 9.129927363305405e-06, "loss": 1.0634, "step": 37600 }, { "epoch": 2.3640810183733616, "grad_norm": 0.605450451374054, "learning_rate": 9.09508962009441e-06, "loss": 0.8822, "step": 37700 }, { "epoch": 2.370351790305387, "grad_norm": 1.7804793119430542, "learning_rate": 9.060251876883416e-06, "loss": 0.839, "step": 37800 }, { "epoch": 2.3766225622374115, "grad_norm": 0.285157710313797, "learning_rate": 9.025414133672421e-06, "loss": 0.684, "step": 37900 }, { "epoch": 2.3828933341694363, "grad_norm": 1.6291695833206177, "learning_rate": 8.990576390461425e-06, "loss": 0.8051, "step": 38000 }, { "epoch": 2.3828933341694363, "eval_loss": 0.9198396801948547, "eval_runtime": 235.4699, "eval_samples_per_second": 541.776, "eval_steps_per_second": 16.932, "step": 38000 }, { "epoch": 2.389164106101461, "grad_norm": 0.22198112308979034, "learning_rate": 8.955738647250432e-06, "loss": 0.9585, "step": 38100 }, { "epoch": 2.3954348780334858, "grad_norm": 0.15497685968875885, "learning_rate": 8.920900904039436e-06, "loss": 0.7156, "step": 38200 }, { "epoch": 2.401705649965511, "grad_norm": 
3.716522216796875, "learning_rate": 8.886063160828443e-06, "loss": 0.5271, "step": 38300 }, { "epoch": 2.4079764218975357, "grad_norm": 211.54660034179688, "learning_rate": 8.851225417617447e-06, "loss": 0.805, "step": 38400 }, { "epoch": 2.4142471938295604, "grad_norm": 104.68868255615234, "learning_rate": 8.816387674406452e-06, "loss": 0.7898, "step": 38500 }, { "epoch": 2.4142471938295604, "eval_loss": 0.8785400986671448, "eval_runtime": 236.3653, "eval_samples_per_second": 539.724, "eval_steps_per_second": 16.868, "step": 38500 }, { "epoch": 2.420517965761585, "grad_norm": 117.63562774658203, "learning_rate": 8.781549931195458e-06, "loss": 0.6935, "step": 38600 }, { "epoch": 2.42678873769361, "grad_norm": 6.395357131958008, "learning_rate": 8.746712187984463e-06, "loss": 0.8011, "step": 38700 }, { "epoch": 2.433059509625635, "grad_norm": 146.0078582763672, "learning_rate": 8.711874444773468e-06, "loss": 0.9812, "step": 38800 }, { "epoch": 2.43933028155766, "grad_norm": 0.07249762117862701, "learning_rate": 8.677036701562474e-06, "loss": 0.4427, "step": 38900 }, { "epoch": 2.4456010534896846, "grad_norm": 113.86747741699219, "learning_rate": 8.642198958351479e-06, "loss": 0.492, "step": 39000 }, { "epoch": 2.4456010534896846, "eval_loss": 0.9312570095062256, "eval_runtime": 235.7259, "eval_samples_per_second": 541.188, "eval_steps_per_second": 16.914, "step": 39000 }, { "epoch": 2.4518718254217093, "grad_norm": 174.28895568847656, "learning_rate": 8.607361215140483e-06, "loss": 0.47, "step": 39100 }, { "epoch": 2.458142597353734, "grad_norm": 1.0906648635864258, "learning_rate": 8.5728718493616e-06, "loss": 1.1876, "step": 39200 }, { "epoch": 2.4644133692857593, "grad_norm": 1.2390027046203613, "learning_rate": 8.538034106150604e-06, "loss": 0.5778, "step": 39300 }, { "epoch": 2.470684141217784, "grad_norm": 8.68694019317627, "learning_rate": 8.503196362939609e-06, "loss": 0.6763, "step": 39400 }, { "epoch": 2.4769549131498088, "grad_norm": 0.0290305744856596, 
"learning_rate": 8.468358619728615e-06, "loss": 0.6896, "step": 39500 }, { "epoch": 2.4769549131498088, "eval_loss": 0.8978257179260254, "eval_runtime": 238.5786, "eval_samples_per_second": 534.717, "eval_steps_per_second": 16.711, "step": 39500 }, { "epoch": 2.4832256850818335, "grad_norm": null, "learning_rate": 8.43386925394973e-06, "loss": 0.8905, "step": 39600 }, { "epoch": 2.4894964570138582, "grad_norm": 0.6685202121734619, "learning_rate": 8.399031510738736e-06, "loss": 0.7845, "step": 39700 }, { "epoch": 2.4957672289458834, "grad_norm": 0.6609179377555847, "learning_rate": 8.36419376752774e-06, "loss": 0.8691, "step": 39800 }, { "epoch": 2.502038000877908, "grad_norm": 0.44005250930786133, "learning_rate": 8.329356024316745e-06, "loss": 0.55, "step": 39900 }, { "epoch": 2.508308772809933, "grad_norm": 191.84471130371094, "learning_rate": 8.294518281105752e-06, "loss": 0.6978, "step": 40000 }, { "epoch": 2.508308772809933, "eval_loss": 0.9054428935050964, "eval_runtime": 236.0808, "eval_samples_per_second": 540.374, "eval_steps_per_second": 16.888, "step": 40000 }, { "epoch": 2.5145795447419577, "grad_norm": 20.063995361328125, "learning_rate": 8.259680537894755e-06, "loss": 0.6378, "step": 40100 }, { "epoch": 2.5208503166739824, "grad_norm": 1.4460866451263428, "learning_rate": 8.224842794683761e-06, "loss": 0.895, "step": 40200 }, { "epoch": 2.527121088606007, "grad_norm": 0.06669195741415024, "learning_rate": 8.190353428904876e-06, "loss": 0.9683, "step": 40300 }, { "epoch": 2.5333918605380323, "grad_norm": 80.40859985351562, "learning_rate": 8.155515685693882e-06, "loss": 0.9373, "step": 40400 }, { "epoch": 2.539662632470057, "grad_norm": 0.014817653223872185, "learning_rate": 8.120677942482887e-06, "loss": 0.7406, "step": 40500 }, { "epoch": 2.539662632470057, "eval_loss": 0.912805438041687, "eval_runtime": 230.5789, "eval_samples_per_second": 553.268, "eval_steps_per_second": 17.291, "step": 40500 }, { "epoch": 2.545933404402082, "grad_norm": 
41.673622131347656, "learning_rate": 8.085840199271891e-06, "loss": 0.8917, "step": 40600 }, { "epoch": 2.5522041763341066, "grad_norm": 213.1597900390625, "learning_rate": 8.051002456060898e-06, "loss": 1.0552, "step": 40700 }, { "epoch": 2.5584749482661318, "grad_norm": 65.40398406982422, "learning_rate": 8.016164712849902e-06, "loss": 0.5281, "step": 40800 }, { "epoch": 2.5647457201981565, "grad_norm": 4.673154830932617, "learning_rate": 7.981326969638907e-06, "loss": 0.9064, "step": 40900 }, { "epoch": 2.5710164921301812, "grad_norm": 187.15573120117188, "learning_rate": 7.946489226427913e-06, "loss": 0.6886, "step": 41000 }, { "epoch": 2.5710164921301812, "eval_loss": 0.9048876166343689, "eval_runtime": 269.6795, "eval_samples_per_second": 473.05, "eval_steps_per_second": 14.784, "step": 41000 }, { "epoch": 2.577287264062206, "grad_norm": 0.14457735419273376, "learning_rate": 7.911651483216918e-06, "loss": 0.7166, "step": 41100 }, { "epoch": 2.5835580359942307, "grad_norm": 126.45314025878906, "learning_rate": 7.876813740005922e-06, "loss": 0.8343, "step": 41200 }, { "epoch": 2.5898288079262555, "grad_norm": 0.15031389892101288, "learning_rate": 7.841975996794929e-06, "loss": 0.9468, "step": 41300 }, { "epoch": 2.5960995798582807, "grad_norm": 0.14378446340560913, "learning_rate": 7.807138253583933e-06, "loss": 0.8529, "step": 41400 }, { "epoch": 2.6023703517903054, "grad_norm": 0.031118595972657204, "learning_rate": 7.772300510372938e-06, "loss": 0.8092, "step": 41500 }, { "epoch": 2.6023703517903054, "eval_loss": 0.8954480886459351, "eval_runtime": 246.3441, "eval_samples_per_second": 517.861, "eval_steps_per_second": 16.185, "step": 41500 }, { "epoch": 2.60864112372233, "grad_norm": 17.187223434448242, "learning_rate": 7.737462767161944e-06, "loss": 0.8501, "step": 41600 }, { "epoch": 2.614911895654355, "grad_norm": 3.00113844871521, "learning_rate": 7.702625023950949e-06, "loss": 0.9877, "step": 41700 }, { "epoch": 2.62118266758638, "grad_norm": 
0.45281580090522766, "learning_rate": 7.667787280739954e-06, "loss": 0.8592, "step": 41800 }, { "epoch": 2.627453439518405, "grad_norm": 79.49444580078125, "learning_rate": 7.63294953752896e-06, "loss": 0.8632, "step": 41900 }, { "epoch": 2.6337242114504296, "grad_norm": 0.05600200593471527, "learning_rate": 7.598111794317965e-06, "loss": 0.6766, "step": 42000 }, { "epoch": 2.6337242114504296, "eval_loss": 0.8706979751586914, "eval_runtime": 245.6205, "eval_samples_per_second": 519.387, "eval_steps_per_second": 16.232, "step": 42000 }, { "epoch": 2.6399949833824543, "grad_norm": 20.844148635864258, "learning_rate": 7.56327405110697e-06, "loss": 0.7587, "step": 42100 }, { "epoch": 2.646265755314479, "grad_norm": 0.24995607137680054, "learning_rate": 7.528436307895976e-06, "loss": 0.8949, "step": 42200 }, { "epoch": 2.652536527246504, "grad_norm": 80.21415710449219, "learning_rate": 7.49359856468498e-06, "loss": 0.4173, "step": 42300 }, { "epoch": 2.658807299178529, "grad_norm": 24.900297164916992, "learning_rate": 7.458760821473986e-06, "loss": 0.5995, "step": 42400 }, { "epoch": 2.6650780711105537, "grad_norm": 231.90145874023438, "learning_rate": 7.423923078262991e-06, "loss": 0.8157, "step": 42500 }, { "epoch": 2.6650780711105537, "eval_loss": 0.8680915236473083, "eval_runtime": 245.7882, "eval_samples_per_second": 519.032, "eval_steps_per_second": 16.221, "step": 42500 }, { "epoch": 2.6713488430425785, "grad_norm": 0.030076002702116966, "learning_rate": 7.389085335051997e-06, "loss": 0.92, "step": 42600 }, { "epoch": 2.677619614974603, "grad_norm": 391.046875, "learning_rate": 7.354247591841001e-06, "loss": 0.9118, "step": 42700 }, { "epoch": 2.6838903869066284, "grad_norm": 0.29524192214012146, "learning_rate": 7.319409848630006e-06, "loss": 0.7446, "step": 42800 }, { "epoch": 2.690161158838653, "grad_norm": 0.06050710007548332, "learning_rate": 7.284572105419011e-06, "loss": 0.6835, "step": 42900 }, { "epoch": 2.696431930770678, "grad_norm": 
0.3519326150417328, "learning_rate": 7.249734362208016e-06, "loss": 0.6157, "step": 43000 }, { "epoch": 2.696431930770678, "eval_loss": 0.8691079020500183, "eval_runtime": 245.1929, "eval_samples_per_second": 520.292, "eval_steps_per_second": 16.261, "step": 43000 }, { "epoch": 2.7027027027027026, "grad_norm": 3.0073323249816895, "learning_rate": 7.214896618997022e-06, "loss": 0.5423, "step": 43100 }, { "epoch": 2.7089734746347274, "grad_norm": 47.103782653808594, "learning_rate": 7.180058875786027e-06, "loss": 0.8098, "step": 43200 }, { "epoch": 2.715244246566752, "grad_norm": 1.3290644884109497, "learning_rate": 7.145221132575032e-06, "loss": 0.8908, "step": 43300 }, { "epoch": 2.7215150184987773, "grad_norm": 51.733924865722656, "learning_rate": 7.110383389364037e-06, "loss": 1.1275, "step": 43400 }, { "epoch": 2.727785790430802, "grad_norm": 7.54064416885376, "learning_rate": 7.075545646153043e-06, "loss": 1.0345, "step": 43500 }, { "epoch": 2.727785790430802, "eval_loss": 0.8884279131889343, "eval_runtime": 250.7463, "eval_samples_per_second": 508.769, "eval_steps_per_second": 15.901, "step": 43500 }, { "epoch": 2.734056562362827, "grad_norm": 0.2361198216676712, "learning_rate": 7.0407079029420475e-06, "loss": 0.6198, "step": 43600 }, { "epoch": 2.7403273342948515, "grad_norm": 0.045945364981889725, "learning_rate": 7.005870159731053e-06, "loss": 0.8315, "step": 43700 }, { "epoch": 2.7465981062268767, "grad_norm": 1.2798868417739868, "learning_rate": 6.9710324165200584e-06, "loss": 0.9317, "step": 43800 }, { "epoch": 2.7528688781589015, "grad_norm": 0.2944384217262268, "learning_rate": 6.936194673309063e-06, "loss": 0.516, "step": 43900 }, { "epoch": 2.759139650090926, "grad_norm": 0.38825371861457825, "learning_rate": 6.9013569300980686e-06, "loss": 0.8229, "step": 44000 }, { "epoch": 2.759139650090926, "eval_loss": 0.8659059405326843, "eval_runtime": 250.2562, "eval_samples_per_second": 509.766, "eval_steps_per_second": 15.932, "step": 44000 }, { "epoch": 
2.765410422022951, "grad_norm": 121.3291015625, "learning_rate": 6.866519186887074e-06, "loss": 0.7989, "step": 44100 }, { "epoch": 2.7716811939549757, "grad_norm": 0.05258101224899292, "learning_rate": 6.8316814436760795e-06, "loss": 0.9291, "step": 44200 }, { "epoch": 2.7779519658870004, "grad_norm": 13.635845184326172, "learning_rate": 6.796843700465084e-06, "loss": 0.5954, "step": 44300 }, { "epoch": 2.7842227378190256, "grad_norm": 0.01324045192450285, "learning_rate": 6.76200595725409e-06, "loss": 0.8537, "step": 44400 }, { "epoch": 2.7904935097510504, "grad_norm": 0.1794157326221466, "learning_rate": 6.727168214043095e-06, "loss": 0.9506, "step": 44500 }, { "epoch": 2.7904935097510504, "eval_loss": 0.8657113909721375, "eval_runtime": 251.0944, "eval_samples_per_second": 508.064, "eval_steps_per_second": 15.878, "step": 44500 }, { "epoch": 2.796764281683075, "grad_norm": 1.5337361097335815, "learning_rate": 6.6923304708321e-06, "loss": 0.5789, "step": 44600 }, { "epoch": 2.8030350536151, "grad_norm": 67.04114532470703, "learning_rate": 6.657492727621105e-06, "loss": 0.4861, "step": 44700 }, { "epoch": 2.809305825547125, "grad_norm": 0.7064642310142517, "learning_rate": 6.622654984410111e-06, "loss": 0.9614, "step": 44800 }, { "epoch": 2.81557659747915, "grad_norm": 182.1068572998047, "learning_rate": 6.587817241199116e-06, "loss": 1.0069, "step": 44900 }, { "epoch": 2.8218473694111745, "grad_norm": 11.14926528930664, "learning_rate": 6.552979497988121e-06, "loss": 0.5599, "step": 45000 }, { "epoch": 2.8218473694111745, "eval_loss": 0.8618975281715393, "eval_runtime": 253.2257, "eval_samples_per_second": 503.788, "eval_steps_per_second": 15.745, "step": 45000 }, { "epoch": 2.8281181413431993, "grad_norm": 3.852113723754883, "learning_rate": 6.5181417547771264e-06, "loss": 1.3747, "step": 45100 }, { "epoch": 2.834388913275224, "grad_norm": 0.024370471015572548, "learning_rate": 6.483304011566132e-06, "loss": 0.5638, "step": 45200 }, { "epoch": 
2.8406596852072488, "grad_norm": 30.42238998413086, "learning_rate": 6.4484662683551366e-06, "loss": 1.2095, "step": 45300 }, { "epoch": 2.846930457139274, "grad_norm": 54.890380859375, "learning_rate": 6.413628525144142e-06, "loss": 0.7364, "step": 45400 }, { "epoch": 2.8532012290712987, "grad_norm": 0.05865807831287384, "learning_rate": 6.3787907819331475e-06, "loss": 0.5692, "step": 45500 }, { "epoch": 2.8532012290712987, "eval_loss": 0.8817957043647766, "eval_runtime": 250.5213, "eval_samples_per_second": 509.226, "eval_steps_per_second": 15.915, "step": 45500 }, { "epoch": 2.8594720010033234, "grad_norm": 0.23342262208461761, "learning_rate": 6.343953038722153e-06, "loss": 0.8848, "step": 45600 }, { "epoch": 2.865742772935348, "grad_norm": 0.24238887429237366, "learning_rate": 6.309115295511157e-06, "loss": 0.9063, "step": 45700 }, { "epoch": 2.8720135448673734, "grad_norm": 303.49761962890625, "learning_rate": 6.274277552300162e-06, "loss": 0.8675, "step": 45800 }, { "epoch": 2.878284316799398, "grad_norm": 27.475610733032227, "learning_rate": 6.239439809089167e-06, "loss": 0.9703, "step": 45900 }, { "epoch": 2.884555088731423, "grad_norm": 0.12018956989049911, "learning_rate": 6.2046020658781725e-06, "loss": 0.6657, "step": 46000 }, { "epoch": 2.884555088731423, "eval_loss": 0.842439591884613, "eval_runtime": 250.5638, "eval_samples_per_second": 509.14, "eval_steps_per_second": 15.912, "step": 46000 }, { "epoch": 2.8908258606634476, "grad_norm": 36.39583969116211, "learning_rate": 6.169764322667178e-06, "loss": 0.6564, "step": 46100 }, { "epoch": 2.8970966325954723, "grad_norm": 6.755324840545654, "learning_rate": 6.135274956888293e-06, "loss": 0.7945, "step": 46200 }, { "epoch": 2.903367404527497, "grad_norm": 0.24825870990753174, "learning_rate": 6.100437213677298e-06, "loss": 0.6341, "step": 46300 }, { "epoch": 2.9096381764595223, "grad_norm": 0.06013401225209236, "learning_rate": 6.0655994704663035e-06, "loss": 1.042, "step": 46400 }, { "epoch": 
2.915908948391547, "grad_norm": 14.515037536621094, "learning_rate": 6.030761727255309e-06, "loss": 1.0812, "step": 46500 }, { "epoch": 2.915908948391547, "eval_loss": 0.8509716987609863, "eval_runtime": 247.7079, "eval_samples_per_second": 515.01, "eval_steps_per_second": 16.096, "step": 46500 }, { "epoch": 2.9221797203235718, "grad_norm": 0.9338593482971191, "learning_rate": 5.995923984044314e-06, "loss": 0.9787, "step": 46600 }, { "epoch": 2.9284504922555965, "grad_norm": 241.10589599609375, "learning_rate": 5.961086240833319e-06, "loss": 0.8732, "step": 46700 }, { "epoch": 2.9347212641876217, "grad_norm": 119.96747589111328, "learning_rate": 5.926248497622325e-06, "loss": 1.1872, "step": 46800 }, { "epoch": 2.9409920361196464, "grad_norm": 28.35833740234375, "learning_rate": 5.89141075441133e-06, "loss": 0.989, "step": 46900 }, { "epoch": 2.947262808051671, "grad_norm": 0.007068769074976444, "learning_rate": 5.856573011200335e-06, "loss": 0.874, "step": 47000 }, { "epoch": 2.947262808051671, "eval_loss": 0.8214829564094543, "eval_runtime": 243.5295, "eval_samples_per_second": 523.846, "eval_steps_per_second": 16.372, "step": 47000 }, { "epoch": 2.953533579983696, "grad_norm": 0.0442727729678154, "learning_rate": 5.82173526798934e-06, "loss": 1.0229, "step": 47100 }, { "epoch": 2.9598043519157207, "grad_norm": 0.031402587890625, "learning_rate": 5.786897524778346e-06, "loss": 0.9888, "step": 47200 }, { "epoch": 2.9660751238477454, "grad_norm": 2.0282115936279297, "learning_rate": 5.75205978156735e-06, "loss": 0.4883, "step": 47300 }, { "epoch": 2.9723458957797706, "grad_norm": 7.441370487213135, "learning_rate": 5.717222038356356e-06, "loss": 0.7474, "step": 47400 }, { "epoch": 2.9786166677117953, "grad_norm": 20.524629592895508, "learning_rate": 5.682384295145361e-06, "loss": 0.7615, "step": 47500 }, { "epoch": 2.9786166677117953, "eval_loss": 0.8217635750770569, "eval_runtime": 249.4571, "eval_samples_per_second": 511.399, "eval_steps_per_second": 15.983, 
"step": 47500 }, { "epoch": 2.98488743964382, "grad_norm": 0.4798177182674408, "learning_rate": 5.647546551934367e-06, "loss": 0.6208, "step": 47600 }, { "epoch": 2.991158211575845, "grad_norm": 112.3564224243164, "learning_rate": 5.6127088087233715e-06, "loss": 0.8332, "step": 47700 }, { "epoch": 2.99742898350787, "grad_norm": 52.40660095214844, "learning_rate": 5.577871065512377e-06, "loss": 0.6734, "step": 47800 }, { "epoch": 3.0036997554398948, "grad_norm": 0.9568219184875488, "learning_rate": 5.5430333223013825e-06, "loss": 0.5095, "step": 47900 }, { "epoch": 3.0099705273719195, "grad_norm": 0.40387988090515137, "learning_rate": 5.508195579090387e-06, "loss": 0.7709, "step": 48000 }, { "epoch": 3.0099705273719195, "eval_loss": 0.8220009803771973, "eval_runtime": 248.6927, "eval_samples_per_second": 512.97, "eval_steps_per_second": 16.032, "step": 48000 }, { "epoch": 3.0162412993039442, "grad_norm": 192.66201782226562, "learning_rate": 5.473357835879393e-06, "loss": 0.5449, "step": 48100 }, { "epoch": 3.022512071235969, "grad_norm": 0.026696085929870605, "learning_rate": 5.438520092668398e-06, "loss": 0.772, "step": 48200 }, { "epoch": 3.028782843167994, "grad_norm": 7.1632232666015625, "learning_rate": 5.403682349457403e-06, "loss": 0.8582, "step": 48300 }, { "epoch": 3.035053615100019, "grad_norm": 4.1231584548950195, "learning_rate": 5.369192983678517e-06, "loss": 0.5742, "step": 48400 }, { "epoch": 3.0413243870320437, "grad_norm": 0.08916144073009491, "learning_rate": 5.334355240467523e-06, "loss": 0.5584, "step": 48500 }, { "epoch": 3.0413243870320437, "eval_loss": 0.8492663502693176, "eval_runtime": 247.0758, "eval_samples_per_second": 516.327, "eval_steps_per_second": 16.137, "step": 48500 }, { "epoch": 3.0475951589640684, "grad_norm": 15.887138366699219, "learning_rate": 5.299517497256527e-06, "loss": 0.9766, "step": 48600 }, { "epoch": 3.053865930896093, "grad_norm": 3.1666200160980225, "learning_rate": 5.264679754045533e-06, "loss": 0.6473, "step": 
48700 }, { "epoch": 3.0601367028281183, "grad_norm": 4.730705261230469, "learning_rate": 5.229842010834538e-06, "loss": 0.5861, "step": 48800 }, { "epoch": 3.066407474760143, "grad_norm": 0.19111567735671997, "learning_rate": 5.195004267623544e-06, "loss": 0.6377, "step": 48900 }, { "epoch": 3.072678246692168, "grad_norm": 0.17477057874202728, "learning_rate": 5.1601665244125485e-06, "loss": 0.8393, "step": 49000 }, { "epoch": 3.072678246692168, "eval_loss": 0.8429604768753052, "eval_runtime": 246.6172, "eval_samples_per_second": 517.288, "eval_steps_per_second": 16.167, "step": 49000 }, { "epoch": 3.0789490186241926, "grad_norm": 0.47240251302719116, "learning_rate": 5.125328781201554e-06, "loss": 0.8385, "step": 49100 }, { "epoch": 3.0852197905562173, "grad_norm": 1.6392873525619507, "learning_rate": 5.0904910379905595e-06, "loss": 0.5523, "step": 49200 }, { "epoch": 3.0914905624882425, "grad_norm": 0.08180980384349823, "learning_rate": 5.055653294779564e-06, "loss": 0.6217, "step": 49300 }, { "epoch": 3.0977613344202672, "grad_norm": 10.683464050292969, "learning_rate": 5.02081555156857e-06, "loss": 0.5515, "step": 49400 }, { "epoch": 3.104032106352292, "grad_norm": 154.55838012695312, "learning_rate": 4.985977808357575e-06, "loss": 0.851, "step": 49500 }, { "epoch": 3.104032106352292, "eval_loss": 0.8000255227088928, "eval_runtime": 245.3365, "eval_samples_per_second": 519.988, "eval_steps_per_second": 16.251, "step": 49500 }, { "epoch": 3.1103028782843167, "grad_norm": 0.106838159263134, "learning_rate": 4.95114006514658e-06, "loss": 0.9247, "step": 49600 }, { "epoch": 3.1165736502163415, "grad_norm": 0.03634607046842575, "learning_rate": 4.916302321935585e-06, "loss": 0.655, "step": 49700 }, { "epoch": 3.1228444221483667, "grad_norm": 118.1080322265625, "learning_rate": 4.881464578724591e-06, "loss": 0.4979, "step": 49800 }, { "epoch": 3.1291151940803914, "grad_norm": 0.2726267874240875, "learning_rate": 4.846626835513596e-06, "loss": 0.7521, "step": 49900 }, 
{ "epoch": 3.135385966012416, "grad_norm": 0.031166499480605125, "learning_rate": 4.811789092302601e-06, "loss": 0.53, "step": 50000 }, { "epoch": 3.135385966012416, "eval_loss": 0.8105431795120239, "eval_runtime": 248.1106, "eval_samples_per_second": 514.174, "eval_steps_per_second": 16.069, "step": 50000 }, { "epoch": 3.141656737944441, "grad_norm": 88.85710144042969, "learning_rate": 4.776951349091606e-06, "loss": 0.5943, "step": 50100 }, { "epoch": 3.1479275098764656, "grad_norm": 11.926735877990723, "learning_rate": 4.742113605880612e-06, "loss": 0.4659, "step": 50200 }, { "epoch": 3.154198281808491, "grad_norm": 17.817556381225586, "learning_rate": 4.7072758626696165e-06, "loss": 0.4843, "step": 50300 }, { "epoch": 3.1604690537405156, "grad_norm": 95.25701904296875, "learning_rate": 4.672438119458621e-06, "loss": 0.7577, "step": 50400 }, { "epoch": 3.1667398256725403, "grad_norm": 0.007618566509336233, "learning_rate": 4.637600376247627e-06, "loss": 0.3448, "step": 50500 }, { "epoch": 3.1667398256725403, "eval_loss": 0.8055439591407776, "eval_runtime": 246.7777, "eval_samples_per_second": 516.951, "eval_steps_per_second": 16.156, "step": 50500 }, { "epoch": 3.173010597604565, "grad_norm": 45.008056640625, "learning_rate": 4.602762633036632e-06, "loss": 0.8392, "step": 50600 }, { "epoch": 3.17928136953659, "grad_norm": 0.11749571561813354, "learning_rate": 4.567924889825638e-06, "loss": 0.75, "step": 50700 }, { "epoch": 3.185552141468615, "grad_norm": 0.04399213567376137, "learning_rate": 4.533087146614642e-06, "loss": 0.5195, "step": 50800 }, { "epoch": 3.1918229134006397, "grad_norm": 0.1250951737165451, "learning_rate": 4.498249403403648e-06, "loss": 0.617, "step": 50900 }, { "epoch": 3.1980936853326645, "grad_norm": 213.82589721679688, "learning_rate": 4.463411660192653e-06, "loss": 0.6892, "step": 51000 }, { "epoch": 3.1980936853326645, "eval_loss": 0.8293086290359497, "eval_runtime": 244.3828, "eval_samples_per_second": 522.017, "eval_steps_per_second": 
16.315, "step": 51000 }, { "epoch": 3.204364457264689, "grad_norm": 12.81237506866455, "learning_rate": 4.428573916981658e-06, "loss": 0.497, "step": 51100 }, { "epoch": 3.210635229196714, "grad_norm": 0.06836537271738052, "learning_rate": 4.393736173770663e-06, "loss": 0.6793, "step": 51200 }, { "epoch": 3.216906001128739, "grad_norm": 0.30741751194000244, "learning_rate": 4.358898430559669e-06, "loss": 0.7251, "step": 51300 }, { "epoch": 3.223176773060764, "grad_norm": 299.8288269042969, "learning_rate": 4.324060687348674e-06, "loss": 0.6471, "step": 51400 }, { "epoch": 3.2294475449927886, "grad_norm": 39.92329406738281, "learning_rate": 4.289222944137679e-06, "loss": 0.775, "step": 51500 }, { "epoch": 3.2294475449927886, "eval_loss": 0.8012564778327942, "eval_runtime": 245.7294, "eval_samples_per_second": 519.157, "eval_steps_per_second": 16.225, "step": 51500 }, { "epoch": 3.2357183169248134, "grad_norm": 7.386813640594482, "learning_rate": 4.2543852009266845e-06, "loss": 0.7289, "step": 51600 }, { "epoch": 3.241989088856838, "grad_norm": 0.8339570760726929, "learning_rate": 4.21954745771569e-06, "loss": 0.6894, "step": 51700 }, { "epoch": 3.2482598607888633, "grad_norm": 103.696533203125, "learning_rate": 4.184709714504695e-06, "loss": 0.5677, "step": 51800 }, { "epoch": 3.254530632720888, "grad_norm": 157.94912719726562, "learning_rate": 4.149871971293699e-06, "loss": 0.317, "step": 51900 }, { "epoch": 3.260801404652913, "grad_norm": 0.6201029419898987, "learning_rate": 4.115034228082705e-06, "loss": 0.5376, "step": 52000 }, { "epoch": 3.260801404652913, "eval_loss": 0.785252034664154, "eval_runtime": 247.2797, "eval_samples_per_second": 515.902, "eval_steps_per_second": 16.123, "step": 52000 }, { "epoch": 3.2670721765849375, "grad_norm": 5.9356913566589355, "learning_rate": 4.08019648487171e-06, "loss": 0.4582, "step": 52100 }, { "epoch": 3.2733429485169623, "grad_norm": 99.29075622558594, "learning_rate": 4.045358741660716e-06, "loss": 0.8505, "step": 52200 
}, { "epoch": 3.279613720448987, "grad_norm": 7.142418384552002, "learning_rate": 4.01052099844972e-06, "loss": 0.6236, "step": 52300 }, { "epoch": 3.285884492381012, "grad_norm": 0.18595051765441895, "learning_rate": 3.975683255238726e-06, "loss": 0.7388, "step": 52400 }, { "epoch": 3.292155264313037, "grad_norm": 0.26398783922195435, "learning_rate": 3.940845512027731e-06, "loss": 0.7061, "step": 52500 }, { "epoch": 3.292155264313037, "eval_loss": 0.786342203617096, "eval_runtime": 246.6236, "eval_samples_per_second": 517.274, "eval_steps_per_second": 16.166, "step": 52500 }, { "epoch": 3.2984260362450617, "grad_norm": 0.15353605151176453, "learning_rate": 3.906007768816736e-06, "loss": 0.5411, "step": 52600 }, { "epoch": 3.3046968081770864, "grad_norm": 0.6905626654624939, "learning_rate": 3.8711700256057415e-06, "loss": 0.9511, "step": 52700 }, { "epoch": 3.3109675801091116, "grad_norm": 120.66680145263672, "learning_rate": 3.836332282394747e-06, "loss": 0.5364, "step": 52800 }, { "epoch": 3.3172383520411364, "grad_norm": 22.492393493652344, "learning_rate": 3.801494539183752e-06, "loss": 0.5795, "step": 52900 }, { "epoch": 3.323509123973161, "grad_norm": 11.335774421691895, "learning_rate": 3.766656795972757e-06, "loss": 0.5305, "step": 53000 }, { "epoch": 3.323509123973161, "eval_loss": 0.787602961063385, "eval_runtime": 250.8532, "eval_samples_per_second": 508.552, "eval_steps_per_second": 15.894, "step": 53000 }, { "epoch": 3.329779895905186, "grad_norm": 0.07559686154127121, "learning_rate": 3.7318190527617626e-06, "loss": 0.8051, "step": 53100 }, { "epoch": 3.3360506678372106, "grad_norm": 0.06827156990766525, "learning_rate": 3.6969813095507677e-06, "loss": 0.5342, "step": 53200 }, { "epoch": 3.3423214397692353, "grad_norm": 1.358184576034546, "learning_rate": 3.662143566339773e-06, "loss": 0.4567, "step": 53300 }, { "epoch": 3.3485922117012605, "grad_norm": 58.48233413696289, "learning_rate": 3.627305823128778e-06, "loss": 0.9751, "step": 53400 }, { 
"epoch": 3.3548629836332853, "grad_norm": 0.13244691491127014, "learning_rate": 3.592468079917783e-06, "loss": 0.4413, "step": 53500 }, { "epoch": 3.3548629836332853, "eval_loss": 0.8008161783218384, "eval_runtime": 248.8641, "eval_samples_per_second": 512.617, "eval_steps_per_second": 16.021, "step": 53500 }, { "epoch": 3.36113375556531, "grad_norm": 5.010788917541504, "learning_rate": 3.5576303367067884e-06, "loss": 0.6011, "step": 53600 }, { "epoch": 3.3674045274973348, "grad_norm": 0.032868873327970505, "learning_rate": 3.5227925934957935e-06, "loss": 0.4708, "step": 53700 }, { "epoch": 3.37367529942936, "grad_norm": 2.3022570610046387, "learning_rate": 3.488303227716909e-06, "loss": 0.6167, "step": 53800 }, { "epoch": 3.3799460713613847, "grad_norm": 0.7494950890541077, "learning_rate": 3.453465484505914e-06, "loss": 0.7653, "step": 53900 }, { "epoch": 3.3862168432934094, "grad_norm": 1.9640907049179077, "learning_rate": 3.4186277412949194e-06, "loss": 0.7781, "step": 54000 }, { "epoch": 3.3862168432934094, "eval_loss": 0.7897498607635498, "eval_runtime": 250.7968, "eval_samples_per_second": 508.667, "eval_steps_per_second": 15.897, "step": 54000 }, { "epoch": 3.392487615225434, "grad_norm": 95.03298950195312, "learning_rate": 3.3837899980839245e-06, "loss": 0.9323, "step": 54100 }, { "epoch": 3.398758387157459, "grad_norm": 1.3489042520523071, "learning_rate": 3.3489522548729296e-06, "loss": 0.6003, "step": 54200 }, { "epoch": 3.4050291590894837, "grad_norm": 1.4920170307159424, "learning_rate": 3.314114511661935e-06, "loss": 0.5268, "step": 54300 }, { "epoch": 3.411299931021509, "grad_norm": 71.25545501708984, "learning_rate": 3.27927676845094e-06, "loss": 0.6639, "step": 54400 }, { "epoch": 3.4175707029535336, "grad_norm": 1.5343536138534546, "learning_rate": 3.2444390252399456e-06, "loss": 0.388, "step": 54500 }, { "epoch": 3.4175707029535336, "eval_loss": 0.7854874730110168, "eval_runtime": 247.7117, "eval_samples_per_second": 515.002, 
"eval_steps_per_second": 16.095, "step": 54500 }, { "epoch": 3.4238414748855583, "grad_norm": 0.22106263041496277, "learning_rate": 3.2096012820289502e-06, "loss": 0.7258, "step": 54600 }, { "epoch": 3.430112246817583, "grad_norm": 0.10803945362567902, "learning_rate": 3.1747635388179553e-06, "loss": 0.6475, "step": 54700 }, { "epoch": 3.4363830187496083, "grad_norm": 89.42733764648438, "learning_rate": 3.139925795606961e-06, "loss": 0.795, "step": 54800 }, { "epoch": 3.442653790681633, "grad_norm": 0.15668845176696777, "learning_rate": 3.105088052395966e-06, "loss": 0.4978, "step": 54900 }, { "epoch": 3.4489245626136578, "grad_norm": 60.56550216674805, "learning_rate": 3.070250309184971e-06, "loss": 0.6259, "step": 55000 }, { "epoch": 3.4489245626136578, "eval_loss": 0.7704712748527527, "eval_runtime": 250.1048, "eval_samples_per_second": 510.074, "eval_steps_per_second": 15.941, "step": 55000 }, { "epoch": 3.4551953345456825, "grad_norm": 0.28135305643081665, "learning_rate": 3.0354125659739764e-06, "loss": 0.791, "step": 55100 }, { "epoch": 3.4614661064777072, "grad_norm": 120.33629608154297, "learning_rate": 3.0005748227629815e-06, "loss": 0.7602, "step": 55200 }, { "epoch": 3.467736878409732, "grad_norm": 0.6213288903236389, "learning_rate": 2.965737079551987e-06, "loss": 0.2236, "step": 55300 }, { "epoch": 3.474007650341757, "grad_norm": 0.051405176520347595, "learning_rate": 2.930899336340992e-06, "loss": 0.5577, "step": 55400 }, { "epoch": 3.480278422273782, "grad_norm": 6.140790939331055, "learning_rate": 2.8960615931299975e-06, "loss": 0.4214, "step": 55500 }, { "epoch": 3.480278422273782, "eval_loss": 0.768252432346344, "eval_runtime": 248.6626, "eval_samples_per_second": 513.032, "eval_steps_per_second": 16.034, "step": 55500 }, { "epoch": 3.4865491942058067, "grad_norm": 0.051673661917448044, "learning_rate": 2.8612238499190026e-06, "loss": 0.7335, "step": 55600 }, { "epoch": 3.4928199661378314, "grad_norm": 5.123118877410889, "learning_rate": 
2.8263861067080077e-06, "loss": 0.7536, "step": 55700 }, { "epoch": 3.4990907380698566, "grad_norm": 0.7104228734970093, "learning_rate": 2.791548363497013e-06, "loss": 0.4577, "step": 55800 }, { "epoch": 3.5053615100018813, "grad_norm": 49.410400390625, "learning_rate": 2.7567106202860182e-06, "loss": 0.5869, "step": 55900 }, { "epoch": 3.511632281933906, "grad_norm": 0.0593554824590683, "learning_rate": 2.7218728770750237e-06, "loss": 0.8563, "step": 56000 }, { "epoch": 3.511632281933906, "eval_loss": 0.7587498426437378, "eval_runtime": 247.0433, "eval_samples_per_second": 516.395, "eval_steps_per_second": 16.139, "step": 56000 }, { "epoch": 3.517903053865931, "grad_norm": 8.727328300476074, "learning_rate": 2.6870351338640284e-06, "loss": 0.9291, "step": 56100 }, { "epoch": 3.5241738257979556, "grad_norm": 0.023664651438593864, "learning_rate": 2.6521973906530334e-06, "loss": 0.4387, "step": 56200 }, { "epoch": 3.5304445977299803, "grad_norm": 2.834498405456543, "learning_rate": 2.617359647442039e-06, "loss": 0.4491, "step": 56300 }, { "epoch": 3.5367153696620055, "grad_norm": 1.9824761152267456, "learning_rate": 2.582870281663154e-06, "loss": 0.506, "step": 56400 }, { "epoch": 3.5429861415940302, "grad_norm": 0.7142437100410461, "learning_rate": 2.5480325384521594e-06, "loss": 0.6626, "step": 56500 }, { "epoch": 3.5429861415940302, "eval_loss": 0.7634491920471191, "eval_runtime": 247.6797, "eval_samples_per_second": 515.068, "eval_steps_per_second": 16.097, "step": 56500 }, { "epoch": 3.549256913526055, "grad_norm": 0.030130065977573395, "learning_rate": 2.5131947952411645e-06, "loss": 0.8654, "step": 56600 }, { "epoch": 3.5555276854580797, "grad_norm": 0.757265031337738, "learning_rate": 2.4783570520301695e-06, "loss": 0.4455, "step": 56700 }, { "epoch": 3.561798457390105, "grad_norm": 130.99807739257812, "learning_rate": 2.443519308819175e-06, "loss": 0.4593, "step": 56800 }, { "epoch": 3.5680692293221297, "grad_norm": 86.36803436279297, "learning_rate": 
2.40868156560818e-06, "loss": 0.878, "step": 56900 }, { "epoch": 3.5743400012541544, "grad_norm": 0.8545703887939453, "learning_rate": 2.373843822397185e-06, "loss": 0.3737, "step": 57000 }, { "epoch": 3.5743400012541544, "eval_loss": 0.7617383599281311, "eval_runtime": 246.3862, "eval_samples_per_second": 517.773, "eval_steps_per_second": 16.182, "step": 57000 }, { "epoch": 3.580610773186179, "grad_norm": 100.52796173095703, "learning_rate": 2.3390060791861902e-06, "loss": 0.377, "step": 57100 }, { "epoch": 3.586881545118204, "grad_norm": 31.44060516357422, "learning_rate": 2.3041683359751957e-06, "loss": 0.6894, "step": 57200 }, { "epoch": 3.5931523170502286, "grad_norm": 0.2915436625480652, "learning_rate": 2.2693305927642008e-06, "loss": 0.6635, "step": 57300 }, { "epoch": 3.599423088982254, "grad_norm": 0.009617321193218231, "learning_rate": 2.2344928495532063e-06, "loss": 0.9224, "step": 57400 }, { "epoch": 3.6056938609142786, "grad_norm": 0.21305809915065765, "learning_rate": 2.1996551063422113e-06, "loss": 0.635, "step": 57500 }, { "epoch": 3.6056938609142786, "eval_loss": 0.7668555974960327, "eval_runtime": 248.1105, "eval_samples_per_second": 514.174, "eval_steps_per_second": 16.069, "step": 57500 }, { "epoch": 3.6119646328463033, "grad_norm": 11.654231071472168, "learning_rate": 2.164817363131217e-06, "loss": 0.6797, "step": 57600 }, { "epoch": 3.618235404778328, "grad_norm": 1.0893511772155762, "learning_rate": 2.1299796199202215e-06, "loss": 0.9814, "step": 57700 }, { "epoch": 3.6245061767103532, "grad_norm": 0.3305797278881073, "learning_rate": 2.095141876709227e-06, "loss": 0.9893, "step": 57800 }, { "epoch": 3.630776948642378, "grad_norm": 0.10635466873645782, "learning_rate": 2.060304133498232e-06, "loss": 0.6753, "step": 57900 }, { "epoch": 3.6370477205744027, "grad_norm": 0.09898664057254791, "learning_rate": 2.0254663902872375e-06, "loss": 0.8349, "step": 58000 }, { "epoch": 3.6370477205744027, "eval_loss": 0.7500940561294556, "eval_runtime": 
252.1601, "eval_samples_per_second": 505.917, "eval_steps_per_second": 15.811, "step": 58000 }, { "epoch": 3.6433184925064275, "grad_norm": 0.05218241736292839, "learning_rate": 1.9906286470762426e-06, "loss": 0.8523, "step": 58100 }, { "epoch": 3.649589264438452, "grad_norm": 0.648098886013031, "learning_rate": 1.9557909038652477e-06, "loss": 0.2962, "step": 58200 }, { "epoch": 3.655860036370477, "grad_norm": 19.993263244628906, "learning_rate": 1.920953160654253e-06, "loss": 0.6585, "step": 58300 }, { "epoch": 3.662130808302502, "grad_norm": 549.2650146484375, "learning_rate": 1.886115417443258e-06, "loss": 1.0247, "step": 58400 }, { "epoch": 3.668401580234527, "grad_norm": 53.44794464111328, "learning_rate": 1.8516260516643734e-06, "loss": 0.8638, "step": 58500 }, { "epoch": 3.668401580234527, "eval_loss": 0.7576786279678345, "eval_runtime": 251.8307, "eval_samples_per_second": 506.578, "eval_steps_per_second": 15.832, "step": 58500 }, { "epoch": 3.6746723521665516, "grad_norm": 0.11638414114713669, "learning_rate": 1.8167883084533785e-06, "loss": 0.9456, "step": 58600 }, { "epoch": 3.6809431240985764, "grad_norm": 0.16805872321128845, "learning_rate": 1.7819505652423837e-06, "loss": 0.5401, "step": 58700 }, { "epoch": 3.6872138960306016, "grad_norm": 161.84934997558594, "learning_rate": 1.747112822031389e-06, "loss": 0.6602, "step": 58800 }, { "epoch": 3.6934846679626263, "grad_norm": 0.16537758708000183, "learning_rate": 1.7122750788203943e-06, "loss": 0.7543, "step": 58900 }, { "epoch": 3.699755439894651, "grad_norm": 82.06924438476562, "learning_rate": 1.6774373356093992e-06, "loss": 0.7893, "step": 59000 }, { "epoch": 3.699755439894651, "eval_loss": 0.7599766850471497, "eval_runtime": 255.6784, "eval_samples_per_second": 498.955, "eval_steps_per_second": 15.594, "step": 59000 }, { "epoch": 3.706026211826676, "grad_norm": 123.94532012939453, "learning_rate": 1.6425995923984044e-06, "loss": 0.7746, "step": 59100 }, { "epoch": 3.7122969837587005, "grad_norm": 
0.06561436504125595, "learning_rate": 1.6077618491874097e-06, "loss": 0.6539, "step": 59200 }, { "epoch": 3.7185677556907253, "grad_norm": 243.56668090820312, "learning_rate": 1.572924105976415e-06, "loss": 0.8083, "step": 59300 }, { "epoch": 3.7248385276227505, "grad_norm": 0.13773566484451294, "learning_rate": 1.5380863627654203e-06, "loss": 0.3429, "step": 59400 }, { "epoch": 3.731109299554775, "grad_norm": 412.2792053222656, "learning_rate": 1.5032486195544256e-06, "loss": 0.5005, "step": 59500 }, { "epoch": 3.731109299554775, "eval_loss": 0.7445316314697266, "eval_runtime": 251.276, "eval_samples_per_second": 507.697, "eval_steps_per_second": 15.867, "step": 59500 }, { "epoch": 3.7373800714868, "grad_norm": 10.323953628540039, "learning_rate": 1.4684108763434306e-06, "loss": 0.6238, "step": 59600 }, { "epoch": 3.7436508434188247, "grad_norm": 34.32875061035156, "learning_rate": 1.4335731331324357e-06, "loss": 0.4343, "step": 59700 }, { "epoch": 3.74992161535085, "grad_norm": 0.08429472893476486, "learning_rate": 1.398735389921441e-06, "loss": 0.8189, "step": 59800 }, { "epoch": 3.7561923872828746, "grad_norm": 68.88423156738281, "learning_rate": 1.3638976467104463e-06, "loss": 0.6272, "step": 59900 }, { "epoch": 3.7624631592148994, "grad_norm": 0.1870589703321457, "learning_rate": 1.3290599034994513e-06, "loss": 0.2982, "step": 60000 }, { "epoch": 3.7624631592148994, "eval_loss": 0.7597461342811584, "eval_runtime": 254.4738, "eval_samples_per_second": 501.317, "eval_steps_per_second": 15.668, "step": 60000 }, { "epoch": 3.768733931146924, "grad_norm": 0.051242515444755554, "learning_rate": 1.2942221602884566e-06, "loss": 0.7028, "step": 60100 }, { "epoch": 3.775004703078949, "grad_norm": 187.53872680664062, "learning_rate": 1.2593844170774619e-06, "loss": 0.9447, "step": 60200 }, { "epoch": 3.7812754750109736, "grad_norm": 64.70340728759766, "learning_rate": 1.224546673866467e-06, "loss": 0.6175, "step": 60300 }, { "epoch": 3.787546246942999, "grad_norm": 
0.8817376494407654, "learning_rate": 1.1897089306554722e-06, "loss": 0.5856, "step": 60400 }, { "epoch": 3.7938170188750235, "grad_norm": 88.64114379882812, "learning_rate": 1.1548711874444775e-06, "loss": 0.8249, "step": 60500 }, { "epoch": 3.7938170188750235, "eval_loss": 0.750523030757904, "eval_runtime": 252.8744, "eval_samples_per_second": 504.488, "eval_steps_per_second": 15.767, "step": 60500 }, { "epoch": 3.8000877908070483, "grad_norm": 0.041767679154872894, "learning_rate": 1.1203818216655927e-06, "loss": 0.6617, "step": 60600 }, { "epoch": 3.806358562739073, "grad_norm": 74.78905487060547, "learning_rate": 1.085544078454598e-06, "loss": 0.5767, "step": 60700 }, { "epoch": 3.812629334671098, "grad_norm": 0.11142675578594208, "learning_rate": 1.050706335243603e-06, "loss": 1.0094, "step": 60800 }, { "epoch": 3.818900106603123, "grad_norm": 92.60441589355469, "learning_rate": 1.0158685920326083e-06, "loss": 0.471, "step": 60900 }, { "epoch": 3.8251708785351477, "grad_norm": 42.58308410644531, "learning_rate": 9.810308488216134e-07, "loss": 0.6313, "step": 61000 }, { "epoch": 3.8251708785351477, "eval_loss": 0.7488948702812195, "eval_runtime": 251.0082, "eval_samples_per_second": 508.238, "eval_steps_per_second": 15.884, "step": 61000 }, { "epoch": 3.8314416504671724, "grad_norm": 46.6805305480957, "learning_rate": 9.461931056106186e-07, "loss": 0.6545, "step": 61100 }, { "epoch": 3.837712422399197, "grad_norm": 0.06978940218687057, "learning_rate": 9.113553623996238e-07, "loss": 0.699, "step": 61200 }, { "epoch": 3.843983194331222, "grad_norm": 0.933862030506134, "learning_rate": 8.76517619188629e-07, "loss": 0.6272, "step": 61300 }, { "epoch": 3.850253966263247, "grad_norm": 44.13498306274414, "learning_rate": 8.416798759776343e-07, "loss": 0.7375, "step": 61400 }, { "epoch": 3.856524738195272, "grad_norm": 3.0953245162963867, "learning_rate": 8.068421327666394e-07, "loss": 0.4213, "step": 61500 }, { "epoch": 3.856524738195272, "eval_loss": 
0.7490043640136719, "eval_runtime": 251.9621, "eval_samples_per_second": 506.314, "eval_steps_per_second": 15.824, "step": 61500 }, { "epoch": 3.8627955101272966, "grad_norm": 15.084046363830566, "learning_rate": 7.720043895556446e-07, "loss": 0.6631, "step": 61600 }, { "epoch": 3.8690662820593213, "grad_norm": 34.3710823059082, "learning_rate": 7.371666463446499e-07, "loss": 0.552, "step": 61700 }, { "epoch": 3.8753370539913465, "grad_norm": 0.2596281170845032, "learning_rate": 7.023289031336551e-07, "loss": 0.7041, "step": 61800 }, { "epoch": 3.8816078259233713, "grad_norm": 0.04028361290693283, "learning_rate": 6.674911599226603e-07, "loss": 0.8457, "step": 61900 }, { "epoch": 3.887878597855396, "grad_norm": 0.2941274344921112, "learning_rate": 6.326534167116654e-07, "loss": 0.8104, "step": 62000 }, { "epoch": 3.887878597855396, "eval_loss": 0.7476946115493774, "eval_runtime": 276.1611, "eval_samples_per_second": 461.948, "eval_steps_per_second": 14.437, "step": 62000 }, { "epoch": 3.8941493697874208, "grad_norm": 51.24428939819336, "learning_rate": 5.978156735006706e-07, "loss": 0.4494, "step": 62100 }, { "epoch": 3.9004201417194455, "grad_norm": 89.3067855834961, "learning_rate": 5.629779302896759e-07, "loss": 0.6947, "step": 62200 }, { "epoch": 3.9066909136514703, "grad_norm": 0.06883756071329117, "learning_rate": 5.281401870786811e-07, "loss": 0.8061, "step": 62300 }, { "epoch": 3.9129616855834954, "grad_norm": 0.8000829219818115, "learning_rate": 4.933024438676863e-07, "loss": 0.416, "step": 62400 }, { "epoch": 3.91923245751552, "grad_norm": 119.61589813232422, "learning_rate": 4.5846470065669146e-07, "loss": 0.7359, "step": 62500 }, { "epoch": 3.91923245751552, "eval_loss": 0.7468039989471436, "eval_runtime": 257.4303, "eval_samples_per_second": 495.559, "eval_steps_per_second": 15.488, "step": 62500 }, { "epoch": 3.925503229447545, "grad_norm": 0.29899609088897705, "learning_rate": 4.2362695744569673e-07, "loss": 0.7408, "step": 62600 }, { "epoch": 
3.9317740013795697, "grad_norm": 112.43661499023438, "learning_rate": 3.887892142347019e-07, "loss": 0.6255, "step": 62700 }, { "epoch": 3.938044773311595, "grad_norm": 474.4875793457031, "learning_rate": 3.5395147102370713e-07, "loss": 0.7865, "step": 62800 }, { "epoch": 3.9443155452436196, "grad_norm": 0.3225669860839844, "learning_rate": 3.191137278127123e-07, "loss": 0.4879, "step": 62900 }, { "epoch": 3.9505863171756443, "grad_norm": 7.089817047119141, "learning_rate": 2.8427598460171753e-07, "loss": 0.5196, "step": 63000 }, { "epoch": 3.9505863171756443, "eval_loss": 0.7484961748123169, "eval_runtime": 258.5356, "eval_samples_per_second": 493.441, "eval_steps_per_second": 15.421, "step": 63000 }, { "epoch": 3.956857089107669, "grad_norm": 0.09167669713497162, "learning_rate": 2.494382413907227e-07, "loss": 0.5683, "step": 63100 }, { "epoch": 3.963127861039694, "grad_norm": 12.482440948486328, "learning_rate": 2.1460049817972793e-07, "loss": 0.5141, "step": 63200 }, { "epoch": 3.9693986329717186, "grad_norm": 8.954193115234375, "learning_rate": 1.8011113240084312e-07, "loss": 0.6068, "step": 63300 }, { "epoch": 3.9756694049037438, "grad_norm": 219.27337646484375, "learning_rate": 1.452733891898483e-07, "loss": 0.5929, "step": 63400 }, { "epoch": 3.9819401768357685, "grad_norm": 1.6949673891067505, "learning_rate": 1.104356459788535e-07, "loss": 0.7513, "step": 63500 }, { "epoch": 3.9819401768357685, "eval_loss": 0.7482015490531921, "eval_runtime": 257.9174, "eval_samples_per_second": 494.623, "eval_steps_per_second": 15.458, "step": 63500 }, { "epoch": 3.9882109487677933, "grad_norm": 0.34383705258369446, "learning_rate": 7.55979027678587e-08, "loss": 0.5053, "step": 63600 }, { "epoch": 3.994481720699818, "grad_norm": 0.20212756097316742, "learning_rate": 4.0760159556863914e-08, "loss": 0.5707, "step": 63700 } ], "logging_steps": 100, "max_steps": 63788, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "stateful_callbacks": { 
"TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }