{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.8675496688741724, "eval_steps": 500, "global_step": 131400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0029433406916850625, "grad_norm": 1.6272720098495483, "learning_rate": 1.4716703458425313e-07, "loss": 4.7115, "step": 100 }, { "epoch": 0.005886681383370125, "grad_norm": 1.5667617321014404, "learning_rate": 2.9433406916850625e-07, "loss": 4.6948, "step": 200 }, { "epoch": 0.008830022075055188, "grad_norm": 1.686622142791748, "learning_rate": 4.4150110375275946e-07, "loss": 4.6548, "step": 300 }, { "epoch": 0.01177336276674025, "grad_norm": 1.721745491027832, "learning_rate": 5.886681383370125e-07, "loss": 4.6055, "step": 400 }, { "epoch": 0.014716703458425313, "grad_norm": 1.9134653806686401, "learning_rate": 7.358351729212657e-07, "loss": 4.5234, "step": 500 }, { "epoch": 0.014716703458425313, "eval_loss": 4.387777328491211, "eval_runtime": 650.7152, "eval_samples_per_second": 417.687, "eval_steps_per_second": 13.053, "step": 500 }, { "epoch": 0.017660044150110375, "grad_norm": 2.084204912185669, "learning_rate": 8.830022075055189e-07, "loss": 4.4338, "step": 600 }, { "epoch": 0.020603384841795438, "grad_norm": 2.1029446125030518, "learning_rate": 1.030169242089772e-06, "loss": 4.2938, "step": 700 }, { "epoch": 0.0235467255334805, "grad_norm": 2.0130162239074707, "learning_rate": 1.177336276674025e-06, "loss": 4.1176, "step": 800 }, { "epoch": 0.026490066225165563, "grad_norm": 1.6967988014221191, "learning_rate": 1.3245033112582784e-06, "loss": 3.9373, "step": 900 }, { "epoch": 0.029433406916850625, "grad_norm": 1.888842225074768, "learning_rate": 1.4716703458425313e-06, "loss": 3.7241, "step": 1000 }, { "epoch": 0.029433406916850625, "eval_loss": 3.4721217155456543, "eval_runtime": 659.4135, "eval_samples_per_second": 412.177, "eval_steps_per_second": 12.881, "step": 1000 }, { "epoch": 0.03237674760853569, "grad_norm": 1.4486851692199707, "learning_rate": 1.6188373804267845e-06, "loss": 3.5965, "step": 1100 }, { "epoch": 0.03532008830022075, "grad_norm": 1.3971041440963745, "learning_rate": 1.7660044150110378e-06, "loss": 3.4949, "step": 1200 }, { "epoch": 0.03826342899190582, "grad_norm": 1.4760615825653076, "learning_rate": 1.9131714495952908e-06, "loss": 3.4542, "step": 1300 }, { "epoch": 0.041206769683590876, "grad_norm": 1.036159634590149, "learning_rate": 2.060338484179544e-06, "loss": 3.4345, "step": 1400 }, { "epoch": 0.04415011037527594, "grad_norm": 0.8445461988449097, "learning_rate": 2.207505518763797e-06, "loss": 3.3955, "step": 1500 }, { "epoch": 0.04415011037527594, "eval_loss": 3.2453179359436035, "eval_runtime": 657.7735, "eval_samples_per_second": 413.205, "eval_steps_per_second": 12.913, "step": 1500 }, { "epoch": 0.047093451066961, "grad_norm": 0.5998972654342651, "learning_rate": 2.35467255334805e-06, "loss": 3.3818, "step": 1600 }, { "epoch": 0.05003679175864607, "grad_norm": 3.5936014652252197, "learning_rate": 2.5018395879323034e-06, "loss": 3.3608, "step": 1700 }, { "epoch": 0.052980132450331126, "grad_norm": 2.275876760482788, "learning_rate": 2.6490066225165567e-06, "loss": 3.3377, "step": 1800 }, { "epoch": 0.05592347314201619, "grad_norm": 0.41647854447364807, "learning_rate": 2.7961736571008097e-06, "loss": 3.326, "step": 1900 }, { "epoch": 0.05886681383370125, "grad_norm": 2.0164637565612793, "learning_rate": 2.9433406916850626e-06, "loss": 3.3061, "step": 2000 }, { "epoch": 0.05886681383370125, "eval_loss": 3.16919207572937, "eval_runtime": 654.8673, "eval_samples_per_second": 415.038, "eval_steps_per_second": 12.971, "step": 2000 }, { "epoch": 0.06181015452538632, "grad_norm": 5.002352714538574, "learning_rate": 3.090507726269316e-06, "loss": 3.308, "step": 2100 }, { "epoch": 0.06475349521707138, "grad_norm": 0.20406407117843628, "learning_rate": 3.2362030905077264e-06, "loss": 3.2887, "step": 2200 }, { "epoch": 0.06769683590875644, "grad_norm": 0.305467426776886, "learning_rate": 3.3833701250919798e-06, "loss": 3.2963, "step": 2300 }, { "epoch": 0.0706401766004415, "grad_norm": 1.8477975130081177, "learning_rate": 3.5305371596762327e-06, "loss": 3.2744, "step": 2400 }, { "epoch": 0.07358351729212656, "grad_norm": 0.42971035838127136, "learning_rate": 3.6777041942604856e-06, "loss": 3.2601, "step": 2500 }, { "epoch": 0.07358351729212656, "eval_loss": 3.141599655151367, "eval_runtime": 645.7029, "eval_samples_per_second": 420.929, "eval_steps_per_second": 13.155, "step": 2500 }, { "epoch": 0.07652685798381163, "grad_norm": 0.14378389716148376, "learning_rate": 3.824871228844739e-06, "loss": 3.271, "step": 2600 }, { "epoch": 0.07947019867549669, "grad_norm": 0.2861994802951813, "learning_rate": 3.972038263428992e-06, "loss": 3.2501, "step": 2700 }, { "epoch": 0.08241353936718175, "grad_norm": 0.1584373563528061, "learning_rate": 4.119205298013245e-06, "loss": 3.2536, "step": 2800 }, { "epoch": 0.08535688005886681, "grad_norm": 3.9958763122558594, "learning_rate": 4.266372332597499e-06, "loss": 3.2689, "step": 2900 }, { "epoch": 0.08830022075055188, "grad_norm": 0.23161353170871735, "learning_rate": 4.413539367181752e-06, "loss": 3.2362, "step": 3000 }, { "epoch": 0.08830022075055188, "eval_loss": 3.1195619106292725, "eval_runtime": 653.1511, "eval_samples_per_second": 416.129, "eval_steps_per_second": 13.005, "step": 3000 }, { "epoch": 0.09124356144223694, "grad_norm": 0.1324678361415863, "learning_rate": 4.5592347314201624e-06, "loss": 3.2281, "step": 3100 }, { "epoch": 0.094186902133922, "grad_norm": 12.537768363952637, "learning_rate": 4.706401766004415e-06, "loss": 3.2351, "step": 3200 }, { "epoch": 0.09713024282560706, "grad_norm": 0.3400228023529053, "learning_rate": 4.852097130242826e-06, "loss": 3.2173, "step": 3300 }, { "epoch": 0.10007358351729213, "grad_norm": 0.10861583054065704, "learning_rate": 4.999264164827079e-06, "loss": 3.2055, "step": 3400 }, { "epoch": 0.10301692420897719, "grad_norm": 0.10545468330383301, "learning_rate": 5.1464311994113325e-06, "loss": 3.2198, "step": 3500 }, { "epoch": 0.10301692420897719, "eval_loss": 3.1080849170684814, "eval_runtime": 651.2322, "eval_samples_per_second": 417.355, "eval_steps_per_second": 13.043, "step": 3500 }, { "epoch": 0.10596026490066225, "grad_norm": 0.11501341313123703, "learning_rate": 5.293598233995585e-06, "loss": 3.2116, "step": 3600 }, { "epoch": 0.10890360559234731, "grad_norm": 0.2385537326335907, "learning_rate": 5.440765268579838e-06, "loss": 3.2088, "step": 3700 }, { "epoch": 0.11184694628403238, "grad_norm": 0.11578717827796936, "learning_rate": 5.587932303164092e-06, "loss": 3.2043, "step": 3800 }, { "epoch": 0.11479028697571744, "grad_norm": 0.3394097685813904, "learning_rate": 5.735099337748344e-06, "loss": 3.1943, "step": 3900 }, { "epoch": 0.1177336276674025, "grad_norm": 0.3442012667655945, "learning_rate": 5.882266372332598e-06, "loss": 3.1897, "step": 4000 }, { "epoch": 0.1177336276674025, "eval_loss": 3.1026694774627686, "eval_runtime": 651.809, "eval_samples_per_second": 416.986, "eval_steps_per_second": 13.031, "step": 4000 }, { "epoch": 0.12067696835908756, "grad_norm": 0.10929220914840698, "learning_rate": 6.029433406916851e-06, "loss": 3.2131, "step": 4100 }, { "epoch": 0.12362030905077263, "grad_norm": 23.486799240112305, "learning_rate": 6.1766004415011035e-06, "loss": 3.198, "step": 4200 }, { "epoch": 0.12656364974245768, "grad_norm": 7.512275695800781, "learning_rate": 6.323767476085358e-06, "loss": 3.1892, "step": 4300 }, { "epoch": 0.12950699043414277, "grad_norm": 0.11411729454994202, "learning_rate": 6.470934510669611e-06, "loss": 3.1753, "step": 4400 }, { "epoch": 0.13245033112582782, "grad_norm": 0.12140727788209915, "learning_rate": 6.618101545253864e-06, "loss": 3.1722, "step": 4500 }, { "epoch": 0.13245033112582782, "eval_loss": 3.0840346813201904, "eval_runtime": 652.614, "eval_samples_per_second": 416.471, "eval_steps_per_second": 13.015, "step": 4500 }, { "epoch": 0.13539367181751288, "grad_norm": 0.23977194726467133, "learning_rate": 6.765268579838117e-06, "loss": 3.1599, "step": 4600 }, { "epoch": 0.13833701250919794, "grad_norm": 0.1104390099644661, "learning_rate": 6.91243561442237e-06, "loss": 3.166, "step": 4700 }, { "epoch": 0.141280353200883, "grad_norm": 0.11104759573936462, "learning_rate": 7.059602649006623e-06, "loss": 3.1585, "step": 4800 }, { "epoch": 0.14422369389256806, "grad_norm": 0.19156178832054138, "learning_rate": 7.206769683590876e-06, "loss": 3.1698, "step": 4900 }, { "epoch": 0.14716703458425312, "grad_norm": 0.10963413864374161, "learning_rate": 7.352465047829287e-06, "loss": 3.1766, "step": 5000 }, { "epoch": 0.14716703458425312, "eval_loss": 3.0782082080841064, "eval_runtime": 653.6547, "eval_samples_per_second": 415.808, "eval_steps_per_second": 12.995, "step": 5000 }, { "epoch": 0.15011037527593818, "grad_norm": 11.723798751831055, "learning_rate": 7.49963208241354e-06, "loss": 3.1515, "step": 5100 }, { "epoch": 0.15305371596762327, "grad_norm": 0.13782736659049988, "learning_rate": 7.646799116997793e-06, "loss": 3.1487, "step": 5200 }, { "epoch": 0.15599705665930833, "grad_norm": 0.11667618900537491, "learning_rate": 7.793966151582047e-06, "loss": 3.1579, "step": 5300 }, { "epoch": 0.15894039735099338, "grad_norm": 0.11695076525211334, "learning_rate": 7.9411331861663e-06, "loss": 3.1533, "step": 5400 }, { "epoch": 0.16188373804267844, "grad_norm": 0.1249813511967659, "learning_rate": 8.088300220750552e-06, "loss": 3.1433, "step": 5500 }, { "epoch": 0.16188373804267844, "eval_loss": 3.073453903198242, "eval_runtime": 649.6144, "eval_samples_per_second": 418.394, "eval_steps_per_second": 13.075, "step": 5500 }, { "epoch": 0.1648270787343635, "grad_norm": 0.10794921219348907, "learning_rate": 8.235467255334806e-06, "loss": 3.1454, "step": 5600 }, { "epoch": 0.16777041942604856, "grad_norm": 0.1324673742055893, "learning_rate": 8.382634289919059e-06, "loss": 3.1397, "step": 5700 }, { "epoch": 0.17071376011773362, "grad_norm": 0.1333872526884079, "learning_rate": 8.529801324503311e-06, "loss": 3.1422, "step": 5800 }, { "epoch": 0.17365710080941868, "grad_norm": 0.1157006174325943, "learning_rate": 8.676968359087566e-06, "loss": 3.1372, "step": 5900 }, { "epoch": 0.17660044150110377, "grad_norm": 0.17443318665027618, "learning_rate": 8.824135393671818e-06, "loss": 3.137, "step": 6000 }, { "epoch": 0.17660044150110377, "eval_loss": 3.0709736347198486, "eval_runtime": 650.5421, "eval_samples_per_second": 417.798, "eval_steps_per_second": 13.057, "step": 6000 }, { "epoch": 0.17954378219278883, "grad_norm": 0.17153222858905792, "learning_rate": 8.97130242825607e-06, "loss": 3.1297, "step": 6100 }, { "epoch": 0.18248712288447388, "grad_norm": 0.1273050606250763, "learning_rate": 9.118469462840325e-06, "loss": 3.1202, "step": 6200 }, { "epoch": 0.18543046357615894, "grad_norm": 0.12626317143440247, "learning_rate": 9.265636497424577e-06, "loss": 3.1256, "step": 6300 }, { "epoch": 0.188373804267844, "grad_norm": 0.12453300505876541, "learning_rate": 9.41280353200883e-06, "loss": 3.1185, "step": 6400 }, { "epoch": 0.19131714495952906, "grad_norm": 0.12433254718780518, "learning_rate": 9.559970566593084e-06, "loss": 3.1266, "step": 6500 }, { "epoch": 0.19131714495952906, "eval_loss": 3.0667169094085693, "eval_runtime": 645.9804, "eval_samples_per_second": 420.748, "eval_steps_per_second": 13.149, "step": 6500 }, { "epoch": 0.19426048565121412, "grad_norm": 0.8486097455024719, "learning_rate": 9.707137601177337e-06, "loss": 3.1197, "step": 6600 }, { "epoch": 0.19720382634289918, "grad_norm": 0.14500103890895844, "learning_rate": 9.85430463576159e-06, "loss": 3.1286, "step": 6700 }, { "epoch": 0.20014716703458427, "grad_norm": 0.1242411807179451, "learning_rate": 1.0001471670345843e-05, "loss": 3.1239, "step": 6800 }, { "epoch": 0.20309050772626933, "grad_norm": 0.12029566615819931, "learning_rate": 1.0148638704930096e-05, "loss": 3.1166, "step": 6900 }, { "epoch": 0.20603384841795438, "grad_norm": 0.11236037313938141, "learning_rate": 1.029580573951435e-05, "loss": 3.1054, "step": 7000 }, { "epoch": 0.20603384841795438, "eval_loss": 3.0663862228393555, "eval_runtime": 651.8183, "eval_samples_per_second": 416.98, "eval_steps_per_second": 13.031, "step": 7000 }, { "epoch": 0.20897718910963944, "grad_norm": 0.1379052996635437, "learning_rate": 1.0442972774098603e-05, "loss": 3.1103, "step": 7100 }, { "epoch": 0.2119205298013245, "grad_norm": 0.11774054914712906, "learning_rate": 1.0590139808682855e-05, "loss": 3.0929, "step": 7200 }, { "epoch": 0.21486387049300956, "grad_norm": 0.11906363815069199, "learning_rate": 1.073730684326711e-05, "loss": 3.1051, "step": 7300 }, { "epoch": 0.21780721118469462, "grad_norm": 0.11512956023216248, "learning_rate": 1.0884473877851362e-05, "loss": 3.1023, "step": 7400 }, { "epoch": 0.22075055187637968, "grad_norm": 0.11629810929298401, "learning_rate": 1.1031640912435614e-05, "loss": 3.0946, "step": 7500 }, { "epoch": 0.22075055187637968, "eval_loss": 3.0635571479797363, "eval_runtime": 648.2469, "eval_samples_per_second": 419.277, "eval_steps_per_second": 13.103, "step": 7500 }, { "epoch": 0.22369389256806477, "grad_norm": 0.2573147714138031, "learning_rate": 1.1178807947019867e-05, "loss": 3.0958, "step": 7600 }, { "epoch": 0.22663723325974983, "grad_norm": 0.1373881697654724, "learning_rate": 1.1325974981604123e-05, "loss": 3.0907, "step": 7700 }, { "epoch": 0.22958057395143489, "grad_norm": 0.11423376947641373, "learning_rate": 1.1473142016188374e-05, "loss": 3.1051, "step": 7800 }, { "epoch": 0.23252391464311994, "grad_norm": 0.12365728616714478, "learning_rate": 1.1620309050772626e-05, "loss": 3.0965, "step": 7900 }, { "epoch": 0.235467255334805, "grad_norm": 0.12054827809333801, "learning_rate": 1.1767476085356882e-05, "loss": 3.0954, "step": 8000 }, { "epoch": 0.235467255334805, "eval_loss": 3.0617218017578125, "eval_runtime": 644.9072, "eval_samples_per_second": 421.448, "eval_steps_per_second": 13.171, "step": 8000 }, { "epoch": 0.23841059602649006, "grad_norm": 0.1252882331609726, "learning_rate": 1.1914643119941135e-05, "loss": 3.0693, "step": 8100 }, { "epoch": 0.24135393671817512, "grad_norm": 0.11890527606010437, "learning_rate": 1.2061810154525387e-05, "loss": 3.0906, "step": 8200 }, { "epoch": 0.24429727740986018, "grad_norm": 0.11226367205381393, "learning_rate": 1.2208977189109641e-05, "loss": 3.0881, "step": 8300 }, { "epoch": 0.24724061810154527, "grad_norm": 0.11838987469673157, "learning_rate": 1.2356144223693894e-05, "loss": 3.0867, "step": 8400 }, { "epoch": 0.2501839587932303, "grad_norm": 0.1156696304678917, "learning_rate": 1.2503311258278146e-05, "loss": 3.0867, "step": 8500 }, { "epoch": 0.2501839587932303, "eval_loss": 3.061007261276245, "eval_runtime": 639.4271, "eval_samples_per_second": 425.06, "eval_steps_per_second": 13.284, "step": 8500 }, { "epoch": 0.25312729948491536, "grad_norm": 0.10668116062879562, "learning_rate": 1.26504782928624e-05, "loss": 3.0909, "step": 8600 }, { "epoch": 0.2560706401766004, "grad_norm": 0.10439509153366089, "learning_rate": 1.2797645327446653e-05, "loss": 3.0877, "step": 8700 }, { "epoch": 0.25901398086828553, "grad_norm": 0.11207351088523865, "learning_rate": 1.2944812362030906e-05, "loss": 3.0837, "step": 8800 }, { "epoch": 0.2619573215599706, "grad_norm": 0.11145055294036865, "learning_rate": 1.309197939661516e-05, "loss": 3.0865, "step": 8900 }, { "epoch": 0.26490066225165565, "grad_norm": 0.11839548498392105, "learning_rate": 1.3239146431199412e-05, "loss": 3.0846, "step": 9000 }, { "epoch": 0.26490066225165565, "eval_loss": 3.0607030391693115, "eval_runtime": 644.4968, "eval_samples_per_second": 421.717, "eval_steps_per_second": 13.179, "step": 9000 }, { "epoch": 0.2678440029433407, "grad_norm": 0.10559707880020142, "learning_rate": 1.3384841795437824e-05, "loss": 3.0798, "step": 9100 }, { "epoch": 0.27078734363502577, "grad_norm": 0.1135486364364624, "learning_rate": 1.3530537159676234e-05, "loss": 3.0928, "step": 9200 }, { "epoch": 0.2737306843267108, "grad_norm": 0.10340794175863266, "learning_rate": 1.3677704194260486e-05, "loss": 3.0794, "step": 9300 }, { "epoch": 0.2766740250183959, "grad_norm": 0.10411669313907623, "learning_rate": 1.382487122884474e-05, "loss": 3.0797, "step": 9400 }, { "epoch": 0.27961736571008095, "grad_norm": 0.11592269688844681, "learning_rate": 1.3972038263428993e-05, "loss": 3.0685, "step": 9500 }, { "epoch": 0.27961736571008095, "eval_loss": 3.0622897148132324, "eval_runtime": 641.85, "eval_samples_per_second": 423.456, "eval_steps_per_second": 13.234, "step": 9500 }, { "epoch": 0.282560706401766, "grad_norm": 0.10980008542537689, "learning_rate": 1.4119205298013246e-05, "loss": 3.0768, "step": 9600 }, { "epoch": 0.28550404709345106, "grad_norm": 0.10716495662927628, "learning_rate": 1.42663723325975e-05, "loss": 3.0657, "step": 9700 }, { "epoch": 0.2884473877851361, "grad_norm": 0.1136142909526825, "learning_rate": 1.4413539367181752e-05, "loss": 3.0838, "step": 9800 }, { "epoch": 0.2913907284768212, "grad_norm": 0.09968952089548111, "learning_rate": 1.4560706401766005e-05, "loss": 3.0775, "step": 9900 }, { "epoch": 0.29433406916850624, "grad_norm": 0.10055411607027054, "learning_rate": 1.470787343635026e-05, "loss": 3.0667, "step": 10000 }, { "epoch": 0.29433406916850624, "eval_loss": 3.0587265491485596, "eval_runtime": 644.5842, "eval_samples_per_second": 421.659, "eval_steps_per_second": 13.177, "step": 10000 }, { "epoch": 0.2972774098601913, "grad_norm": 0.10639077425003052, "learning_rate": 1.4855040470934512e-05, "loss": 3.088, "step": 10100 }, { "epoch": 0.30022075055187636, "grad_norm": 0.09320100396871567, "learning_rate": 1.5000735835172922e-05, "loss": 3.0824, "step": 10200 }, { "epoch": 0.3031640912435614, "grad_norm": 0.11812783777713776, "learning_rate": 1.5147902869757176e-05, "loss": 3.0754, "step": 10300 }, { "epoch": 0.30610743193524653, "grad_norm": 0.10227449983358383, "learning_rate": 1.529506990434143e-05, "loss": 3.064, "step": 10400 }, { "epoch": 0.3090507726269316, "grad_norm": 0.09317319840192795, "learning_rate": 1.5442236938925683e-05, "loss": 3.0637, "step": 10500 }, { "epoch": 0.3090507726269316, "eval_loss": 3.057762384414673, "eval_runtime": 635.7899, "eval_samples_per_second": 427.492, "eval_steps_per_second": 13.36, "step": 10500 }, { "epoch": 0.31199411331861665, "grad_norm": 0.10339343547821045, "learning_rate": 1.5589403973509937e-05, "loss": 3.0754, "step": 10600 }, { "epoch": 0.3149374540103017, "grad_norm": 0.10575199872255325, "learning_rate": 1.5736571008094188e-05, "loss": 3.0703, "step": 10700 }, { "epoch": 0.31788079470198677, "grad_norm": 0.09247788041830063, "learning_rate": 1.5883738042678442e-05, "loss": 3.0697, "step": 10800 }, { "epoch": 0.3208241353936718, "grad_norm": 0.10013869404792786, "learning_rate": 1.6030905077262696e-05, "loss": 3.0635, "step": 10900 }, { "epoch": 0.3237674760853569, "grad_norm": 0.10718971490859985, "learning_rate": 1.6178072111846947e-05, "loss": 3.0872, "step": 11000 }, { "epoch": 0.3237674760853569, "eval_loss": 3.057349920272827, "eval_runtime": 634.2613, "eval_samples_per_second": 428.522, "eval_steps_per_second": 13.392, "step": 11000 }, { "epoch": 0.32671081677704195, "grad_norm": 0.10293637216091156, "learning_rate": 1.63252391464312e-05, "loss": 3.0722, "step": 11100 }, { "epoch": 0.329654157468727, "grad_norm": 0.10080607235431671, "learning_rate": 1.6472406181015455e-05, "loss": 3.0633, "step": 11200 }, { "epoch": 0.33259749816041206, "grad_norm": 0.1010218933224678, "learning_rate": 1.6619573215599706e-05, "loss": 3.058, "step": 11300 }, { "epoch": 0.3355408388520971, "grad_norm": 0.09535133838653564, "learning_rate": 1.676674025018396e-05, "loss": 3.0601, "step": 11400 }, { "epoch": 0.3384841795437822, "grad_norm": 0.10082606226205826, "learning_rate": 1.6913907284768215e-05, "loss": 3.0732, "step": 11500 }, { "epoch": 0.3384841795437822, "eval_loss": 3.0582568645477295, "eval_runtime": 637.0131, "eval_samples_per_second": 426.671, "eval_steps_per_second": 13.334, "step": 11500 }, { "epoch": 0.34142752023546724, "grad_norm": 0.09819087386131287, "learning_rate": 1.7061074319352465e-05, "loss": 3.0565, "step": 11600 }, { "epoch": 0.3443708609271523, "grad_norm": 0.09251740574836731, "learning_rate": 1.720824135393672e-05, "loss": 3.0735, "step": 11700 }, { "epoch": 0.34731420161883736, "grad_norm": 0.10251673310995102, "learning_rate": 1.7355408388520974e-05, "loss": 3.0656, "step": 11800 }, { "epoch": 0.3502575423105224, "grad_norm": 0.14669595658779144, "learning_rate": 1.7502575423105225e-05, "loss": 3.0583, "step": 11900 }, { "epoch": 0.35320088300220753, "grad_norm": 0.10411660373210907, "learning_rate": 1.764974245768948e-05, "loss": 3.0714, "step": 12000 }, { "epoch": 0.35320088300220753, "eval_loss": 3.057373046875, "eval_runtime": 632.7196, "eval_samples_per_second": 429.566, "eval_steps_per_second": 13.425, "step": 12000 }, { "epoch": 0.3561442236938926, "grad_norm": 0.08433002978563309, "learning_rate": 1.7796909492273733e-05, "loss": 3.0647, "step": 12100 }, { "epoch": 0.35908756438557765, "grad_norm": 1.3428891897201538, "learning_rate": 1.7944076526857984e-05, "loss": 3.0522, "step": 12200 }, { "epoch": 0.3620309050772627, "grad_norm": 0.10056746751070023, "learning_rate": 1.8091243561442238e-05, "loss": 3.0668, "step": 12300 }, { "epoch": 0.36497424576894777, "grad_norm": 0.0967116504907608, "learning_rate": 1.8238410596026492e-05, "loss": 3.071, "step": 12400 }, { "epoch": 0.36791758646063283, "grad_norm": 0.09388457983732224, "learning_rate": 1.8385577630610743e-05, "loss": 3.0667, "step": 12500 }, { "epoch": 0.36791758646063283, "eval_loss": 3.0555548667907715, "eval_runtime": 643.9482, "eval_samples_per_second": 422.076, "eval_steps_per_second": 13.191, "step": 12500 }, { "epoch": 0.3708609271523179, "grad_norm": 0.22833067178726196, "learning_rate": 1.8532744665194997e-05, "loss": 3.0568, "step": 12600 }, { "epoch": 0.37380426784400295, "grad_norm": 0.09870638698339462, "learning_rate": 1.867991169977925e-05, "loss": 3.0642, "step": 12700 }, { "epoch": 0.376747608535688, "grad_norm": 0.08119294047355652, "learning_rate": 1.8827078734363506e-05, "loss": 3.0607, "step": 12800 }, { "epoch": 0.37969094922737306, "grad_norm": 0.08860599249601364, "learning_rate": 1.8974245768947757e-05, "loss": 3.0679, "step": 12900 }, { "epoch": 0.3826342899190581, "grad_norm": 0.08334629982709885, "learning_rate": 1.912141280353201e-05, "loss": 3.0547, "step": 13000 }, { "epoch": 0.3826342899190581, "eval_loss": 3.054713249206543, "eval_runtime": 635.6402, "eval_samples_per_second": 427.593, "eval_steps_per_second": 13.363, "step": 13000 }, { "epoch": 0.3855776306107432, "grad_norm": 0.08470544964075089, "learning_rate": 1.9268579838116265e-05, "loss": 3.0714, "step": 13100 }, { "epoch": 0.38852097130242824, "grad_norm": 0.09742748737335205, "learning_rate": 1.9415746872700516e-05, "loss": 3.0692, "step": 13200 }, { "epoch": 0.3914643119941133, "grad_norm": 0.08357506990432739, "learning_rate": 1.956291390728477e-05, "loss": 3.0597, "step": 13300 }, { "epoch": 0.39440765268579836, "grad_norm": 0.09091360121965408, "learning_rate": 1.9710080941869024e-05, "loss": 3.067, "step": 13400 }, { "epoch": 0.3973509933774834, "grad_norm": 0.07428716868162155, "learning_rate": 1.9857247976453275e-05, "loss": 3.0626, "step": 13500 }, { "epoch": 0.3973509933774834, "eval_loss": 3.0551042556762695, "eval_runtime": 638.2346, "eval_samples_per_second": 425.854, "eval_steps_per_second": 13.309, "step": 13500 }, { "epoch": 0.40029433406916853, "grad_norm": 0.09140598773956299, "learning_rate": 1.9999509443218055e-05, "loss": 3.0708, "step": 13600 }, { "epoch": 0.4032376747608536, "grad_norm": 0.09259914606809616, "learning_rate": 1.9983321069413787e-05, "loss": 3.065, "step": 13700 }, { "epoch": 0.40618101545253865, "grad_norm": 0.07939771562814713, "learning_rate": 1.99669691766822e-05, "loss": 3.0619, "step": 13800 }, { "epoch": 0.4091243561442237, "grad_norm": 0.07500491291284561, "learning_rate": 1.995061728395062e-05, "loss": 3.0556, "step": 13900 }, { "epoch": 0.41206769683590877, "grad_norm": 1.3940569162368774, "learning_rate": 1.9934265391219034e-05, "loss": 3.0708, "step": 14000 }, { "epoch": 0.41206769683590877, "eval_loss": 3.0523788928985596, "eval_runtime": 641.1493, "eval_samples_per_second": 423.918, "eval_steps_per_second": 13.248, "step": 14000 }, { "epoch": 0.41501103752759383, "grad_norm": 0.08724746853113174, "learning_rate": 1.991791349848745e-05, "loss": 3.0634, "step": 14100 }, { "epoch": 0.4179543782192789, "grad_norm": 0.07986593246459961, "learning_rate": 1.990156160575587e-05, "loss": 3.0605, "step": 14200 }, { "epoch": 0.42089771891096395, "grad_norm": 0.08554524183273315, "learning_rate": 1.9885373231951598e-05, "loss": 3.0555, "step": 14300 }, { "epoch": 0.423841059602649, "grad_norm": 0.08447366207838058, "learning_rate": 1.9869021339220015e-05, "loss": 3.0624, "step": 14400 }, { "epoch": 0.42678440029433407, "grad_norm": 0.4958445429801941, "learning_rate": 1.9852669446488433e-05, "loss": 3.0468, "step": 14500 }, { "epoch": 0.42678440029433407, "eval_loss": 3.0509705543518066, "eval_runtime": 640.6352, "eval_samples_per_second": 424.259, "eval_steps_per_second": 13.259, "step": 14500 }, { "epoch": 0.4297277409860191, "grad_norm": 0.08235494792461395, "learning_rate": 1.983631755375685e-05, "loss": 3.0534, "step": 14600 }, { "epoch": 0.4326710816777042, "grad_norm": 0.08496395498514175, "learning_rate": 1.9819965661025265e-05, "loss": 3.0671, "step": 14700 }, { "epoch": 0.43561442236938924, "grad_norm": 0.08186972141265869, "learning_rate": 1.9803613768293683e-05, "loss": 3.0714, "step": 14800 }, { "epoch": 0.4385577630610743, "grad_norm": 0.08084297180175781, "learning_rate": 1.9787261875562097e-05, "loss": 3.0493, "step": 14900 }, { "epoch": 0.44150110375275936, "grad_norm": 0.07661929726600647, "learning_rate": 1.9770909982830515e-05, "loss": 3.0457, "step": 15000 }, { "epoch": 0.44150110375275936, "eval_loss": 3.046680450439453, "eval_runtime": 635.3933, "eval_samples_per_second": 427.759, "eval_steps_per_second": 13.368, "step": 15000 }, { "epoch": 0.4444444444444444, "grad_norm": 0.07370521128177643, "learning_rate": 1.975455809009893e-05, "loss": 3.0599, "step": 15100 }, { "epoch": 0.44738778513612953, "grad_norm": 0.07756703346967697, "learning_rate": 1.9738206197367347e-05, "loss": 3.0554, "step": 15200 }, { "epoch": 0.4503311258278146, "grad_norm": 0.08237345516681671, "learning_rate": 1.9721854304635765e-05, "loss": 3.0466, "step": 15300 }, { "epoch": 0.45327446651949965, "grad_norm": 0.0812094658613205, "learning_rate": 1.970550241190418e-05, "loss": 3.0471, "step": 15400 }, { "epoch": 0.4562178072111847, "grad_norm": 0.0817616805434227, "learning_rate": 1.9689150519172597e-05, "loss": 3.0465, "step": 15500 }, { "epoch": 0.4562178072111847, "eval_loss": 3.0500435829162598, "eval_runtime": 621.3275, "eval_samples_per_second": 437.442, "eval_steps_per_second": 13.671, "step": 15500 }, { "epoch": 0.45916114790286977, "grad_norm": 0.08888303488492966, "learning_rate": 1.967279862644101e-05, "loss": 3.0556, "step": 15600 }, { "epoch": 0.46210448859455483, "grad_norm": 0.08648809045553207, "learning_rate": 1.965644673370943e-05, "loss": 3.0444, "step": 15700 }, { "epoch": 0.4650478292862399, "grad_norm": 0.06965779513120651, "learning_rate": 1.9640094840977846e-05, "loss": 3.0468, "step": 15800 }, { "epoch": 0.46799116997792495, "grad_norm": 0.07509485632181168, "learning_rate": 1.962374294824626e-05, "loss": 3.0554, "step": 15900 }, { "epoch": 0.47093451066961, "grad_norm": 0.08047256618738174, "learning_rate": 1.960739105551468e-05, "loss": 3.0573, "step": 16000 }, { "epoch": 0.47093451066961, "eval_loss": 3.0468764305114746, "eval_runtime": 621.8732, "eval_samples_per_second": 437.059, "eval_steps_per_second": 13.659, "step": 16000 }, { "epoch": 0.47387785136129507, "grad_norm": 0.07823370397090912, "learning_rate": 1.9591039162783093e-05, "loss": 3.049, "step": 16100 }, { "epoch": 0.4768211920529801, "grad_norm": 0.07442843168973923, "learning_rate": 1.957468727005151e-05, "loss": 3.0539, "step": 16200 }, { "epoch": 0.4797645327446652, "grad_norm": 0.07984434813261032, "learning_rate": 1.9558335377319925e-05, "loss": 3.052, "step": 16300 }, { "epoch": 0.48270787343635024, "grad_norm": 0.08642031252384186, "learning_rate": 1.9541983484588342e-05, "loss": 3.0538, "step": 16400 }, { "epoch": 0.4856512141280353, "grad_norm": 0.07261636853218079, "learning_rate": 1.952563159185676e-05, "loss": 3.045, "step": 16500 }, { "epoch": 0.4856512141280353, "eval_loss": 3.044436454772949, "eval_runtime": 630.5854, "eval_samples_per_second": 431.02, "eval_steps_per_second": 13.47, "step": 16500 }, { "epoch": 0.48859455481972036, "grad_norm": 0.06609112024307251, "learning_rate": 1.9509279699125174e-05, "loss": 3.0381, "step": 16600 }, { "epoch": 0.4915378955114054, "grad_norm": 0.06717189401388168, "learning_rate": 1.9492927806393592e-05, "loss": 3.0517, "step": 16700 }, { "epoch": 0.49448123620309054, "grad_norm": 0.07005080580711365, "learning_rate": 1.9476575913662006e-05, "loss": 3.0598, "step": 16800 }, { "epoch": 0.4974245768947756, "grad_norm": 1.338317632675171, "learning_rate": 1.9460224020930424e-05, "loss": 3.046, "step": 16900 }, { "epoch": 0.5003679175864606, "grad_norm": 0.07369880378246307, "learning_rate": 1.944387212819884e-05, "loss": 3.0478, "step": 17000 }, { "epoch": 0.5003679175864606, "eval_loss": 3.0446791648864746, "eval_runtime": 631.0548, "eval_samples_per_second": 430.7, "eval_steps_per_second": 13.46, "step": 17000 }, { "epoch": 0.5033112582781457, "grad_norm": 0.24674178659915924, "learning_rate": 1.9427520235467256e-05, "loss": 3.054, "step": 17100 }, { "epoch": 0.5062545989698307, "grad_norm": 0.07370496541261673, "learning_rate": 1.9411168342735674e-05, "loss": 3.0471, "step": 17200 }, { "epoch": 0.5091979396615158, "grad_norm": 0.08675287663936615, "learning_rate": 1.939481645000409e-05, "loss": 3.0383, "step": 17300 }, { "epoch": 0.5121412803532008, "grad_norm": 0.07724553346633911, "learning_rate": 1.9378464557272506e-05, "loss": 3.0539, "step": 17400 }, { "epoch": 0.515084621044886, "grad_norm": 0.0678177997469902, "learning_rate": 1.936211266454092e-05, "loss": 3.0457, "step": 17500 }, { "epoch": 0.515084621044886, "eval_loss": 3.043174982070923, "eval_runtime": 631.0334, "eval_samples_per_second": 430.714, "eval_steps_per_second": 13.46, "step": 17500 }, { "epoch": 0.5180279617365711, "grad_norm": 0.07132332026958466, "learning_rate": 1.9345760771809338e-05, "loss": 3.05, "step": 17600 }, { "epoch": 0.5209713024282561, "grad_norm": 0.0750725194811821, "learning_rate": 1.9329408879077756e-05, "loss": 3.05, "step": 17700 }, { "epoch": 0.5239146431199412, "grad_norm": 0.07662118226289749, "learning_rate": 1.931305698634617e-05, "loss": 3.0512, "step": 17800 }, { "epoch": 0.5268579838116262, "grad_norm": 0.06863994896411896, "learning_rate": 1.9296705093614588e-05, "loss": 3.0399, "step": 17900 }, { "epoch": 0.5298013245033113, "grad_norm": 0.06392034888267517, "learning_rate": 1.9280353200883005e-05, "loss": 3.048, "step": 18000 }, { "epoch": 0.5298013245033113, "eval_loss": 3.043093681335449, "eval_runtime": 629.5297, "eval_samples_per_second": 431.743, "eval_steps_per_second": 13.493, "step": 18000 }, { "epoch": 0.5327446651949963, "grad_norm": 0.07065685093402863, "learning_rate": 1.926400130815142e-05, "loss": 3.0367, "step": 18100 }, { "epoch": 0.5356880058866814, "grad_norm": 0.0722690150141716, "learning_rate": 1.9247649415419837e-05, "loss": 3.0442, "step": 18200 }, { "epoch": 0.5386313465783664, "grad_norm": 0.06672331690788269, "learning_rate": 1.923129752268825e-05, "loss": 3.0472, "step": 18300 }, { "epoch": 0.5415746872700515, "grad_norm": 0.06574855744838715, "learning_rate": 1.921494562995667e-05, "loss": 3.0335, "step": 18400 }, { "epoch": 0.5445180279617365, "grad_norm": 0.06836330145597458, "learning_rate": 1.9198593737225087e-05, "loss": 3.0465, "step": 18500 }, { "epoch": 0.5445180279617365, "eval_loss": 3.0458669662475586, "eval_runtime": 624.0717, "eval_samples_per_second": 435.519, "eval_steps_per_second": 13.611, "step": 18500 }, { "epoch": 0.5474613686534217, "grad_norm": 0.07036128640174866, "learning_rate": 1.91822418444935e-05, "loss": 3.054, "step": 18600 }, { "epoch": 0.5504047093451067, "grad_norm": 0.06945749372243881, "learning_rate": 1.916588995176192e-05, "loss": 3.0489, "step": 18700 }, { "epoch": 0.5533480500367918, "grad_norm": 0.06926793605089188, "learning_rate": 1.9149538059030333e-05, "loss": 3.037, "step": 18800 }, { "epoch": 0.5562913907284768, "grad_norm": 0.06429161131381989, "learning_rate": 1.913318616629875e-05, "loss": 3.0432, "step": 18900 }, { "epoch": 0.5592347314201619, "grad_norm": 0.05955299735069275, "learning_rate": 1.9116834273567165e-05, "loss": 3.0401, "step": 19000 }, { "epoch": 0.5592347314201619, "eval_loss": 3.0425612926483154, "eval_runtime": 610.3038, "eval_samples_per_second": 445.344, "eval_steps_per_second": 13.918, "step": 19000 }, { "epoch": 0.5621780721118469, "grad_norm": 0.0627388134598732, "learning_rate": 1.9100482380835583e-05, "loss": 3.0369, "step": 19100 }, { "epoch": 0.565121412803532, "grad_norm": 0.06677763164043427, "learning_rate": 1.9084294007031315e-05, "loss": 3.0561, "step": 19200 }, { "epoch": 0.5680647534952171, "grad_norm": 0.06971177458763123, "learning_rate": 1.9067942114299733e-05, "loss": 3.0469, "step": 19300 }, { "epoch": 0.5710080941869021, "grad_norm": 0.058204129338264465, "learning_rate": 1.9051590221568147e-05, "loss": 3.0468, "step": 19400 }, { "epoch": 0.5739514348785872, "grad_norm": 0.05811809375882149, "learning_rate": 1.9035238328836565e-05, "loss": 3.0455, "step": 19500 }, { "epoch": 0.5739514348785872, "eval_loss": 3.043268918991089, "eval_runtime": 623.3051, "eval_samples_per_second": 436.054, "eval_steps_per_second": 13.627, "step": 19500 }, { "epoch": 0.5768947755702722, "grad_norm": 0.058524224907159805, "learning_rate": 1.901888643610498e-05, "loss": 3.0512, "step": 19600 }, { "epoch": 0.5798381162619574, "grad_norm": 0.07287425547838211, "learning_rate": 1.9002534543373397e-05, "loss": 3.0474, "step": 19700 }, { "epoch": 0.5827814569536424, "grad_norm": 0.06839476525783539, "learning_rate": 1.8986182650641815e-05, "loss": 3.043, "step": 19800 }, { "epoch": 0.5857247976453275, "grad_norm": 0.0597374401986599, "learning_rate": 1.896983075791023e-05, "loss": 3.0473, "step": 19900 }, { "epoch": 0.5886681383370125, "grad_norm": 0.06223360449075699, "learning_rate": 1.8953478865178647e-05, "loss": 3.0448, "step": 20000 }, { "epoch": 0.5886681383370125, "eval_loss": 3.0415148735046387, "eval_runtime": 626.1739, "eval_samples_per_second": 434.057, "eval_steps_per_second": 13.565, "step": 20000 }, { "epoch": 0.5916114790286976, "grad_norm": 0.07109396904706955, "learning_rate": 1.893712697244706e-05, "loss": 3.0441, "step": 20100 }, { "epoch": 0.5945548197203826, "grad_norm": 0.057101842015981674, "learning_rate": 1.892077507971548e-05, "loss": 3.0403, "step": 20200 }, { "epoch": 0.5974981604120677, "grad_norm": 0.05395849794149399, "learning_rate": 1.8904423186983896e-05, "loss": 3.0516, "step": 20300 }, { "epoch": 0.6004415011037527, "grad_norm": 0.06337323784828186, "learning_rate": 1.888807129425231e-05, "loss": 3.0459, "step": 20400 }, { "epoch": 0.6033848417954378, "grad_norm": 0.06040395051240921, "learning_rate": 1.887171940152073e-05, "loss": 3.0415, "step": 20500 }, { "epoch": 0.6033848417954378, "eval_loss": 3.041532278060913, "eval_runtime": 627.156, "eval_samples_per_second": 433.377, "eval_steps_per_second": 13.544, "step": 20500 }, { "epoch": 0.6063281824871228, "grad_norm": 0.06913723796606064, "learning_rate": 1.8855367508789143e-05, "loss": 3.034, "step": 20600 }, { "epoch": 0.609271523178808, "grad_norm": 0.052955783903598785, "learning_rate": 1.883901561605756e-05, "loss": 3.0483, "step": 20700 }, { "epoch": 0.6122148638704931, "grad_norm": 0.05427427589893341, "learning_rate": 1.8822827242253292e-05, "loss": 3.0538, "step": 20800 }, { "epoch": 0.6151582045621781, "grad_norm": 0.06078154593706131, "learning_rate": 1.880647534952171e-05, "loss": 3.0458, "step": 20900 }, { "epoch": 0.6181015452538632, "grad_norm": 0.059606559574604034, "learning_rate": 1.8790123456790124e-05, "loss": 3.0445, "step": 21000 }, { "epoch": 0.6181015452538632, "eval_loss": 3.0372297763824463, "eval_runtime": 625.3598, "eval_samples_per_second": 434.622, "eval_steps_per_second": 13.583, "step": 21000 }, { "epoch": 0.6210448859455482, "grad_norm": 0.05104444921016693, "learning_rate": 1.8773771564058542e-05, "loss": 3.0414, "step": 21100 }, { "epoch": 0.6239882266372333, "grad_norm": 0.059070318937301636, "learning_rate": 1.8757419671326956e-05, "loss": 3.0476, "step": 21200 }, { "epoch": 0.6269315673289183, "grad_norm": 0.06030017510056496, "learning_rate": 1.8741067778595374e-05, "loss": 3.0638, "step": 21300 }, { "epoch": 0.6298749080206034, "grad_norm": 0.06045697256922722, "learning_rate": 1.8724715885863792e-05, "loss": 3.0375, "step": 21400 }, { "epoch": 0.6328182487122884, "grad_norm": 0.058795489370822906, "learning_rate": 1.8708363993132206e-05, "loss": 3.0425, "step": 21500 }, { "epoch": 0.6328182487122884, "eval_loss": 3.039712429046631, "eval_runtime": 633.5451, "eval_samples_per_second": 429.007, "eval_steps_per_second": 13.407, "step": 21500 }, { "epoch": 0.6357615894039735, "grad_norm": 0.053084999322891235, "learning_rate": 1.8692012100400624e-05, "loss": 3.0394, "step": 21600 }, { "epoch": 0.6387049300956585, "grad_norm": 0.05448750779032707, "learning_rate": 1.8675660207669038e-05, "loss": 3.0443, "step": 21700 }, { "epoch": 0.6416482707873437, "grad_norm": 0.057745374739170074, "learning_rate": 1.8659308314937456e-05, "loss": 3.0381, "step": 21800 }, { "epoch": 0.6445916114790287, "grad_norm": 0.06150789186358452, "learning_rate": 1.864295642220587e-05, "loss": 3.0387, "step": 21900 }, { "epoch": 0.6475349521707138, "grad_norm": 0.055741336196660995, "learning_rate": 1.8626604529474288e-05, "loss": 3.0255, "step": 22000 }, { "epoch": 0.6475349521707138, "eval_loss": 3.038139581680298, "eval_runtime": 625.7503, "eval_samples_per_second": 434.351, "eval_steps_per_second": 13.574, "step": 22000 }, { "epoch": 0.6504782928623988, "grad_norm": 0.06111091375350952, "learning_rate": 1.8610252636742706e-05, "loss": 3.0355, "step": 22100 }, { "epoch": 0.6534216335540839, "grad_norm": 0.05908087268471718, "learning_rate": 1.8593900744011123e-05, "loss": 3.0411, "step": 22200 }, { "epoch": 0.6563649742457689, "grad_norm": 0.061592113226652145, "learning_rate": 1.8577548851279538e-05, "loss": 3.0436, "step": 22300 }, { "epoch": 0.659308314937454, "grad_norm": 0.05346628278493881, "learning_rate": 1.8561196958547952e-05, "loss": 3.038, "step": 22400 }, { "epoch": 0.6622516556291391, "grad_norm": 0.05568494647741318, "learning_rate": 1.854484506581637e-05, "loss": 3.0336, "step": 22500 }, { "epoch": 0.6622516556291391, "eval_loss": 3.0325162410736084, "eval_runtime": 630.352, "eval_samples_per_second": 431.18, "eval_steps_per_second": 13.475, "step": 22500 }, { "epoch": 0.6651949963208241, "grad_norm": 0.05503956228494644, "learning_rate": 1.8528493173084784e-05, "loss": 3.0404, "step": 22600 }, { "epoch": 0.6681383370125092, "grad_norm": 0.05491860955953598, "learning_rate": 1.8512141280353202e-05, "loss": 3.0374, "step": 22700 }, { "epoch": 0.6710816777041942, "grad_norm": 0.055585723370313644, "learning_rate": 1.849578938762162e-05, "loss": 3.0342, "step": 22800 }, { "epoch": 0.6740250183958794, "grad_norm": 0.057793911546468735, "learning_rate": 1.8479437494890037e-05, "loss": 3.0385, "step": 22900 }, { "epoch": 0.6769683590875644, "grad_norm": 0.05358234792947769, "learning_rate": 1.846308560215845e-05, "loss": 3.0329, "step": 23000 }, { "epoch": 0.6769683590875644, "eval_loss": 3.034156322479248, "eval_runtime": 628.0026, "eval_samples_per_second": 432.793, "eval_steps_per_second": 13.525, "step": 23000 }, { "epoch": 0.6799116997792495, "grad_norm": 0.05742792412638664, "learning_rate": 1.8446733709426866e-05, "loss": 3.0391, "step": 23100 }, { "epoch": 0.6828550404709345, "grad_norm": 0.0546625517308712, "learning_rate": 1.8430381816695283e-05, "loss": 3.0366, "step": 23200 }, { "epoch": 0.6857983811626196, "grad_norm": 0.06406642496585846, "learning_rate": 1.84140299239637e-05, "loss": 3.0284, "step": 23300 }, { "epoch": 0.6887417218543046, "grad_norm": 0.052560534328222275, "learning_rate": 1.8397678031232115e-05, "loss": 3.0328, "step": 23400 }, { "epoch": 0.6916850625459897, "grad_norm": 0.056005433201789856, "learning_rate": 1.8381326138500533e-05, "loss": 3.0322, "step": 23500 }, { "epoch": 0.6916850625459897, "eval_loss": 3.0332555770874023, "eval_runtime": 627.3412, "eval_samples_per_second": 433.249, "eval_steps_per_second": 13.54, "step": 23500 }, { "epoch": 0.6946284032376747, "grad_norm": 0.06161920353770256, "learning_rate": 1.836497424576895e-05, "loss": 3.0353, "step": 23600 }, { "epoch": 0.6975717439293598, "grad_norm": 0.056377191096544266, "learning_rate": 1.8348622353037365e-05, "loss": 3.0371, "step": 23700 }, { "epoch": 0.7005150846210448, "grad_norm": 0.05168746039271355, "learning_rate": 1.833227046030578e-05, "loss": 3.0321, "step": 23800 }, { "epoch": 0.70345842531273, "grad_norm": 0.0548856221139431, "learning_rate": 1.8315918567574197e-05, "loss": 3.0365, "step": 23900 }, { "epoch": 0.7064017660044151, "grad_norm": 0.053563591092824936, "learning_rate": 1.8299566674842615e-05, "loss": 3.0302, "step": 24000 }, { "epoch": 0.7064017660044151, "eval_loss": 3.0342206954956055, "eval_runtime": 622.4723, "eval_samples_per_second": 436.638, "eval_steps_per_second": 13.646, "step": 24000 }, { "epoch": 0.7093451066961001, "grad_norm": 0.05473559722304344, "learning_rate": 1.8283214782111033e-05, "loss": 3.0352, "step": 24100 }, { "epoch": 0.7122884473877852, "grad_norm": 0.050641730427742004, "learning_rate": 1.8266862889379447e-05, "loss": 3.0277, "step": 24200 }, { "epoch": 0.7152317880794702, "grad_norm": 0.047202397137880325, "learning_rate": 1.8250510996647865e-05, "loss": 3.0402, "step": 24300 }, { "epoch": 0.7181751287711553, "grad_norm": 0.05609762296080589, "learning_rate": 1.823415910391628e-05, "loss": 3.0364, "step": 24400 }, { "epoch": 0.7211184694628403, "grad_norm": 0.056129444390535355, "learning_rate": 1.8217807211184697e-05, "loss": 3.0439, "step": 24500 }, { "epoch": 0.7211184694628403, "eval_loss": 3.033571720123291, "eval_runtime": 628.4264, "eval_samples_per_second": 432.501, "eval_steps_per_second": 13.516, "step": 24500 }, { "epoch": 0.7240618101545254, "grad_norm": 0.05424582213163376, "learning_rate": 1.820145531845311e-05, "loss": 3.0396, "step": 24600 }, { "epoch": 0.7270051508462104, "grad_norm": 0.04740852490067482, "learning_rate": 1.818510342572153e-05, "loss": 3.0475, "step": 24700 }, { "epoch": 0.7299484915378955, "grad_norm": 0.051815129816532135, "learning_rate": 1.8168751532989946e-05, "loss": 3.0258, "step": 24800 }, { "epoch": 0.7328918322295805, "grad_norm": 0.04678037017583847, "learning_rate": 1.8152399640258364e-05, "loss": 3.0345, "step": 24900 }, { "epoch": 0.7358351729212657, "grad_norm": 0.055440157651901245, "learning_rate": 1.813604774752678e-05, "loss": 3.0326, "step": 25000 }, { "epoch": 0.7358351729212657, "eval_loss": 3.035003423690796, "eval_runtime": 633.5947, "eval_samples_per_second": 428.973, "eval_steps_per_second": 13.406, "step": 25000 }, { "epoch": 0.7387785136129507, "grad_norm": 0.04886231571435928, "learning_rate": 1.8119695854795193e-05, "loss": 3.0357, "step": 25100 }, { "epoch": 0.7417218543046358, "grad_norm": 0.05547178536653519, "learning_rate": 1.810334396206361e-05, "loss": 3.0413, "step": 25200 }, { "epoch": 0.7446651949963208, "grad_norm": 0.056609105318784714, "learning_rate": 1.8086992069332025e-05, "loss": 3.0326, "step": 25300 }, { "epoch": 0.7476085356880059, "grad_norm": 0.04905085265636444, "learning_rate": 1.8070640176600442e-05, "loss": 3.0401, "step": 25400 }, { "epoch": 0.7505518763796909, "grad_norm": 0.05483212694525719, "learning_rate": 1.805428828386886e-05, "loss": 3.0313, "step": 25500 }, { "epoch": 0.7505518763796909, "eval_loss": 3.036458730697632, "eval_runtime": 639.6107, "eval_samples_per_second": 424.938, "eval_steps_per_second": 13.28, "step": 25500 }, { "epoch": 0.753495217071376, "grad_norm": 0.05586619675159454, "learning_rate": 1.8037936391137278e-05, "loss": 3.04, "step": 25600 }, { "epoch": 0.7564385577630611, "grad_norm": 0.05478832498192787, "learning_rate": 1.8021584498405692e-05, "loss": 3.0382, "step": 25700 }, { "epoch": 0.7593818984547461, "grad_norm": 0.04507046192884445, "learning_rate": 1.8005232605674107e-05, "loss": 3.0344, "step": 25800 }, { "epoch": 0.7623252391464312, "grad_norm": 0.052001506090164185, "learning_rate": 1.7988880712942524e-05, "loss": 3.0325, "step": 25900 }, { "epoch": 0.7652685798381162, "grad_norm": 0.04894082248210907, "learning_rate": 1.7972528820210942e-05, "loss": 3.0475, "step": 26000 }, { "epoch": 0.7652685798381162, "eval_loss": 3.0339725017547607, "eval_runtime": 660.2919, "eval_samples_per_second": 411.629, "eval_steps_per_second": 12.864, "step": 26000 }, { "epoch": 0.7682119205298014, "grad_norm": 0.05490482226014137, "learning_rate": 1.7956176927479356e-05, "loss": 3.0256, "step": 26100 }, { "epoch": 0.7711552612214864, "grad_norm": 0.04921165108680725, "learning_rate": 1.7939825034747774e-05, "loss": 3.0331, "step": 26200 }, { "epoch": 0.7740986019131715, "grad_norm": 0.05163867026567459, "learning_rate": 1.792347314201619e-05, "loss": 3.0325, "step": 26300 }, { "epoch": 0.7770419426048565, "grad_norm": 0.049339670687913895, "learning_rate": 1.7907121249284606e-05, "loss": 3.0431, "step": 26400 }, { "epoch": 0.7799852832965416, "grad_norm": 0.05160528048872948, "learning_rate": 1.789076935655302e-05, "loss": 3.04, "step": 26500 }, { "epoch": 0.7799852832965416, "eval_loss": 3.0371901988983154, "eval_runtime": 664.2183, "eval_samples_per_second": 409.195, "eval_steps_per_second": 12.788, "step": 26500 }, { "epoch": 0.7829286239882266, "grad_norm": 0.045647360384464264, "learning_rate": 1.7874417463821438e-05, "loss": 3.0393, "step": 26600 }, { "epoch": 0.7858719646799117, "grad_norm": 0.05694318935275078, "learning_rate": 1.7858065571089856e-05, "loss": 3.0374, "step": 26700 }, { "epoch": 0.7888153053715967, "grad_norm": 0.049800559878349304, "learning_rate": 1.7841713678358273e-05, "loss": 3.0406, "step": 26800 }, { "epoch": 0.7917586460632818, "grad_norm": 0.04780496656894684, "learning_rate": 1.7825361785626688e-05, "loss": 3.0343, "step": 26900 }, { "epoch": 0.7947019867549668, "grad_norm": 0.05478132143616676, "learning_rate": 1.7809009892895105e-05, "loss": 3.0374, "step": 27000 }, { "epoch": 0.7947019867549668, "eval_loss": 3.032538890838623, "eval_runtime": 674.1631, "eval_samples_per_second": 403.159, "eval_steps_per_second": 12.599, "step": 27000 }, { "epoch": 0.797645327446652, "grad_norm": 0.053836237639188766, "learning_rate": 1.779265800016352e-05, "loss": 3.0262, "step": 27100 }, { "epoch": 0.8005886681383371, "grad_norm": 0.051207367330789566, "learning_rate": 1.7776306107431937e-05, "loss": 3.0393, "step": 27200 }, { "epoch": 0.8035320088300221, "grad_norm": 0.04601539298892021, "learning_rate": 1.7759954214700352e-05, "loss": 3.0255, "step": 27300 }, { "epoch": 0.8064753495217072, "grad_norm": 0.044452965259552, "learning_rate": 1.774360232196877e-05, "loss": 3.0305, "step": 27400 }, { "epoch": 0.8094186902133922, "grad_norm": 0.04643453285098076, "learning_rate": 1.7727250429237187e-05, "loss": 3.0324, "step": 27500 }, { "epoch": 0.8094186902133922, "eval_loss": 3.032275915145874, "eval_runtime": 674.0778, "eval_samples_per_second": 403.21, "eval_steps_per_second": 12.601, "step": 27500 }, { "epoch": 0.8123620309050773, "grad_norm": 0.057971298694610596, "learning_rate": 1.77108985365056e-05, "loss": 3.0317, "step": 27600 }, { "epoch": 0.8153053715967623, "grad_norm": 0.05186161771416664, "learning_rate": 1.769454664377402e-05, "loss": 3.0267, "step": 27700 }, { "epoch": 0.8182487122884474, "grad_norm": 0.051766593009233475, "learning_rate": 1.7678194751042434e-05, "loss": 3.0299, "step": 27800 }, { "epoch": 0.8211920529801324, "grad_norm": 0.04471235349774361, "learning_rate": 1.766184285831085e-05, "loss": 3.0305, "step": 27900 }, { "epoch": 0.8241353936718175, "grad_norm": 0.04845237731933594, "learning_rate": 1.764549096557927e-05, "loss": 3.0336, "step": 28000 }, { "epoch": 0.8241353936718175, "eval_loss": 3.031904935836792, "eval_runtime": 679.5763, "eval_samples_per_second": 399.948, "eval_steps_per_second": 12.499, "step": 28000 }, { "epoch": 0.8270787343635025, "grad_norm": 0.05482972040772438, "learning_rate": 1.7629139072847683e-05, "loss": 3.0373, "step": 28100 }, { "epoch": 0.8300220750551877, "grad_norm": 0.04692622274160385, "learning_rate": 1.76127871801161e-05, "loss": 3.0342, "step": 28200 }, { "epoch": 0.8329654157468727, "grad_norm": 0.04758935794234276, "learning_rate": 1.7596435287384515e-05, "loss": 3.0436, "step": 28300 }, { "epoch": 0.8359087564385578, "grad_norm": 0.05063100531697273, "learning_rate": 1.7580083394652933e-05, "loss": 3.0354, "step": 28400 }, { "epoch": 0.8388520971302428, "grad_norm": 0.0534328930079937, "learning_rate": 1.7563731501921347e-05, "loss": 3.0373, "step": 28500 }, { "epoch": 0.8388520971302428, "eval_loss": 3.02909779548645, "eval_runtime": 679.4175, "eval_samples_per_second": 400.041, "eval_steps_per_second": 12.502, "step": 28500 }, { "epoch": 0.8417954378219279, "grad_norm": 0.059875261038541794, "learning_rate": 1.7547379609189765e-05, "loss": 3.0292, "step": 28600 }, { "epoch": 0.8447387785136129, "grad_norm": 0.046500641852617264, "learning_rate": 1.7531027716458183e-05, "loss": 3.0229, "step": 28700 }, { "epoch": 0.847682119205298, "grad_norm": 0.04668549448251724, "learning_rate": 1.7514675823726597e-05, "loss": 3.0348, "step": 28800 }, { "epoch": 0.8506254598969831, "grad_norm": 0.04407795891165733, "learning_rate": 1.7498323930995015e-05, "loss": 3.041, "step": 28900 }, { "epoch": 0.8535688005886681, "grad_norm": 0.04846508800983429, "learning_rate": 1.748197203826343e-05, "loss": 3.031, "step": 29000 }, { "epoch": 0.8535688005886681, "eval_loss": 3.032353162765503, "eval_runtime": 686.3738, "eval_samples_per_second": 395.987, "eval_steps_per_second": 12.375, "step": 29000 }, { "epoch": 0.8565121412803532, "grad_norm": 0.048291005194187164, "learning_rate": 1.7465620145531847e-05, "loss": 3.0354, "step": 29100 }, { "epoch": 0.8594554819720382, "grad_norm": 0.04581734165549278, "learning_rate": 1.744926825280026e-05, "loss": 3.0242, "step": 29200 }, { "epoch": 0.8623988226637234, "grad_norm": 0.04933261498808861, "learning_rate": 1.743291636006868e-05, "loss": 3.026, "step": 29300 }, { "epoch": 0.8653421633554084, "grad_norm": 0.04945338889956474, "learning_rate": 1.7416564467337096e-05, "loss": 3.0373, "step": 29400 }, { "epoch": 0.8682855040470935, "grad_norm": 0.04295671731233597, "learning_rate": 1.7400212574605514e-05, "loss": 3.0298, "step": 29500 }, { "epoch": 0.8682855040470935, "eval_loss": 3.0275745391845703, "eval_runtime": 692.7622, "eval_samples_per_second": 392.335, "eval_steps_per_second": 12.261, "step": 29500 }, { "epoch": 0.8712288447387785, "grad_norm": 0.04119986295700073, "learning_rate": 1.738386068187393e-05, "loss": 3.0341, "step": 29600 }, { "epoch": 0.8741721854304636, "grad_norm": 0.05200904980301857, "learning_rate": 1.7367508789142343e-05, "loss": 3.0304, "step": 29700 }, { "epoch": 0.8771155261221486, "grad_norm": 0.05692379176616669, "learning_rate": 1.735115689641076e-05, "loss": 3.0241, "step": 29800 }, { "epoch": 0.8800588668138337, "grad_norm": 0.04549058526754379, "learning_rate": 1.7334805003679178e-05, "loss": 3.0304, "step": 29900 }, { "epoch": 0.8830022075055187, "grad_norm": 0.03618421033024788, "learning_rate": 1.7318453110947593e-05, "loss": 3.0279, "step": 30000 }, { "epoch": 0.8830022075055187, "eval_loss": 3.0278401374816895, "eval_runtime": 692.6493, "eval_samples_per_second": 392.399, "eval_steps_per_second": 12.263, "step": 30000 }, { "epoch": 0.8859455481972038, "grad_norm": 0.04941558837890625, "learning_rate": 1.730210121821601e-05, "loss": 3.026, "step": 30100 }, { "epoch": 0.8888888888888888, "grad_norm": 0.04869082197546959, "learning_rate": 1.7285749325484428e-05, "loss": 3.0272, "step": 30200 }, { "epoch": 0.891832229580574, "grad_norm": 0.05301811546087265, "learning_rate": 1.7269397432752842e-05, "loss": 3.0372, "step": 30300 }, { "epoch": 0.8947755702722591, "grad_norm": 0.04805745556950569, "learning_rate": 1.7253045540021257e-05, "loss": 3.0241, "step": 30400 }, { "epoch": 0.8977189109639441, "grad_norm": 0.05369238927960396, "learning_rate": 1.7236693647289674e-05, "loss": 3.0347, "step": 30500 }, { "epoch": 0.8977189109639441, "eval_loss": 3.0275983810424805, "eval_runtime": 686.6237, "eval_samples_per_second": 395.843, "eval_steps_per_second": 12.371, "step": 30500 }, { "epoch": 0.9006622516556292, "grad_norm": 0.04474586620926857, "learning_rate": 1.7220341754558092e-05, "loss": 3.0335, "step": 30600 }, { "epoch": 0.9036055923473142, "grad_norm": 0.04293316230177879, "learning_rate": 1.720398986182651e-05, "loss": 3.0316, "step": 30700 }, { "epoch": 0.9065489330389993, "grad_norm": 0.04896510764956474, "learning_rate": 1.7187637969094924e-05, "loss": 3.0372, "step": 30800 }, { "epoch": 0.9094922737306843, "grad_norm": 0.03395694121718407, "learning_rate": 1.717128607636334e-05, "loss": 3.0234, "step": 30900 }, { "epoch": 0.9124356144223694, "grad_norm": 0.04948301613330841, "learning_rate": 1.7154934183631756e-05, "loss": 3.0303, "step": 31000 }, { "epoch": 0.9124356144223694, "eval_loss": 3.0278146266937256, "eval_runtime": 686.1208, "eval_samples_per_second": 396.133, "eval_steps_per_second": 12.38, "step": 31000 }, { "epoch": 0.9153789551140544, "grad_norm": 0.05113914608955383, "learning_rate": 1.7138745809827488e-05, "loss": 3.0466, "step": 31100 }, { "epoch": 0.9183222958057395, "grad_norm": 0.04693964868783951, "learning_rate": 1.7122393917095906e-05, "loss": 3.0391, "step": 31200 }, { "epoch": 0.9212656364974245, "grad_norm": 0.049486856907606125, "learning_rate": 1.7106042024364323e-05, "loss": 3.0334, "step": 31300 }, { "epoch": 0.9242089771891097, "grad_norm": 0.056477587670087814, "learning_rate": 1.7089853650560052e-05, "loss": 3.029, "step": 31400 }, { "epoch": 0.9271523178807947, "grad_norm": 0.04910438507795334, "learning_rate": 1.707350175782847e-05, "loss": 3.0322, "step": 31500 }, { "epoch": 0.9271523178807947, "eval_loss": 3.0279619693756104, "eval_runtime": 688.4367, "eval_samples_per_second": 394.8, "eval_steps_per_second": 12.338, "step": 31500 }, { "epoch": 0.9300956585724798, "grad_norm": 0.041470855474472046, "learning_rate": 1.7057149865096888e-05, "loss": 3.0272, "step": 31600 }, { "epoch": 0.9330389992641648, "grad_norm": 0.04160791262984276, "learning_rate": 1.7040797972365302e-05, "loss": 3.0315, "step": 31700 }, { "epoch": 0.9359823399558499, "grad_norm": 0.04551346227526665, "learning_rate": 1.702444607963372e-05, "loss": 3.0297, "step": 31800 }, { "epoch": 0.9389256806475349, "grad_norm": 0.039839088916778564, "learning_rate": 1.7008094186902134e-05, "loss": 3.0228, "step": 31900 }, { "epoch": 0.94186902133922, "grad_norm": 0.046781621873378754, "learning_rate": 1.699174229417055e-05, "loss": 3.0246, "step": 32000 }, { "epoch": 0.94186902133922, "eval_loss": 3.0271835327148438, "eval_runtime": 692.7811, "eval_samples_per_second": 392.324, "eval_steps_per_second": 12.261, "step": 32000 }, { "epoch": 0.9448123620309051, "grad_norm": 0.044704366475343704, "learning_rate": 1.6975390401438966e-05, "loss": 3.0215, "step": 32100 }, { "epoch": 0.9477557027225901, "grad_norm": 0.04478640854358673, "learning_rate": 1.6959038508707384e-05, "loss": 3.0246, "step": 32200 }, { "epoch": 0.9506990434142752, "grad_norm": 0.05617917701601982, "learning_rate": 1.69426866159758e-05, "loss": 3.0333, "step": 32300 }, { "epoch": 0.9536423841059603, "grad_norm": 0.050819963216781616, "learning_rate": 1.692633472324422e-05, "loss": 3.0334, "step": 32400 }, { "epoch": 0.9565857247976454, "grad_norm": 0.05514681711792946, "learning_rate": 1.6909982830512633e-05, "loss": 3.029, "step": 32500 }, { "epoch": 0.9565857247976454, "eval_loss": 3.027134895324707, "eval_runtime": 693.2503, "eval_samples_per_second": 392.059, "eval_steps_per_second": 12.252, "step": 32500 }, { "epoch": 0.9595290654893304, "grad_norm": 0.052329398691654205, "learning_rate": 1.6893630937781048e-05, "loss": 3.0328, "step": 32600 }, { "epoch": 0.9624724061810155, "grad_norm": 0.041431643068790436, "learning_rate": 1.6877279045049465e-05, "loss": 3.0284, "step": 32700 }, { "epoch": 0.9654157468727005, "grad_norm": 0.05102865770459175, "learning_rate": 1.6860927152317883e-05, "loss": 3.0327, "step": 32800 }, { "epoch": 0.9683590875643856, "grad_norm": 0.04777172952890396, "learning_rate": 1.6844575259586297e-05, "loss": 3.0228, "step": 32900 }, { "epoch": 0.9713024282560706, "grad_norm": 0.04937748238444328, "learning_rate": 1.6828223366854715e-05, "loss": 3.0321, "step": 33000 }, { "epoch": 0.9713024282560706, "eval_loss": 3.0266990661621094, "eval_runtime": 699.1186, "eval_samples_per_second": 388.768, "eval_steps_per_second": 12.15, "step": 33000 }, { "epoch": 0.9742457689477557, "grad_norm": 0.04047486186027527, "learning_rate": 1.6811871474123133e-05, "loss": 3.0277, "step": 33100 }, { "epoch": 0.9771891096394407, "grad_norm": 0.04767972230911255, "learning_rate": 1.6795519581391547e-05, "loss": 3.0309, "step": 33200 }, { "epoch": 0.9801324503311258, "grad_norm": 0.05382005497813225, "learning_rate": 1.677916768865996e-05, "loss": 3.0265, "step": 33300 }, { "epoch": 0.9830757910228108, "grad_norm": 0.04794803634285927, "learning_rate": 1.676281579592838e-05, "loss": 3.029, "step": 33400 }, { "epoch": 0.986019131714496, "grad_norm": 0.0460403673350811, "learning_rate": 1.6746463903196797e-05, "loss": 3.0315, "step": 33500 }, { "epoch": 0.986019131714496, "eval_loss": 3.025664806365967, "eval_runtime": 699.719, "eval_samples_per_second": 388.434, "eval_steps_per_second": 12.139, "step": 33500 }, { "epoch": 0.9889624724061811, "grad_norm": 0.044208597391843796, "learning_rate": 1.6730112010465214e-05, "loss": 3.0233, "step": 33600 }, { "epoch": 0.9919058130978661, "grad_norm": 0.04156910628080368, "learning_rate": 1.671376011773363e-05, "loss": 3.0208, "step": 33700 }, { "epoch": 0.9948491537895512, "grad_norm": 0.045962389558553696, "learning_rate": 1.6697408225002047e-05, "loss": 3.0296, "step": 33800 }, { "epoch": 0.9977924944812362, "grad_norm": 0.04260554164648056, "learning_rate": 1.668105633227046e-05, "loss": 3.0271, "step": 33900 }, { "epoch": 1.0007358351729212, "grad_norm": 0.046383216977119446, "learning_rate": 1.666470443953888e-05, "loss": 3.0258, "step": 34000 }, { "epoch": 1.0007358351729212, "eval_loss": 3.026073932647705, "eval_runtime": 703.0834, "eval_samples_per_second": 386.576, "eval_steps_per_second": 12.081, "step": 34000 }, { "epoch": 1.0036791758646064, "grad_norm": 0.05162841081619263, "learning_rate": 1.6648352546807293e-05, "loss": 3.0233, "step": 34100 }, { "epoch": 1.0066225165562914, "grad_norm": 0.04869641363620758, "learning_rate": 1.663200065407571e-05, "loss": 3.0283, "step": 34200 }, { "epoch": 1.0095658572479764, "grad_norm": 0.04071588069200516, "learning_rate": 1.6615648761344128e-05, "loss": 3.0277, "step": 34300 }, { "epoch": 1.0125091979396614, "grad_norm": 0.044576410204172134, "learning_rate": 1.6599296868612543e-05, "loss": 3.0233, "step": 34400 }, { "epoch": 1.0154525386313467, "grad_norm": 0.04745708778500557, "learning_rate": 1.658294497588096e-05, "loss": 3.0296, "step": 34500 }, { "epoch": 1.0154525386313467, "eval_loss": 3.027028799057007, "eval_runtime": 701.3622, "eval_samples_per_second": 387.524, "eval_steps_per_second": 12.111, "step": 34500 }, { "epoch": 1.0183958793230317, "grad_norm": 0.052702803164720535, "learning_rate": 1.6566593083149375e-05, "loss": 3.0321, "step": 34600 }, { "epoch": 1.0213392200147167, "grad_norm": 0.045018065720796585, "learning_rate": 1.6550241190417792e-05, "loss": 3.0314, "step": 34700 }, { "epoch": 1.0242825607064017, "grad_norm": 0.043952833861112595, "learning_rate": 1.6533889297686207e-05, "loss": 3.0458, "step": 34800 }, { "epoch": 1.027225901398087, "grad_norm": 0.04676663130521774, "learning_rate": 1.6517700923881942e-05, "loss": 3.0415, "step": 34900 }, { "epoch": 1.030169242089772, "grad_norm": 0.04945986717939377, "learning_rate": 1.6501349031150356e-05, "loss": 3.0271, "step": 35000 }, { "epoch": 1.030169242089772, "eval_loss": 3.0261125564575195, "eval_runtime": 699.4882, "eval_samples_per_second": 388.563, "eval_steps_per_second": 12.143, "step": 35000 }, { "epoch": 1.033112582781457, "grad_norm": 0.03854461386799812, "learning_rate": 1.648499713841877e-05, "loss": 3.0252, "step": 35100 }, { "epoch": 1.0360559234731421, "grad_norm": 0.0414164774119854, "learning_rate": 1.646864524568719e-05, "loss": 3.0327, "step": 35200 }, { "epoch": 1.0389992641648271, "grad_norm": 0.04749782383441925, "learning_rate": 1.6452293352955606e-05, "loss": 3.0302, "step": 35300 }, { "epoch": 1.0419426048565121, "grad_norm": 0.04278954491019249, "learning_rate": 1.6435941460224024e-05, "loss": 3.0264, "step": 35400 }, { "epoch": 1.0448859455481971, "grad_norm": 0.04500415176153183, "learning_rate": 1.6419589567492438e-05, "loss": 3.0314, "step": 35500 }, { "epoch": 1.0448859455481971, "eval_loss": 3.026909589767456, "eval_runtime": 696.7549, "eval_samples_per_second": 390.087, "eval_steps_per_second": 12.191, "step": 35500 }, { "epoch": 1.0478292862398824, "grad_norm": 0.03979481756687164, "learning_rate": 1.6403237674760856e-05, "loss": 3.0252, "step": 35600 }, { "epoch": 1.0507726269315674, "grad_norm": 0.0392538346350193, "learning_rate": 1.638688578202927e-05, "loss": 3.0302, "step": 35700 }, { "epoch": 1.0537159676232524, "grad_norm": 0.041618045419454575, "learning_rate": 1.6370533889297688e-05, "loss": 3.0339, "step": 35800 }, { "epoch": 1.0566593083149374, "grad_norm": 0.0361081026494503, "learning_rate": 1.6354181996566102e-05, "loss": 3.0277, "step": 35900 }, { "epoch": 1.0596026490066226, "grad_norm": 0.041705869138240814, "learning_rate": 1.633783010383452e-05, "loss": 3.0314, "step": 36000 }, { "epoch": 1.0596026490066226, "eval_loss": 3.023240327835083, "eval_runtime": 696.5603, "eval_samples_per_second": 390.196, "eval_steps_per_second": 12.194, "step": 36000 }, { "epoch": 1.0625459896983076, "grad_norm": 0.0506197027862072, "learning_rate": 1.6321478211102938e-05, "loss": 3.0339, "step": 36100 }, { "epoch": 1.0654893303899926, "grad_norm": 0.04140015318989754, "learning_rate": 1.6305126318371355e-05, "loss": 3.0233, "step": 36200 }, { "epoch": 1.0684326710816776, "grad_norm": 0.04486558958888054, "learning_rate": 1.628877442563977e-05, "loss": 3.0264, "step": 36300 }, { "epoch": 1.0713760117733628, "grad_norm": 0.04434411600232124, "learning_rate": 1.62725860518355e-05, "loss": 3.0246, "step": 36400 }, { "epoch": 1.0743193524650478, "grad_norm": 0.04165821895003319, "learning_rate": 1.625623415910392e-05, "loss": 3.0252, "step": 36500 }, { "epoch": 1.0743193524650478, "eval_loss": 3.024155616760254, "eval_runtime": 699.4613, "eval_samples_per_second": 388.578, "eval_steps_per_second": 12.144, "step": 36500 }, { "epoch": 1.0772626931567328, "grad_norm": 0.044000472873449326, "learning_rate": 1.6239882266372334e-05, "loss": 3.027, "step": 36600 }, { "epoch": 1.0802060338484178, "grad_norm": 0.04268326982855797, "learning_rate": 1.622353037364075e-05, "loss": 3.0202, "step": 36700 }, { "epoch": 1.083149374540103, "grad_norm": 0.0391557477414608, "learning_rate": 1.6207178480909166e-05, "loss": 3.0245, "step": 36800 }, { "epoch": 1.086092715231788, "grad_norm": 0.03775768727064133, "learning_rate": 1.6190826588177583e-05, "loss": 3.0239, "step": 36900 }, { "epoch": 1.089036055923473, "grad_norm": 0.04041856899857521, "learning_rate": 1.6174474695445998e-05, "loss": 3.022, "step": 37000 }, { "epoch": 1.089036055923473, "eval_loss": 3.0228731632232666, "eval_runtime": 705.4577, "eval_samples_per_second": 385.275, "eval_steps_per_second": 12.04, "step": 37000 }, { "epoch": 1.0919793966151583, "grad_norm": 0.04425014555454254, "learning_rate": 1.6158122802714415e-05, "loss": 3.0164, "step": 37100 }, { "epoch": 1.0949227373068433, "grad_norm": 0.043370459228754044, "learning_rate": 1.6141770909982833e-05, "loss": 3.0289, "step": 37200 }, { "epoch": 1.0978660779985283, "grad_norm": 0.03540320694446564, "learning_rate": 1.612541901725125e-05, "loss": 3.012, "step": 37300 }, { "epoch": 1.1008094186902133, "grad_norm": 0.04062052443623543, "learning_rate": 1.6109067124519665e-05, "loss": 3.027, "step": 37400 }, { "epoch": 1.1037527593818985, "grad_norm": 0.039283912628889084, "learning_rate": 1.609271523178808e-05, "loss": 3.0283, "step": 37500 }, { "epoch": 1.1037527593818985, "eval_loss": 3.0229454040527344, "eval_runtime": 705.5784, "eval_samples_per_second": 385.209, "eval_steps_per_second": 12.038, "step": 37500 }, { "epoch": 1.1066961000735835, "grad_norm": 0.03529166430234909, "learning_rate": 1.6076363339056497e-05, "loss": 3.0289, "step": 37600 }, { "epoch": 1.1096394407652685, "grad_norm": 0.03998275101184845, "learning_rate": 1.606001144632491e-05, "loss": 3.0264, "step": 37700 }, { "epoch": 1.1125827814569536, "grad_norm": 0.037398431450128555, "learning_rate": 1.604365955359333e-05, "loss": 3.0295, "step": 37800 }, { "epoch": 1.1155261221486388, "grad_norm": 0.04391919821500778, "learning_rate": 1.6027307660861747e-05, "loss": 3.0245, "step": 37900 }, { "epoch": 1.1184694628403238, "grad_norm": 0.03190884366631508, "learning_rate": 1.6010955768130165e-05, "loss": 3.0301, "step": 38000 }, { "epoch": 1.1184694628403238, "eval_loss": 3.0226452350616455, "eval_runtime": 697.3322, "eval_samples_per_second": 389.764, "eval_steps_per_second": 12.181, "step": 38000 }, { "epoch": 1.1214128035320088, "grad_norm": 0.04810122027993202, "learning_rate": 1.599460387539858e-05, "loss": 3.0276, "step": 38100 }, { "epoch": 1.124356144223694, "grad_norm": 0.044550951570272446, "learning_rate": 1.5978251982666993e-05, "loss": 3.0264, "step": 38200 }, { "epoch": 1.127299484915379, "grad_norm": 0.04414360597729683, "learning_rate": 1.596190008993541e-05, "loss": 3.0264, "step": 38300 }, { "epoch": 1.130242825607064, "grad_norm": 0.0366351418197155, "learning_rate": 1.594554819720383e-05, "loss": 3.022, "step": 38400 }, { "epoch": 1.133186166298749, "grad_norm": 0.0371224544942379, "learning_rate": 1.5929196304472243e-05, "loss": 3.0308, "step": 38500 }, { "epoch": 1.133186166298749, "eval_loss": 3.0242979526519775, "eval_runtime": 704.2311, "eval_samples_per_second": 385.946, "eval_steps_per_second": 12.061, "step": 38500 }, { "epoch": 1.136129506990434, "grad_norm": 0.04241457208991051, "learning_rate": 1.591284441174066e-05, "loss": 3.022, "step": 38600 }, { "epoch": 1.1390728476821192, "grad_norm": 0.04301483929157257, "learning_rate": 1.589649251900908e-05, "loss": 3.027, "step": 38700 }, { "epoch": 1.1420161883738043, "grad_norm": 0.0391111858189106, "learning_rate": 1.5880140626277493e-05, "loss": 3.0189, "step": 38800 }, { "epoch": 1.1449595290654893, "grad_norm": 0.043348073959350586, "learning_rate": 1.5863788733545907e-05, "loss": 3.0282, "step": 38900 }, { "epoch": 1.1479028697571745, "grad_norm": 0.044031333178281784, "learning_rate": 1.5847436840814325e-05, "loss": 3.0226, "step": 39000 }, { "epoch": 1.1479028697571745, "eval_loss": 3.0228075981140137, "eval_runtime": 705.3629, "eval_samples_per_second": 385.326, "eval_steps_per_second": 12.042, "step": 39000 }, { "epoch": 1.1508462104488595, "grad_norm": 0.03850887343287468, "learning_rate": 1.5831084948082742e-05, "loss": 3.0257, "step": 39100 }, { "epoch": 1.1537895511405445, "grad_norm": 0.03670928254723549, "learning_rate": 1.581473305535116e-05, "loss": 3.0201, "step": 39200 }, { "epoch": 1.1567328918322295, "grad_norm": 0.04201032221317291, "learning_rate": 1.5798381162619574e-05, "loss": 3.0282, "step": 39300 }, { "epoch": 1.1596762325239147, "grad_norm": 0.042614907026290894, "learning_rate": 1.5782029269887992e-05, "loss": 3.0395, "step": 39400 }, { "epoch": 1.1626195732155997, "grad_norm": 0.04534539580345154, "learning_rate": 1.5765677377156406e-05, "loss": 3.042, "step": 39500 }, { "epoch": 1.1626195732155997, "eval_loss": 3.033963203430176, "eval_runtime": 707.6872, "eval_samples_per_second": 384.061, "eval_steps_per_second": 12.002, "step": 39500 }, { "epoch": 1.1655629139072847, "grad_norm": 0.03145177662372589, "learning_rate": 1.5749325484424824e-05, "loss": 3.0432, "step": 39600 }, { "epoch": 1.1685062545989697, "grad_norm": 0.035807639360427856, "learning_rate": 1.573297359169324e-05, "loss": 3.0214, "step": 39700 }, { "epoch": 1.171449595290655, "grad_norm": 0.04198603332042694, "learning_rate": 1.5716621698961656e-05, "loss": 3.022, "step": 39800 }, { "epoch": 1.17439293598234, "grad_norm": 0.04382916912436485, "learning_rate": 1.5700269806230074e-05, "loss": 3.0245, "step": 39900 }, { "epoch": 1.177336276674025, "grad_norm": 0.03799319639801979, "learning_rate": 1.5683917913498488e-05, "loss": 3.032, "step": 40000 }, { "epoch": 1.177336276674025, "eval_loss": 3.0275678634643555, "eval_runtime": 705.2955, "eval_samples_per_second": 385.363, "eval_steps_per_second": 12.043, "step": 40000 }, { "epoch": 1.1802796173657102, "grad_norm": 0.035891782492399216, "learning_rate": 1.5667566020766906e-05, "loss": 3.0389, "step": 40100 }, { "epoch": 1.1832229580573952, "grad_norm": 0.0373951718211174, "learning_rate": 1.565121412803532e-05, "loss": 3.0332, "step": 40200 }, { "epoch": 1.1861662987490802, "grad_norm": 0.041627395898103714, "learning_rate": 1.5635025754231056e-05, "loss": 3.0689, "step": 40300 }, { "epoch": 1.1891096394407652, "grad_norm": 0.03872070834040642, "learning_rate": 1.561867386149947e-05, "loss": 3.0476, "step": 40400 }, { "epoch": 1.1920529801324504, "grad_norm": 0.030206803232431412, "learning_rate": 1.5602321968767888e-05, "loss": 3.0626, "step": 40500 }, { "epoch": 1.1920529801324504, "eval_loss": 3.0399138927459717, "eval_runtime": 700.1653, "eval_samples_per_second": 388.187, "eval_steps_per_second": 12.131, "step": 40500 }, { "epoch": 1.1949963208241354, "grad_norm": 0.03909817337989807, "learning_rate": 1.5585970076036302e-05, "loss": 3.0357, "step": 40600 }, { "epoch": 1.1979396615158204, "grad_norm": 0.029385680332779884, "learning_rate": 1.5569618183304716e-05, "loss": 3.0282, "step": 40700 }, { "epoch": 1.2008830022075054, "grad_norm": 0.03951384499669075, "learning_rate": 1.5553266290573134e-05, "loss": 3.0276, "step": 40800 }, { "epoch": 1.2038263428991907, "grad_norm": 0.03907546401023865, "learning_rate": 1.553691439784155e-05, "loss": 3.032, "step": 40900 }, { "epoch": 1.2067696835908757, "grad_norm": 0.03716601803898811, "learning_rate": 1.552056250510997e-05, "loss": 3.0189, "step": 41000 }, { "epoch": 1.2067696835908757, "eval_loss": 3.0256309509277344, "eval_runtime": 705.1784, "eval_samples_per_second": 385.427, "eval_steps_per_second": 12.045, "step": 41000 }, { "epoch": 1.2097130242825607, "grad_norm": 0.037342239171266556, "learning_rate": 1.5504210612378384e-05, "loss": 3.0276, "step": 41100 }, { "epoch": 1.212656364974246, "grad_norm": 0.03826428949832916, "learning_rate": 1.54878587196468e-05, "loss": 3.0276, "step": 41200 }, { "epoch": 1.215599705665931, "grad_norm": 0.03287817910313606, "learning_rate": 1.5471506826915216e-05, "loss": 3.0276, "step": 41300 }, { "epoch": 1.218543046357616, "grad_norm": 0.02971961908042431, "learning_rate": 1.5455154934183633e-05, "loss": 3.0301, "step": 41400 }, { "epoch": 1.221486387049301, "grad_norm": 0.03172152116894722, "learning_rate": 1.5438803041452048e-05, "loss": 3.0238, "step": 41500 }, { "epoch": 1.221486387049301, "eval_loss": 3.0262269973754883, "eval_runtime": 710.6269, "eval_samples_per_second": 382.472, "eval_steps_per_second": 11.953, "step": 41500 }, { "epoch": 1.224429727740986, "grad_norm": 0.035707708448171616, "learning_rate": 1.5422451148720465e-05, "loss": 3.0326, "step": 41600 }, { "epoch": 1.2273730684326711, "grad_norm": 0.04119321331381798, "learning_rate": 1.5406099255988883e-05, "loss": 3.0295, "step": 41700 }, { "epoch": 1.2303164091243561, "grad_norm": 0.03912203758955002, "learning_rate": 1.5389747363257297e-05, "loss": 3.0307, "step": 41800 }, { "epoch": 1.2332597498160411, "grad_norm": 0.03561973571777344, "learning_rate": 1.5373395470525715e-05, "loss": 3.0351, "step": 41900 }, { "epoch": 1.2362030905077264, "grad_norm": 0.033763110637664795, "learning_rate": 1.535704357779413e-05, "loss": 3.0301, "step": 42000 }, { "epoch": 1.2362030905077264, "eval_loss": 3.024171829223633, "eval_runtime": 706.4214, "eval_samples_per_second": 384.749, "eval_steps_per_second": 12.024, "step": 42000 }, { "epoch": 1.2391464311994114, "grad_norm": 0.032466236501932144, "learning_rate": 1.5340691685062547e-05, "loss": 3.0238, "step": 42100 }, { "epoch": 1.2420897718910964, "grad_norm": 0.0365438349545002, "learning_rate": 1.5324339792330965e-05, "loss": 3.0232, "step": 42200 }, { "epoch": 1.2450331125827814, "grad_norm": 0.03168785944581032, "learning_rate": 1.530798789959938e-05, "loss": 3.0301, "step": 42300 }, { "epoch": 1.2479764532744666, "grad_norm": 0.030201533809304237, "learning_rate": 1.5291636006867797e-05, "loss": 3.0201, "step": 42400 }, { "epoch": 1.2509197939661516, "grad_norm": 0.035826168954372406, "learning_rate": 1.5275284114136215e-05, "loss": 3.0295, "step": 42500 }, { "epoch": 1.2509197939661516, "eval_loss": 3.0241646766662598, "eval_runtime": 710.8074, "eval_samples_per_second": 382.375, "eval_steps_per_second": 11.95, "step": 42500 }, { "epoch": 1.2538631346578366, "grad_norm": 0.03881455212831497, "learning_rate": 1.525893222140463e-05, "loss": 3.0326, "step": 42600 }, { "epoch": 1.2568064753495216, "grad_norm": 0.0400051511824131, "learning_rate": 1.5242580328673045e-05, "loss": 3.0232, "step": 42700 }, { "epoch": 1.2597498160412068, "grad_norm": 0.03861435502767563, "learning_rate": 1.5226228435941461e-05, "loss": 3.0213, "step": 42800 }, { "epoch": 1.2626931567328918, "grad_norm": 0.03706415370106697, "learning_rate": 1.5209876543209879e-05, "loss": 3.0263, "step": 42900 }, { "epoch": 1.2656364974245768, "grad_norm": 0.03346759453415871, "learning_rate": 1.5193524650478295e-05, "loss": 3.0351, "step": 43000 }, { "epoch": 1.2656364974245768, "eval_loss": 3.023573160171509, "eval_runtime": 711.7038, "eval_samples_per_second": 381.893, "eval_steps_per_second": 11.935, "step": 43000 }, { "epoch": 1.268579838116262, "grad_norm": 0.034871216863393784, "learning_rate": 1.517717275774671e-05, "loss": 3.0295, "step": 43100 }, { "epoch": 1.271523178807947, "grad_norm": 0.03629124537110329, "learning_rate": 1.5160820865015127e-05, "loss": 3.0232, "step": 43200 }, { "epoch": 1.274466519499632, "grad_norm": 0.04041403532028198, "learning_rate": 1.5144468972283544e-05, "loss": 3.0207, "step": 43300 }, { "epoch": 1.277409860191317, "grad_norm": 0.04015040770173073, "learning_rate": 1.5128117079551959e-05, "loss": 3.027, "step": 43400 }, { "epoch": 1.280353200883002, "grad_norm": 0.03469613566994667, "learning_rate": 1.5111765186820375e-05, "loss": 3.0276, "step": 43500 }, { "epoch": 1.280353200883002, "eval_loss": 3.0234475135803223, "eval_runtime": 710.4796, "eval_samples_per_second": 382.551, "eval_steps_per_second": 11.955, "step": 43500 }, { "epoch": 1.2832965415746873, "grad_norm": 0.0314650759100914, "learning_rate": 1.5095413294088792e-05, "loss": 3.0257, "step": 43600 }, { "epoch": 1.2862398822663723, "grad_norm": 0.03408525511622429, "learning_rate": 1.5079061401357208e-05, "loss": 3.0263, "step": 43700 }, { "epoch": 1.2891832229580573, "grad_norm": 0.028295455500483513, "learning_rate": 1.5062709508625624e-05, "loss": 3.0163, "step": 43800 }, { "epoch": 1.2921265636497425, "grad_norm": 0.03355014696717262, "learning_rate": 1.504635761589404e-05, "loss": 3.0282, "step": 43900 }, { "epoch": 1.2950699043414275, "grad_norm": 0.03388101980090141, "learning_rate": 1.5030005723162458e-05, "loss": 3.0276, "step": 44000 }, { "epoch": 1.2950699043414275, "eval_loss": 3.0269548892974854, "eval_runtime": 702.7114, "eval_samples_per_second": 386.78, "eval_steps_per_second": 12.087, "step": 44000 }, { "epoch": 1.2980132450331126, "grad_norm": 0.04474165663123131, "learning_rate": 1.5013653830430874e-05, "loss": 3.032, "step": 44100 }, { "epoch": 1.3009565857247978, "grad_norm": 0.03156202286481857, "learning_rate": 1.4997301937699288e-05, "loss": 3.0326, "step": 44200 }, { "epoch": 1.3038999264164828, "grad_norm": 0.035475365817546844, "learning_rate": 1.4980950044967706e-05, "loss": 3.0288, "step": 44300 }, { "epoch": 1.3068432671081678, "grad_norm": 0.0380512960255146, "learning_rate": 1.4964598152236122e-05, "loss": 3.0263, "step": 44400 }, { "epoch": 1.3097866077998528, "grad_norm": 0.033017829060554504, "learning_rate": 1.494824625950454e-05, "loss": 3.0251, "step": 44500 }, { "epoch": 1.3097866077998528, "eval_loss": 3.0231380462646484, "eval_runtime": 701.806, "eval_samples_per_second": 387.279, "eval_steps_per_second": 12.103, "step": 44500 }, { "epoch": 1.3127299484915378, "grad_norm": 0.034274764358997345, "learning_rate": 1.4931894366772954e-05, "loss": 3.0188, "step": 44600 }, { "epoch": 1.315673289183223, "grad_norm": 0.04117763042449951, "learning_rate": 1.4915542474041372e-05, "loss": 3.0213, "step": 44700 }, { "epoch": 1.318616629874908, "grad_norm": 0.0369747020304203, "learning_rate": 1.4899190581309788e-05, "loss": 3.0157, "step": 44800 }, { "epoch": 1.321559970566593, "grad_norm": 0.03137701004743576, "learning_rate": 1.4882838688578206e-05, "loss": 3.0238, "step": 44900 }, { "epoch": 1.3245033112582782, "grad_norm": 0.03650786727666855, "learning_rate": 1.486648679584662e-05, "loss": 3.0263, "step": 45000 }, { "epoch": 1.3245033112582782, "eval_loss": 3.021385431289673, "eval_runtime": 706.3344, "eval_samples_per_second": 384.796, "eval_steps_per_second": 12.025, "step": 45000 }, { "epoch": 1.3274466519499633, "grad_norm": 0.036012180149555206, "learning_rate": 1.4850134903115036e-05, "loss": 3.0194, "step": 45100 }, { "epoch": 1.3303899926416483, "grad_norm": 0.03227170184254646, "learning_rate": 1.4833783010383454e-05, "loss": 3.0301, "step": 45200 }, { "epoch": 1.3333333333333333, "grad_norm": 0.03220890462398529, "learning_rate": 1.481743111765187e-05, "loss": 3.0232, "step": 45300 }, { "epoch": 1.3362766740250183, "grad_norm": 0.031222395598888397, "learning_rate": 1.4801079224920286e-05, "loss": 3.0163, "step": 45400 }, { "epoch": 1.3392200147167035, "grad_norm": 0.03059220314025879, "learning_rate": 1.4784727332188702e-05, "loss": 3.0157, "step": 45500 }, { "epoch": 1.3392200147167035, "eval_loss": 3.021414041519165, "eval_runtime": 706.6231, "eval_samples_per_second": 384.639, "eval_steps_per_second": 12.021, "step": 45500 }, { "epoch": 1.3421633554083885, "grad_norm": 0.029994744807481766, "learning_rate": 1.476837543945712e-05, "loss": 3.0219, "step": 45600 }, { "epoch": 1.3451066961000735, "grad_norm": 0.03986198455095291, "learning_rate": 1.4752023546725535e-05, "loss": 3.0169, "step": 45700 }, { "epoch": 1.3480500367917587, "grad_norm": 0.03173942491412163, "learning_rate": 1.473567165399395e-05, "loss": 3.0232, "step": 45800 }, { "epoch": 1.3509933774834437, "grad_norm": 0.03520718961954117, "learning_rate": 1.4719319761262367e-05, "loss": 3.0344, "step": 45900 }, { "epoch": 1.3539367181751287, "grad_norm": 0.024432122707366943, "learning_rate": 1.4702967868530783e-05, "loss": 3.0219, "step": 46000 }, { "epoch": 1.3539367181751287, "eval_loss": 3.0208773612976074, "eval_runtime": 699.7911, "eval_samples_per_second": 388.394, "eval_steps_per_second": 12.138, "step": 46000 }, { "epoch": 1.356880058866814, "grad_norm": 0.03163484111428261, "learning_rate": 1.4686779494726515e-05, "loss": 3.0183, "step": 46100 }, { "epoch": 1.359823399558499, "grad_norm": 0.03501145541667938, "learning_rate": 1.4670427601994932e-05, "loss": 3.0207, "step": 46200 }, { "epoch": 1.362766740250184, "grad_norm": 0.026868801563978195, "learning_rate": 1.465407570926335e-05, "loss": 3.0351, "step": 46300 }, { "epoch": 1.365710080941869, "grad_norm": 0.03200540691614151, "learning_rate": 1.4637723816531764e-05, "loss": 3.0244, "step": 46400 }, { "epoch": 1.368653421633554, "grad_norm": 0.039930980652570724, "learning_rate": 1.4621371923800181e-05, "loss": 3.0194, "step": 46500 }, { "epoch": 1.368653421633554, "eval_loss": 3.0208253860473633, "eval_runtime": 698.6849, "eval_samples_per_second": 389.009, "eval_steps_per_second": 12.157, "step": 46500 }, { "epoch": 1.3715967623252392, "grad_norm": 0.037147484719753265, "learning_rate": 1.4605020031068597e-05, "loss": 3.0176, "step": 46600 }, { "epoch": 1.3745401030169242, "grad_norm": 0.0367245152592659, "learning_rate": 1.4588668138337015e-05, "loss": 3.0244, "step": 46700 }, { "epoch": 1.3774834437086092, "grad_norm": 0.03272568807005882, "learning_rate": 1.457231624560543e-05, "loss": 3.0263, "step": 46800 }, { "epoch": 1.3804267844002944, "grad_norm": 0.0346045047044754, "learning_rate": 1.4555964352873845e-05, "loss": 3.0151, "step": 46900 }, { "epoch": 1.3833701250919794, "grad_norm": 0.028040537610650063, "learning_rate": 1.4539612460142263e-05, "loss": 3.0226, "step": 47000 }, { "epoch": 1.3833701250919794, "eval_loss": 3.020796060562134, "eval_runtime": 690.659, "eval_samples_per_second": 393.53, "eval_steps_per_second": 12.298, "step": 47000 }, { "epoch": 1.3863134657836644, "grad_norm": 0.028161419555544853, "learning_rate": 1.4523260567410679e-05, "loss": 3.0213, "step": 47100 }, { "epoch": 1.3892568064753497, "grad_norm": 0.026188310235738754, "learning_rate": 1.4506908674679095e-05, "loss": 3.0307, "step": 47200 }, { "epoch": 1.3922001471670347, "grad_norm": 0.026172220706939697, "learning_rate": 1.4490556781947511e-05, "loss": 3.0244, "step": 47300 }, { "epoch": 1.3951434878587197, "grad_norm": 0.04115700721740723, "learning_rate": 1.4474204889215929e-05, "loss": 3.0238, "step": 47400 }, { "epoch": 1.3980868285504047, "grad_norm": 0.029974378645420074, "learning_rate": 1.4457852996484345e-05, "loss": 3.0276, "step": 47500 }, { "epoch": 1.3980868285504047, "eval_loss": 3.0206704139709473, "eval_runtime": 676.6168, "eval_samples_per_second": 401.697, "eval_steps_per_second": 12.554, "step": 47500 }, { "epoch": 1.4010301692420897, "grad_norm": 0.0302524883300066, "learning_rate": 1.4441501103752759e-05, "loss": 3.0282, "step": 47600 }, { "epoch": 1.403973509933775, "grad_norm": 0.03158809244632721, "learning_rate": 1.4425149211021177e-05, "loss": 3.0201, "step": 47700 }, { "epoch": 1.40691685062546, "grad_norm": 0.03227172791957855, "learning_rate": 1.4408797318289593e-05, "loss": 3.0226, "step": 47800 }, { "epoch": 1.409860191317145, "grad_norm": 0.031490180641412735, "learning_rate": 1.439244542555801e-05, "loss": 3.0263, "step": 47900 }, { "epoch": 1.4128035320088301, "grad_norm": 0.029675040394067764, "learning_rate": 1.4376093532826425e-05, "loss": 3.0213, "step": 48000 }, { "epoch": 1.4128035320088301, "eval_loss": 3.0207791328430176, "eval_runtime": 676.9788, "eval_samples_per_second": 401.482, "eval_steps_per_second": 12.547, "step": 48000 }, { "epoch": 1.4157468727005151, "grad_norm": 0.026579368859529495, "learning_rate": 1.4359741640094842e-05, "loss": 3.0201, "step": 48100 }, { "epoch": 1.4186902133922001, "grad_norm": 0.03473450243473053, "learning_rate": 1.4343389747363258e-05, "loss": 3.0207, "step": 48200 }, { "epoch": 1.4216335540838851, "grad_norm": 0.031997546553611755, "learning_rate": 1.4327037854631676e-05, "loss": 3.0288, "step": 48300 }, { "epoch": 1.4245768947755701, "grad_norm": 0.033612512052059174, "learning_rate": 1.431068596190009e-05, "loss": 3.0182, "step": 48400 }, { "epoch": 1.4275202354672554, "grad_norm": 0.028787825256586075, "learning_rate": 1.4294334069168507e-05, "loss": 3.0263, "step": 48500 }, { "epoch": 1.4275202354672554, "eval_loss": 3.020042896270752, "eval_runtime": 675.8716, "eval_samples_per_second": 402.14, "eval_steps_per_second": 12.567, "step": 48500 }, { "epoch": 1.4304635761589404, "grad_norm": 0.028570111840963364, "learning_rate": 1.4277982176436924e-05, "loss": 3.0207, "step": 48600 }, { "epoch": 1.4334069168506254, "grad_norm": 0.03498288244009018, "learning_rate": 1.4261630283705342e-05, "loss": 3.0332, "step": 48700 }, { "epoch": 1.4363502575423106, "grad_norm": 0.030701283365488052, "learning_rate": 1.4245278390973756e-05, "loss": 3.0201, "step": 48800 }, { "epoch": 1.4392935982339956, "grad_norm": 0.03406507149338722, "learning_rate": 1.4228926498242172e-05, "loss": 3.0182, "step": 48900 }, { "epoch": 1.4422369389256806, "grad_norm": 0.02542264200747013, "learning_rate": 1.421257460551059e-05, "loss": 3.0188, "step": 49000 }, { "epoch": 1.4422369389256806, "eval_loss": 3.020019054412842, "eval_runtime": 673.1567, "eval_samples_per_second": 403.762, "eval_steps_per_second": 12.618, "step": 49000 }, { "epoch": 1.4451802796173658, "grad_norm": 0.032538481056690216, "learning_rate": 1.4196222712779006e-05, "loss": 3.0213, "step": 49100 }, { "epoch": 1.4481236203090508, "grad_norm": 0.033189062029123306, "learning_rate": 1.417987082004742e-05, "loss": 3.0144, "step": 49200 }, { "epoch": 1.4510669610007358, "grad_norm": 0.03463747352361679, "learning_rate": 1.4163518927315838e-05, "loss": 3.0257, "step": 49300 }, { "epoch": 1.4540103016924208, "grad_norm": 0.026184063404798508, "learning_rate": 1.4147167034584256e-05, "loss": 3.0201, "step": 49400 }, { "epoch": 1.4569536423841059, "grad_norm": 0.031247887760400772, "learning_rate": 1.413081514185267e-05, "loss": 3.0238, "step": 49500 }, { "epoch": 1.4569536423841059, "eval_loss": 3.0190837383270264, "eval_runtime": 681.9079, "eval_samples_per_second": 398.58, "eval_steps_per_second": 12.456, "step": 49500 }, { "epoch": 1.459896983075791, "grad_norm": 0.023587316274642944, "learning_rate": 1.4114463249121086e-05, "loss": 3.0294, "step": 49600 }, { "epoch": 1.462840323767476, "grad_norm": 0.029222723096609116, "learning_rate": 1.4098111356389504e-05, "loss": 3.0226, "step": 49700 }, { "epoch": 1.465783664459161, "grad_norm": 0.028335383161902428, "learning_rate": 1.408175946365792e-05, "loss": 3.0194, "step": 49800 }, { "epoch": 1.4687270051508463, "grad_norm": 0.026601411402225494, "learning_rate": 1.4065407570926336e-05, "loss": 3.0169, "step": 49900 }, { "epoch": 1.4716703458425313, "grad_norm": 0.03292285278439522, "learning_rate": 1.4049055678194752e-05, "loss": 3.0207, "step": 50000 }, { "epoch": 1.4716703458425313, "eval_loss": 3.018892288208008, "eval_runtime": 674.9438, "eval_samples_per_second": 402.693, "eval_steps_per_second": 12.585, "step": 50000 }, { "epoch": 1.4746136865342163, "grad_norm": 0.02647383138537407, "learning_rate": 1.403270378546317e-05, "loss": 3.0219, "step": 50100 }, { "epoch": 1.4775570272259013, "grad_norm": 0.03448876366019249, "learning_rate": 1.4016351892731585e-05, "loss": 3.0194, "step": 50200 }, { "epoch": 1.4805003679175863, "grad_norm": 0.031194327399134636, "learning_rate": 1.4e-05, "loss": 3.0126, "step": 50300 }, { "epoch": 1.4834437086092715, "grad_norm": 0.02944614365696907, "learning_rate": 1.3983648107268417e-05, "loss": 3.0194, "step": 50400 }, { "epoch": 1.4863870493009566, "grad_norm": 0.027645617723464966, "learning_rate": 1.3967296214536834e-05, "loss": 3.0163, "step": 50500 }, { "epoch": 1.4863870493009566, "eval_loss": 3.020841121673584, "eval_runtime": 672.2554, "eval_samples_per_second": 404.303, "eval_steps_per_second": 12.635, "step": 50500 }, { "epoch": 1.4893303899926416, "grad_norm": 0.02899807132780552, "learning_rate": 1.3950944321805251e-05, "loss": 3.0182, "step": 50600 }, { "epoch": 1.4922737306843268, "grad_norm": 0.029714033007621765, "learning_rate": 1.3934592429073666e-05, "loss": 3.0169, "step": 50700 }, { "epoch": 1.4952170713760118, "grad_norm": 0.022833334282040596, "learning_rate": 1.3918240536342083e-05, "loss": 3.0188, "step": 50800 }, { "epoch": 1.4981604120676968, "grad_norm": 0.021909192204475403, "learning_rate": 1.39018886436105e-05, "loss": 3.0219, "step": 50900 }, { "epoch": 1.501103752759382, "grad_norm": 0.029616709798574448, "learning_rate": 1.3885536750878917e-05, "loss": 3.0169, "step": 51000 }, { "epoch": 1.501103752759382, "eval_loss": 3.0200018882751465, "eval_runtime": 670.0957, "eval_samples_per_second": 405.606, "eval_steps_per_second": 12.676, "step": 51000 }, { "epoch": 1.5040470934510668, "grad_norm": 0.0254743043333292, "learning_rate": 1.3869184858147331e-05, "loss": 3.0294, "step": 51100 }, { "epoch": 1.506990434142752, "grad_norm": 0.030154503881931305, "learning_rate": 1.3852832965415747e-05, "loss": 3.0207, "step": 51200 }, { "epoch": 1.5099337748344372, "grad_norm": 0.03610174357891083, "learning_rate": 1.3836481072684165e-05, "loss": 3.02, "step": 51300 }, { "epoch": 1.512877115526122, "grad_norm": 0.030488867312669754, "learning_rate": 1.3820129179952581e-05, "loss": 3.0207, "step": 51400 }, { "epoch": 1.5158204562178073, "grad_norm": 0.029709601774811745, "learning_rate": 1.3803777287220997e-05, "loss": 3.0175, "step": 51500 }, { "epoch": 1.5158204562178073, "eval_loss": 3.01955246925354, "eval_runtime": 666.5545, "eval_samples_per_second": 407.761, "eval_steps_per_second": 12.743, "step": 51500 }, { "epoch": 1.5187637969094923, "grad_norm": 0.023889383301138878, "learning_rate": 1.3787425394489413e-05, "loss": 3.0225, "step": 51600 }, { "epoch": 1.5217071376011773, "grad_norm": 0.02330232597887516, "learning_rate": 1.377107350175783e-05, "loss": 3.0213, "step": 51700 }, { "epoch": 1.5246504782928625, "grad_norm": 0.028098197653889656, "learning_rate": 1.3754721609026247e-05, "loss": 3.02, "step": 51800 }, { "epoch": 1.5275938189845475, "grad_norm": 0.026527347043156624, "learning_rate": 1.3738369716294661e-05, "loss": 3.0232, "step": 51900 }, { "epoch": 1.5305371596762325, "grad_norm": 0.02484341710805893, "learning_rate": 1.3722017823563079e-05, "loss": 3.0275, "step": 52000 }, { "epoch": 1.5305371596762325, "eval_loss": 3.0187878608703613, "eval_runtime": 665.1343, "eval_samples_per_second": 408.632, "eval_steps_per_second": 12.77, "step": 52000 }, { "epoch": 1.5334805003679177, "grad_norm": 0.02860873006284237, "learning_rate": 1.3705665930831495e-05, "loss": 3.0169, "step": 52100 }, { "epoch": 1.5364238410596025, "grad_norm": 0.027475519105792046, "learning_rate": 1.368931403809991e-05, "loss": 3.02, "step": 52200 }, { "epoch": 1.5393671817512877, "grad_norm": 0.024033529683947563, "learning_rate": 1.3672962145368327e-05, "loss": 3.0232, "step": 52300 }, { "epoch": 1.5423105224429727, "grad_norm": 0.03173499554395676, "learning_rate": 1.3656610252636744e-05, "loss": 3.0125, "step": 52400 }, { "epoch": 1.5452538631346577, "grad_norm": 0.029520176351070404, "learning_rate": 1.364025835990516e-05, "loss": 3.0163, "step": 52500 }, { "epoch": 1.5452538631346577, "eval_loss": 3.0187578201293945, "eval_runtime": 659.8212, "eval_samples_per_second": 411.922, "eval_steps_per_second": 12.873, "step": 52500 }, { "epoch": 1.548197203826343, "grad_norm": 0.030009903013706207, "learning_rate": 1.3623906467173575e-05, "loss": 3.0163, "step": 52600 }, { "epoch": 1.551140544518028, "grad_norm": 0.026769885793328285, "learning_rate": 1.3607554574441993e-05, "loss": 3.0269, "step": 52700 }, { "epoch": 1.554083885209713, "grad_norm": 0.024303771555423737, "learning_rate": 1.3591202681710409e-05, "loss": 3.0194, "step": 52800 }, { "epoch": 1.5570272259013982, "grad_norm": 0.019666366279125214, "learning_rate": 1.3574850788978826e-05, "loss": 3.0238, "step": 52900 }, { "epoch": 1.5599705665930832, "grad_norm": 0.01830855756998062, "learning_rate": 1.355849889624724e-05, "loss": 3.02, "step": 53000 }, { "epoch": 1.5599705665930832, "eval_loss": 3.0183305740356445, "eval_runtime": 662.8104, "eval_samples_per_second": 410.064, "eval_steps_per_second": 12.815, "step": 53000 }, { "epoch": 1.5629139072847682, "grad_norm": 0.018736068159341812, "learning_rate": 1.3542147003515658e-05, "loss": 3.0175, "step": 53100 }, { "epoch": 1.5658572479764534, "grad_norm": 0.029150404036045074, "learning_rate": 1.3525795110784074e-05, "loss": 3.0157, "step": 53200 }, { "epoch": 1.5688005886681382, "grad_norm": 0.02467174641788006, "learning_rate": 1.3509443218052492e-05, "loss": 3.0157, "step": 53300 }, { "epoch": 1.5717439293598234, "grad_norm": 0.030708668753504753, "learning_rate": 1.3493091325320906e-05, "loss": 3.0232, "step": 53400 }, { "epoch": 1.5746872700515084, "grad_norm": 0.023188477382063866, "learning_rate": 1.3476739432589322e-05, "loss": 3.0238, "step": 53500 }, { "epoch": 1.5746872700515084, "eval_loss": 3.0181756019592285, "eval_runtime": 663.5687, "eval_samples_per_second": 409.596, "eval_steps_per_second": 12.8, "step": 53500 }, { "epoch": 1.5776306107431934, "grad_norm": 0.03518206626176834, "learning_rate": 1.346038753985774e-05, "loss": 3.0207, "step": 53600 }, { "epoch": 1.5805739514348787, "grad_norm": 0.026725132018327713, "learning_rate": 1.3444035647126156e-05, "loss": 3.0182, "step": 53700 }, { "epoch": 1.5835172921265637, "grad_norm": 0.030984964221715927, "learning_rate": 1.3427683754394572e-05, "loss": 3.0213, "step": 53800 }, { "epoch": 1.5864606328182487, "grad_norm": 0.024168606847524643, "learning_rate": 1.3411331861662988e-05, "loss": 3.0213, "step": 53900 }, { "epoch": 1.589403973509934, "grad_norm": 0.026421204209327698, "learning_rate": 1.3394979968931406e-05, "loss": 3.0125, "step": 54000 }, { "epoch": 1.589403973509934, "eval_loss": 3.0181386470794678, "eval_runtime": 663.2697, "eval_samples_per_second": 409.781, "eval_steps_per_second": 12.806, "step": 54000 }, { "epoch": 1.5923473142016187, "grad_norm": 0.027295192703604698, "learning_rate": 1.3378628076199822e-05, "loss": 3.0119, "step": 54100 }, { "epoch": 1.595290654893304, "grad_norm": 0.02131040394306183, "learning_rate": 1.3362276183468236e-05, "loss": 3.0194, "step": 54200 }, { "epoch": 1.598233995584989, "grad_norm": 0.023170916363596916, "learning_rate": 1.3345924290736654e-05, "loss": 3.0125, "step": 54300 }, { "epoch": 1.601177336276674, "grad_norm": 0.025637265294790268, "learning_rate": 1.3329572398005071e-05, "loss": 3.0257, "step": 54400 }, { "epoch": 1.6041206769683591, "grad_norm": 0.0258557740598917, "learning_rate": 1.3313220505273487e-05, "loss": 3.02, "step": 54500 }, { "epoch": 1.6041206769683591, "eval_loss": 3.0181374549865723, "eval_runtime": 665.1767, "eval_samples_per_second": 408.606, "eval_steps_per_second": 12.77, "step": 54500 }, { "epoch": 1.6070640176600441, "grad_norm": 0.024576343595981598, "learning_rate": 1.3296868612541902e-05, "loss": 3.0232, "step": 54600 }, { "epoch": 1.6100073583517291, "grad_norm": 0.028620576485991478, "learning_rate": 1.328051671981032e-05, "loss": 3.025, "step": 54700 }, { "epoch": 1.6129506990434144, "grad_norm": 0.02308052033185959, "learning_rate": 1.3264164827078736e-05, "loss": 3.0263, "step": 54800 }, { "epoch": 1.6158940397350994, "grad_norm": 0.026176270097494125, "learning_rate": 1.3247812934347153e-05, "loss": 3.0144, "step": 54900 }, { "epoch": 1.6188373804267844, "grad_norm": 0.02688279189169407, "learning_rate": 1.3231461041615568e-05, "loss": 3.0138, "step": 55000 }, { "epoch": 1.6188373804267844, "eval_loss": 3.017655611038208, "eval_runtime": 664.4845, "eval_samples_per_second": 409.031, "eval_steps_per_second": 12.783, "step": 55000 }, { "epoch": 1.6217807211184696, "grad_norm": 0.030299944803118706, "learning_rate": 1.3215109148883985e-05, "loss": 3.0207, "step": 55100 }, { "epoch": 1.6247240618101544, "grad_norm": 0.023143861442804337, "learning_rate": 1.3198757256152401e-05, "loss": 3.015, "step": 55200 }, { "epoch": 1.6276674025018396, "grad_norm": 0.03064398467540741, "learning_rate": 1.3182405363420816e-05, "loss": 3.0175, "step": 55300 }, { "epoch": 1.6306107431935246, "grad_norm": 0.03012877143919468, "learning_rate": 1.3166053470689233e-05, "loss": 3.0163, "step": 55400 }, { "epoch": 1.6335540838852096, "grad_norm": 0.030317330732941628, "learning_rate": 1.314970157795765e-05, "loss": 3.0157, "step": 55500 }, { "epoch": 1.6335540838852096, "eval_loss": 3.0172173976898193, "eval_runtime": 661.5886, "eval_samples_per_second": 410.822, "eval_steps_per_second": 12.839, "step": 55500 }, { "epoch": 1.6364974245768948, "grad_norm": 0.02268531732261181, "learning_rate": 1.3133349685226067e-05, "loss": 3.01, "step": 55600 }, { "epoch": 1.6394407652685798, "grad_norm": 0.027603456750512123, "learning_rate": 1.3116997792494481e-05, "loss": 3.0132, "step": 55700 }, { "epoch": 1.6423841059602649, "grad_norm": 0.02504940703511238, "learning_rate": 1.3100645899762899e-05, "loss": 3.0232, "step": 55800 }, { "epoch": 1.64532744665195, "grad_norm": 0.02349354512989521, "learning_rate": 1.3084294007031315e-05, "loss": 3.02, "step": 55900 }, { "epoch": 1.648270787343635, "grad_norm": 0.038360998034477234, "learning_rate": 1.3067942114299733e-05, "loss": 3.0163, "step": 56000 }, { "epoch": 1.648270787343635, "eval_loss": 3.014486312866211, "eval_runtime": 659.795, "eval_samples_per_second": 411.939, "eval_steps_per_second": 12.874, "step": 56000 }, { "epoch": 1.65121412803532, "grad_norm": 0.033007778227329254, "learning_rate": 1.3051590221568147e-05, "loss": 3.0132, "step": 56100 }, { "epoch": 1.6541574687270053, "grad_norm": 0.022328654304146767, "learning_rate": 1.3035238328836563e-05, "loss": 3.0219, "step": 56200 }, { "epoch": 1.65710080941869, "grad_norm": 0.024724245071411133, "learning_rate": 1.301888643610498e-05, "loss": 3.0188, "step": 56300 }, { "epoch": 1.6600441501103753, "grad_norm": 0.02304394729435444, "learning_rate": 1.3002534543373397e-05, "loss": 3.015, "step": 56400 }, { "epoch": 1.6629874908020603, "grad_norm": 0.022705044597387314, "learning_rate": 1.2986182650641813e-05, "loss": 3.0157, "step": 56500 }, { "epoch": 1.6629874908020603, "eval_loss": 3.0145657062530518, "eval_runtime": 657.2064, "eval_samples_per_second": 413.561, "eval_steps_per_second": 12.924, "step": 56500 }, { "epoch": 1.6659308314937453, "grad_norm": 0.0278596431016922, "learning_rate": 1.2969830757910229e-05, "loss": 3.0188, "step": 56600 }, { "epoch": 1.6688741721854305, "grad_norm": 0.026077425107359886, "learning_rate": 1.2953478865178646e-05, "loss": 3.0225, "step": 56700 }, { "epoch": 1.6718175128771156, "grad_norm": 0.03256196528673172, "learning_rate": 1.2937126972447062e-05, "loss": 3.0094, "step": 56800 }, { "epoch": 1.6747608535688006, "grad_norm": 0.0225417148321867, "learning_rate": 1.2920775079715477e-05, "loss": 3.0163, "step": 56900 }, { "epoch": 1.6777041942604858, "grad_norm": 0.022531842812895775, "learning_rate": 1.2904423186983895e-05, "loss": 3.0244, "step": 57000 }, { "epoch": 1.6777041942604858, "eval_loss": 3.015845537185669, "eval_runtime": 651.1304, "eval_samples_per_second": 417.42, "eval_steps_per_second": 13.045, "step": 57000 }, { "epoch": 1.6806475349521706, "grad_norm": 0.021297652274370193, "learning_rate": 1.288807129425231e-05, "loss": 3.0157, "step": 57100 }, { "epoch": 1.6835908756438558, "grad_norm": 0.024818047881126404, "learning_rate": 1.2871719401520728e-05, "loss": 3.0157, "step": 57200 }, { "epoch": 1.6865342163355408, "grad_norm": 0.021963195875287056, "learning_rate": 1.2855367508789143e-05, "loss": 3.015, "step": 57300 }, { "epoch": 1.6894775570272258, "grad_norm": 0.02945674955844879, "learning_rate": 1.283901561605756e-05, "loss": 3.0125, "step": 57400 }, { "epoch": 1.692420897718911, "grad_norm": 0.021940065547823906, "learning_rate": 1.2822663723325976e-05, "loss": 3.0169, "step": 57500 }, { "epoch": 1.692420897718911, "eval_loss": 3.015117645263672, "eval_runtime": 650.6369, "eval_samples_per_second": 417.737, "eval_steps_per_second": 13.055, "step": 57500 }, { "epoch": 1.695364238410596, "grad_norm": 0.026226382702589035, "learning_rate": 1.2806311830594394e-05, "loss": 3.02, "step": 57600 }, { "epoch": 1.698307579102281, "grad_norm": 0.026587575674057007, "learning_rate": 1.2789959937862808e-05, "loss": 3.0138, "step": 57700 }, { "epoch": 1.7012509197939663, "grad_norm": 0.03163275122642517, "learning_rate": 1.2773608045131224e-05, "loss": 3.0163, "step": 57800 }, { "epoch": 1.7041942604856513, "grad_norm": 0.018358003348112106, "learning_rate": 1.2757256152399642e-05, "loss": 3.0169, "step": 57900 }, { "epoch": 1.7071376011773363, "grad_norm": 0.026742225512862206, "learning_rate": 1.2740904259668056e-05, "loss": 3.0169, "step": 58000 }, { "epoch": 1.7071376011773363, "eval_loss": 3.015300989151001, "eval_runtime": 647.8008, "eval_samples_per_second": 419.566, "eval_steps_per_second": 13.112, "step": 58000 }, { "epoch": 1.7100809418690215, "grad_norm": 0.022859064862132072, "learning_rate": 1.2724552366936474e-05, "loss": 3.0119, "step": 58100 }, { "epoch": 1.7130242825607063, "grad_norm": 0.022774986922740936, "learning_rate": 1.270820047420489e-05, "loss": 3.0132, "step": 58200 }, { "epoch": 1.7159676232523915, "grad_norm": 0.017126472666859627, "learning_rate": 1.2691848581473308e-05, "loss": 3.0138, "step": 58300 }, { "epoch": 1.7189109639440765, "grad_norm": 0.022204402834177017, "learning_rate": 1.2675496688741722e-05, "loss": 3.0225, "step": 58400 }, { "epoch": 1.7218543046357615, "grad_norm": 0.024221548810601234, "learning_rate": 1.2659144796010138e-05, "loss": 3.02, "step": 58500 }, { "epoch": 1.7218543046357615, "eval_loss": 3.0148227214813232, "eval_runtime": 654.7218, "eval_samples_per_second": 415.131, "eval_steps_per_second": 12.973, "step": 58500 }, { "epoch": 1.7247976453274467, "grad_norm": 0.018151937052607536, "learning_rate": 1.2642792903278556e-05, "loss": 3.015, "step": 58600 }, { "epoch": 1.7277409860191317, "grad_norm": 0.02599366195499897, "learning_rate": 1.2626441010546972e-05, "loss": 3.0188, "step": 58700 }, { "epoch": 1.7306843267108167, "grad_norm": 0.025381946936249733, "learning_rate": 1.2610089117815388e-05, "loss": 3.015, "step": 58800 }, { "epoch": 1.733627667402502, "grad_norm": 0.024460801854729652, "learning_rate": 1.2593737225083804e-05, "loss": 3.015, "step": 58900 }, { "epoch": 1.7365710080941867, "grad_norm": 0.02724149078130722, "learning_rate": 1.2577385332352222e-05, "loss": 3.0082, "step": 59000 }, { "epoch": 1.7365710080941867, "eval_loss": 3.014829158782959, "eval_runtime": 653.8011, "eval_samples_per_second": 415.715, "eval_steps_per_second": 12.992, "step": 59000 }, { "epoch": 1.739514348785872, "grad_norm": 0.0220405925065279, "learning_rate": 1.2561033439620638e-05, "loss": 3.0213, "step": 59100 }, { "epoch": 1.7424576894775572, "grad_norm": 0.02122543565928936, "learning_rate": 1.2544681546889052e-05, "loss": 3.0094, "step": 59200 }, { "epoch": 1.745401030169242, "grad_norm": 0.03133600950241089, "learning_rate": 1.252832965415747e-05, "loss": 3.0188, "step": 59300 }, { "epoch": 1.7483443708609272, "grad_norm": 0.02161826193332672, "learning_rate": 1.2511977761425886e-05, "loss": 3.0138, "step": 59400 }, { "epoch": 1.7512877115526122, "grad_norm": 0.01612456515431404, "learning_rate": 1.2495625868694303e-05, "loss": 3.0138, "step": 59500 }, { "epoch": 1.7512877115526122, "eval_loss": 3.0147769451141357, "eval_runtime": 659.5195, "eval_samples_per_second": 412.111, "eval_steps_per_second": 12.879, "step": 59500 }, { "epoch": 1.7542310522442972, "grad_norm": 0.02746340073645115, "learning_rate": 1.2479273975962718e-05, "loss": 3.0188, "step": 59600 }, { "epoch": 1.7571743929359824, "grad_norm": 0.023313304409384727, "learning_rate": 1.2462922083231135e-05, "loss": 3.0107, "step": 59700 }, { "epoch": 1.7601177336276674, "grad_norm": 0.019365200772881508, "learning_rate": 1.2446570190499551e-05, "loss": 3.0119, "step": 59800 }, { "epoch": 1.7630610743193524, "grad_norm": 0.02569250389933586, "learning_rate": 1.2430218297767969e-05, "loss": 3.015, "step": 59900 }, { "epoch": 1.7660044150110377, "grad_norm": 0.023310605436563492, "learning_rate": 1.2413866405036383e-05, "loss": 3.0194, "step": 60000 }, { "epoch": 1.7660044150110377, "eval_loss": 3.0147032737731934, "eval_runtime": 655.2161, "eval_samples_per_second": 414.817, "eval_steps_per_second": 12.964, "step": 60000 }, { "epoch": 1.7689477557027224, "grad_norm": 0.0224477406591177, "learning_rate": 1.2397514512304801e-05, "loss": 3.0144, "step": 60100 }, { "epoch": 1.7718910963944077, "grad_norm": 0.024044036865234375, "learning_rate": 1.2381162619573217e-05, "loss": 3.0182, "step": 60200 }, { "epoch": 1.7748344370860927, "grad_norm": 0.026123568415641785, "learning_rate": 1.2364810726841635e-05, "loss": 3.0213, "step": 60300 }, { "epoch": 1.7777777777777777, "grad_norm": 0.02102663181722164, "learning_rate": 1.2348458834110049e-05, "loss": 3.0144, "step": 60400 }, { "epoch": 1.780721118469463, "grad_norm": 0.018051892518997192, "learning_rate": 1.2332106941378465e-05, "loss": 3.0157, "step": 60500 }, { "epoch": 1.780721118469463, "eval_loss": 3.01468825340271, "eval_runtime": 651.1742, "eval_samples_per_second": 417.392, "eval_steps_per_second": 13.044, "step": 60500 }, { "epoch": 1.783664459161148, "grad_norm": 0.01973419263958931, "learning_rate": 1.2315755048646883e-05, "loss": 3.0132, "step": 60600 }, { "epoch": 1.786607799852833, "grad_norm": 0.02773173898458481, "learning_rate": 1.2299403155915297e-05, "loss": 3.0163, "step": 60700 }, { "epoch": 1.7895511405445181, "grad_norm": 0.018074948340654373, "learning_rate": 1.2283051263183715e-05, "loss": 3.0182, "step": 60800 }, { "epoch": 1.7924944812362031, "grad_norm": 0.023806726559996605, "learning_rate": 1.226669937045213e-05, "loss": 3.015, "step": 60900 }, { "epoch": 1.7954378219278881, "grad_norm": 0.022285934537649155, "learning_rate": 1.2250347477720548e-05, "loss": 3.0088, "step": 61000 }, { "epoch": 1.7954378219278881, "eval_loss": 3.0148134231567383, "eval_runtime": 651.1463, "eval_samples_per_second": 417.41, "eval_steps_per_second": 13.045, "step": 61000 }, { "epoch": 1.7983811626195734, "grad_norm": 0.026986967772245407, "learning_rate": 1.2233995584988963e-05, "loss": 3.015, "step": 61100 }, { "epoch": 1.8013245033112582, "grad_norm": 0.02346801571547985, "learning_rate": 1.2217643692257379e-05, "loss": 3.0144, "step": 61200 }, { "epoch": 1.8042678440029434, "grad_norm": 0.021403305232524872, "learning_rate": 1.2201291799525797e-05, "loss": 3.0113, "step": 61300 }, { "epoch": 1.8072111846946284, "grad_norm": 0.02141435444355011, "learning_rate": 1.2184939906794213e-05, "loss": 3.0182, "step": 61400 }, { "epoch": 1.8101545253863134, "grad_norm": 0.026809660717844963, "learning_rate": 1.2168588014062629e-05, "loss": 3.0194, "step": 61500 }, { "epoch": 1.8101545253863134, "eval_loss": 3.0147249698638916, "eval_runtime": 647.6045, "eval_samples_per_second": 419.693, "eval_steps_per_second": 13.116, "step": 61500 }, { "epoch": 1.8130978660779986, "grad_norm": 0.0247699785977602, "learning_rate": 1.2152236121331045e-05, "loss": 3.02, "step": 61600 }, { "epoch": 1.8160412067696836, "grad_norm": 0.023311134427785873, "learning_rate": 1.2135884228599462e-05, "loss": 3.0125, "step": 61700 }, { "epoch": 1.8189845474613686, "grad_norm": 0.0420430451631546, "learning_rate": 1.2119532335867878e-05, "loss": 3.015, "step": 61800 }, { "epoch": 1.8219278881530538, "grad_norm": 0.026497777551412582, "learning_rate": 1.2103180443136293e-05, "loss": 3.0175, "step": 61900 }, { "epoch": 1.8248712288447386, "grad_norm": 0.02670120820403099, "learning_rate": 1.208682855040471e-05, "loss": 3.0119, "step": 62000 }, { "epoch": 1.8248712288447386, "eval_loss": 3.0145998001098633, "eval_runtime": 652.627, "eval_samples_per_second": 416.463, "eval_steps_per_second": 13.015, "step": 62000 }, { "epoch": 1.8278145695364238, "grad_norm": 0.020710265263915062, "learning_rate": 1.2070476657673126e-05, "loss": 3.0169, "step": 62100 }, { "epoch": 1.8307579102281089, "grad_norm": 0.02028392255306244, "learning_rate": 1.2054124764941544e-05, "loss": 3.0225, "step": 62200 }, { "epoch": 1.8337012509197939, "grad_norm": 0.025742698460817337, "learning_rate": 1.2037772872209958e-05, "loss": 3.0207, "step": 62300 }, { "epoch": 1.836644591611479, "grad_norm": 0.034112364053726196, "learning_rate": 1.2021420979478376e-05, "loss": 3.0169, "step": 62400 }, { "epoch": 1.839587932303164, "grad_norm": 0.024309339001774788, "learning_rate": 1.2005069086746792e-05, "loss": 3.0125, "step": 62500 }, { "epoch": 1.839587932303164, "eval_loss": 3.017049789428711, "eval_runtime": 649.1833, "eval_samples_per_second": 418.672, "eval_steps_per_second": 13.084, "step": 62500 }, { "epoch": 1.842531272994849, "grad_norm": 0.024736078456044197, "learning_rate": 1.198871719401521e-05, "loss": 3.0188, "step": 62600 }, { "epoch": 1.8454746136865343, "grad_norm": 0.020415807142853737, "learning_rate": 1.1972365301283624e-05, "loss": 3.0157, "step": 62700 }, { "epoch": 1.8484179543782193, "grad_norm": 0.023319318890571594, "learning_rate": 1.195601340855204e-05, "loss": 3.0182, "step": 62800 }, { "epoch": 1.8513612950699043, "grad_norm": 0.01736253686249256, "learning_rate": 1.1939661515820458e-05, "loss": 3.01, "step": 62900 }, { "epoch": 1.8543046357615895, "grad_norm": 0.01918075419962406, "learning_rate": 1.1923309623088874e-05, "loss": 3.0138, "step": 63000 }, { "epoch": 1.8543046357615895, "eval_loss": 3.014754056930542, "eval_runtime": 652.4575, "eval_samples_per_second": 416.571, "eval_steps_per_second": 13.018, "step": 63000 }, { "epoch": 1.8572479764532743, "grad_norm": 0.028959564864635468, "learning_rate": 1.190695773035729e-05, "loss": 3.0094, "step": 63100 }, { "epoch": 1.8601913171449596, "grad_norm": 0.021600542590022087, "learning_rate": 1.1890605837625706e-05, "loss": 3.0157, "step": 63200 }, { "epoch": 1.8631346578366446, "grad_norm": 0.02049010805785656, "learning_rate": 1.1874253944894124e-05, "loss": 3.02, "step": 63300 }, { "epoch": 1.8660779985283296, "grad_norm": 0.025215979665517807, "learning_rate": 1.185790205216254e-05, "loss": 3.0094, "step": 63400 }, { "epoch": 1.8690213392200148, "grad_norm": 0.029572071507573128, "learning_rate": 1.1841550159430954e-05, "loss": 3.0182, "step": 63500 }, { "epoch": 1.8690213392200148, "eval_loss": 3.014533281326294, "eval_runtime": 647.4833, "eval_samples_per_second": 419.771, "eval_steps_per_second": 13.118, "step": 63500 }, { "epoch": 1.8719646799116998, "grad_norm": 0.01876496709883213, "learning_rate": 1.1825198266699372e-05, "loss": 3.0157, "step": 63600 }, { "epoch": 1.8749080206033848, "grad_norm": 0.024910956621170044, "learning_rate": 1.1808846373967788e-05, "loss": 3.0138, "step": 63700 }, { "epoch": 1.87785136129507, "grad_norm": 0.018276214599609375, "learning_rate": 1.1792494481236204e-05, "loss": 3.0125, "step": 63800 }, { "epoch": 1.8807947019867548, "grad_norm": 0.020034488290548325, "learning_rate": 1.177614258850462e-05, "loss": 3.015, "step": 63900 }, { "epoch": 1.88373804267844, "grad_norm": 0.02025388740003109, "learning_rate": 1.1759790695773037e-05, "loss": 3.0075, "step": 64000 }, { "epoch": 1.88373804267844, "eval_loss": 3.0144081115722656, "eval_runtime": 642.7162, "eval_samples_per_second": 422.885, "eval_steps_per_second": 13.216, "step": 64000 }, { "epoch": 1.8866813833701253, "grad_norm": 0.023673560470342636, "learning_rate": 1.1743438803041453e-05, "loss": 3.0157, "step": 64100 }, { "epoch": 1.88962472406181, "grad_norm": 0.01916837878525257, "learning_rate": 1.1727086910309868e-05, "loss": 3.0088, "step": 64200 }, { "epoch": 1.8925680647534953, "grad_norm": 0.018505992367863655, "learning_rate": 1.1710735017578285e-05, "loss": 3.0225, "step": 64300 }, { "epoch": 1.8955114054451803, "grad_norm": 0.01704845391213894, "learning_rate": 1.1694383124846701e-05, "loss": 3.0175, "step": 64400 }, { "epoch": 1.8984547461368653, "grad_norm": 0.028741225600242615, "learning_rate": 1.1678031232115119e-05, "loss": 3.0232, "step": 64500 }, { "epoch": 1.8984547461368653, "eval_loss": 3.0178887844085693, "eval_runtime": 648.9846, "eval_samples_per_second": 418.8, "eval_steps_per_second": 13.088, "step": 64500 }, { "epoch": 1.9013980868285505, "grad_norm": 0.018123416230082512, "learning_rate": 1.1661679339383533e-05, "loss": 3.0257, "step": 64600 }, { "epoch": 1.9043414275202355, "grad_norm": 0.023796789348125458, "learning_rate": 1.1645327446651951e-05, "loss": 3.0175, "step": 64700 }, { "epoch": 1.9072847682119205, "grad_norm": 0.017267512157559395, "learning_rate": 1.1628975553920367e-05, "loss": 3.0188, "step": 64800 }, { "epoch": 1.9102281089036057, "grad_norm": 0.01945037581026554, "learning_rate": 1.1612623661188785e-05, "loss": 3.0125, "step": 64900 }, { "epoch": 1.9131714495952905, "grad_norm": 0.020120374858379364, "learning_rate": 1.1596271768457199e-05, "loss": 3.0225, "step": 65000 }, { "epoch": 1.9131714495952905, "eval_loss": 3.0170493125915527, "eval_runtime": 636.8498, "eval_samples_per_second": 426.78, "eval_steps_per_second": 13.338, "step": 65000 }, { "epoch": 1.9161147902869757, "grad_norm": 0.021748850122094154, "learning_rate": 1.1580083394652933e-05, "loss": 3.02, "step": 65100 }, { "epoch": 1.9190581309786607, "grad_norm": 0.025073816999793053, "learning_rate": 1.1563731501921349e-05, "loss": 3.0213, "step": 65200 }, { "epoch": 1.9220014716703457, "grad_norm": 0.020357929170131683, "learning_rate": 1.1547379609189763e-05, "loss": 3.0113, "step": 65300 }, { "epoch": 1.924944812362031, "grad_norm": 0.01657239720225334, "learning_rate": 1.153102771645818e-05, "loss": 3.0182, "step": 65400 }, { "epoch": 1.927888153053716, "grad_norm": 0.017468733713030815, "learning_rate": 1.1514675823726597e-05, "loss": 3.0232, "step": 65500 }, { "epoch": 1.927888153053716, "eval_loss": 3.0168654918670654, "eval_runtime": 631.807, "eval_samples_per_second": 430.187, "eval_steps_per_second": 13.444, "step": 65500 }, { "epoch": 1.930831493745401, "grad_norm": 0.018981963396072388, "learning_rate": 1.1498323930995015e-05, "loss": 3.0225, "step": 65600 }, { "epoch": 1.9337748344370862, "grad_norm": 0.02063673920929432, "learning_rate": 1.1481972038263429e-05, "loss": 3.0181, "step": 65700 }, { "epoch": 1.9367181751287712, "grad_norm": 0.01954454928636551, "learning_rate": 1.1465620145531847e-05, "loss": 3.0181, "step": 65800 }, { "epoch": 1.9396615158204562, "grad_norm": 0.021739047020673752, "learning_rate": 1.1449268252800263e-05, "loss": 3.0194, "step": 65900 }, { "epoch": 1.9426048565121414, "grad_norm": 0.01561459619551897, "learning_rate": 1.143291636006868e-05, "loss": 3.0175, "step": 66000 }, { "epoch": 1.9426048565121414, "eval_loss": 3.0167696475982666, "eval_runtime": 638.7494, "eval_samples_per_second": 425.511, "eval_steps_per_second": 13.298, "step": 66000 }, { "epoch": 1.9455481972038262, "grad_norm": 0.017708543688058853, "learning_rate": 1.1416564467337095e-05, "loss": 3.0181, "step": 66100 }, { "epoch": 1.9484915378955114, "grad_norm": 0.01939385198056698, "learning_rate": 1.140021257460551e-05, "loss": 3.0157, "step": 66200 }, { "epoch": 1.9514348785871964, "grad_norm": 0.021550865843892097, "learning_rate": 1.1383860681873928e-05, "loss": 3.0169, "step": 66300 }, { "epoch": 1.9543782192788814, "grad_norm": 0.01408279500901699, "learning_rate": 1.1367508789142346e-05, "loss": 3.0181, "step": 66400 }, { "epoch": 1.9573215599705667, "grad_norm": 0.015906205400824547, "learning_rate": 1.135115689641076e-05, "loss": 3.0138, "step": 66500 }, { "epoch": 1.9573215599705667, "eval_loss": 3.0152392387390137, "eval_runtime": 638.2504, "eval_samples_per_second": 425.844, "eval_steps_per_second": 13.308, "step": 66500 }, { "epoch": 1.9602649006622517, "grad_norm": 0.019635720178484917, "learning_rate": 1.1334805003679176e-05, "loss": 3.0175, "step": 66600 }, { "epoch": 1.9632082413539367, "grad_norm": 0.02460162527859211, "learning_rate": 1.1318453110947594e-05, "loss": 3.0156, "step": 66700 }, { "epoch": 1.966151582045622, "grad_norm": 0.019855447113513947, "learning_rate": 1.1302101218216008e-05, "loss": 3.0106, "step": 66800 }, { "epoch": 1.9690949227373067, "grad_norm": 0.018803175538778305, "learning_rate": 1.1285749325484426e-05, "loss": 3.01, "step": 66900 }, { "epoch": 1.972038263428992, "grad_norm": 0.015566165558993816, "learning_rate": 1.1269397432752842e-05, "loss": 3.0175, "step": 67000 }, { "epoch": 1.972038263428992, "eval_loss": 3.014082670211792, "eval_runtime": 645.6422, "eval_samples_per_second": 420.968, "eval_steps_per_second": 13.156, "step": 67000 }, { "epoch": 1.974981604120677, "grad_norm": 0.01584579609334469, "learning_rate": 1.1253209058948572e-05, "loss": 3.0144, "step": 67100 }, { "epoch": 1.977924944812362, "grad_norm": 0.027248069643974304, "learning_rate": 1.123685716621699e-05, "loss": 3.0131, "step": 67200 }, { "epoch": 1.9808682855040471, "grad_norm": 0.025776633992791176, "learning_rate": 1.1220505273485406e-05, "loss": 3.0113, "step": 67300 }, { "epoch": 1.9838116261957321, "grad_norm": 0.018177872523665428, "learning_rate": 1.1204153380753824e-05, "loss": 3.0113, "step": 67400 }, { "epoch": 1.9867549668874172, "grad_norm": 0.016431670635938644, "learning_rate": 1.1187801488022238e-05, "loss": 3.0125, "step": 67500 }, { "epoch": 1.9867549668874172, "eval_loss": 3.0139944553375244, "eval_runtime": 642.7036, "eval_samples_per_second": 422.893, "eval_steps_per_second": 13.216, "step": 67500 }, { "epoch": 1.9896983075791024, "grad_norm": 0.02130896784365177, "learning_rate": 1.1171449595290656e-05, "loss": 3.0119, "step": 67600 }, { "epoch": 1.9926416482707874, "grad_norm": 0.01546257920563221, "learning_rate": 1.1155097702559072e-05, "loss": 3.02, "step": 67700 }, { "epoch": 1.9955849889624724, "grad_norm": 0.023275157436728477, "learning_rate": 1.113874580982749e-05, "loss": 3.0125, "step": 67800 }, { "epoch": 1.9985283296541576, "grad_norm": 0.015336192212998867, "learning_rate": 1.1122393917095904e-05, "loss": 3.01, "step": 67900 }, { "epoch": 2.0014716703458424, "grad_norm": 0.025023488327860832, "learning_rate": 1.1106042024364322e-05, "loss": 3.0156, "step": 68000 }, { "epoch": 2.0014716703458424, "eval_loss": 3.0139219760894775, "eval_runtime": 642.9749, "eval_samples_per_second": 422.715, "eval_steps_per_second": 13.21, "step": 68000 }, { "epoch": 2.0044150110375276, "grad_norm": 0.011586506851017475, "learning_rate": 1.1089690131632738e-05, "loss": 3.0131, "step": 68100 }, { "epoch": 2.007358351729213, "grad_norm": 0.017889559268951416, "learning_rate": 1.1073338238901155e-05, "loss": 3.015, "step": 68200 }, { "epoch": 2.0103016924208976, "grad_norm": 0.023214193060994148, "learning_rate": 1.105698634616957e-05, "loss": 3.0169, "step": 68300 }, { "epoch": 2.013245033112583, "grad_norm": 0.01742887683212757, "learning_rate": 1.1040634453437986e-05, "loss": 3.0169, "step": 68400 }, { "epoch": 2.0161883738042676, "grad_norm": 0.020295754075050354, "learning_rate": 1.1024282560706403e-05, "loss": 3.0119, "step": 68500 }, { "epoch": 2.0161883738042676, "eval_loss": 3.01391339302063, "eval_runtime": 639.198, "eval_samples_per_second": 425.213, "eval_steps_per_second": 13.289, "step": 68500 }, { "epoch": 2.019131714495953, "grad_norm": 0.01707482524216175, "learning_rate": 1.100793066797482e-05, "loss": 3.0138, "step": 68600 }, { "epoch": 2.022075055187638, "grad_norm": 0.01827945001423359, "learning_rate": 1.0991578775243235e-05, "loss": 3.0138, "step": 68700 }, { "epoch": 2.025018395879323, "grad_norm": 0.02108182944357395, "learning_rate": 1.0975226882511651e-05, "loss": 3.0163, "step": 68800 }, { "epoch": 2.027961736571008, "grad_norm": 0.018957845866680145, "learning_rate": 1.0958874989780069e-05, "loss": 3.0188, "step": 68900 }, { "epoch": 2.0309050772626933, "grad_norm": 0.022794615477323532, "learning_rate": 1.0942523097048485e-05, "loss": 3.0188, "step": 69000 }, { "epoch": 2.0309050772626933, "eval_loss": 3.0139060020446777, "eval_runtime": 642.9922, "eval_samples_per_second": 422.703, "eval_steps_per_second": 13.21, "step": 69000 }, { "epoch": 2.033848417954378, "grad_norm": 0.015071883797645569, "learning_rate": 1.09261712043169e-05, "loss": 3.01, "step": 69100 }, { "epoch": 2.0367917586460633, "grad_norm": 0.013901627622544765, "learning_rate": 1.0909819311585317e-05, "loss": 3.015, "step": 69200 }, { "epoch": 2.0397350993377485, "grad_norm": 0.020578749477863312, "learning_rate": 1.0893630937781051e-05, "loss": 3.0175, "step": 69300 }, { "epoch": 2.0426784400294333, "grad_norm": 0.02017301507294178, "learning_rate": 1.0877279045049465e-05, "loss": 3.0144, "step": 69400 }, { "epoch": 2.0456217807211186, "grad_norm": 0.01854505017399788, "learning_rate": 1.0860927152317881e-05, "loss": 3.0188, "step": 69500 }, { "epoch": 2.0456217807211186, "eval_loss": 3.013876438140869, "eval_runtime": 639.0822, "eval_samples_per_second": 425.29, "eval_steps_per_second": 13.291, "step": 69500 }, { "epoch": 2.0485651214128033, "grad_norm": 0.015308015048503876, "learning_rate": 1.0844575259586299e-05, "loss": 3.0119, "step": 69600 }, { "epoch": 2.0515084621044886, "grad_norm": 0.018516451120376587, "learning_rate": 1.0828223366854715e-05, "loss": 3.0131, "step": 69700 }, { "epoch": 2.054451802796174, "grad_norm": 0.015313582494854927, "learning_rate": 1.0811871474123131e-05, "loss": 3.0131, "step": 69800 }, { "epoch": 2.0573951434878586, "grad_norm": 0.02003653533756733, "learning_rate": 1.0795519581391547e-05, "loss": 3.0144, "step": 69900 }, { "epoch": 2.060338484179544, "grad_norm": 0.012848121114075184, "learning_rate": 1.0779167688659965e-05, "loss": 3.0144, "step": 70000 }, { "epoch": 2.060338484179544, "eval_loss": 3.01387619972229, "eval_runtime": 638.7562, "eval_samples_per_second": 425.507, "eval_steps_per_second": 13.298, "step": 70000 }, { "epoch": 2.063281824871229, "grad_norm": 0.015932172536849976, "learning_rate": 1.0762815795928379e-05, "loss": 3.0163, "step": 70100 }, { "epoch": 2.066225165562914, "grad_norm": 0.018866583704948425, "learning_rate": 1.0746463903196795e-05, "loss": 3.0069, "step": 70200 }, { "epoch": 2.069168506254599, "grad_norm": 0.02176310494542122, "learning_rate": 1.0730112010465213e-05, "loss": 3.0213, "step": 70300 }, { "epoch": 2.0721118469462843, "grad_norm": 0.016189567744731903, "learning_rate": 1.0713760117733629e-05, "loss": 3.0188, "step": 70400 }, { "epoch": 2.075055187637969, "grad_norm": 0.012160216458141804, "learning_rate": 1.0697408225002045e-05, "loss": 3.0131, "step": 70500 }, { "epoch": 2.075055187637969, "eval_loss": 3.0107858180999756, "eval_runtime": 639.4549, "eval_samples_per_second": 425.042, "eval_steps_per_second": 13.283, "step": 70500 }, { "epoch": 2.0779985283296543, "grad_norm": 0.018697241321206093, "learning_rate": 1.068105633227046e-05, "loss": 3.0131, "step": 70600 }, { "epoch": 2.080941869021339, "grad_norm": 0.016789721325039864, "learning_rate": 1.0664704439538878e-05, "loss": 3.0094, "step": 70700 }, { "epoch": 2.0838852097130243, "grad_norm": 0.01461260486394167, "learning_rate": 1.0648352546807294e-05, "loss": 3.0131, "step": 70800 }, { "epoch": 2.0868285504047095, "grad_norm": 0.016812961548566818, "learning_rate": 1.0632000654075709e-05, "loss": 3.0119, "step": 70900 }, { "epoch": 2.0897718910963943, "grad_norm": 0.019109327346086502, "learning_rate": 1.0615648761344126e-05, "loss": 3.0106, "step": 71000 }, { "epoch": 2.0897718910963943, "eval_loss": 3.011734962463379, "eval_runtime": 633.6075, "eval_samples_per_second": 428.964, "eval_steps_per_second": 13.406, "step": 71000 }, { "epoch": 2.0927152317880795, "grad_norm": 0.017822248861193657, "learning_rate": 1.0599296868612542e-05, "loss": 3.015, "step": 71100 }, { "epoch": 2.0956585724797647, "grad_norm": 0.018988199532032013, "learning_rate": 1.058294497588096e-05, "loss": 3.0106, "step": 71200 }, { "epoch": 2.0986019131714495, "grad_norm": 0.018966784700751305, "learning_rate": 1.0566593083149374e-05, "loss": 3.0106, "step": 71300 }, { "epoch": 2.1015452538631347, "grad_norm": 0.01005831640213728, "learning_rate": 1.0550241190417792e-05, "loss": 3.0113, "step": 71400 }, { "epoch": 2.1044885945548195, "grad_norm": 0.01784975454211235, "learning_rate": 1.0533889297686208e-05, "loss": 3.01, "step": 71500 }, { "epoch": 2.1044885945548195, "eval_loss": 3.0117275714874268, "eval_runtime": 636.9098, "eval_samples_per_second": 426.74, "eval_steps_per_second": 13.336, "step": 71500 }, { "epoch": 2.1074319352465047, "grad_norm": 0.020945832133293152, "learning_rate": 1.0517537404954626e-05, "loss": 3.01, "step": 71600 }, { "epoch": 2.11037527593819, "grad_norm": 0.015002132393419743, "learning_rate": 1.050118551222304e-05, "loss": 3.0138, "step": 71700 }, { "epoch": 2.1133186166298747, "grad_norm": 0.01465610321611166, "learning_rate": 1.0484833619491456e-05, "loss": 3.0088, "step": 71800 }, { "epoch": 2.11626195732156, "grad_norm": 0.019877178594470024, "learning_rate": 1.0468481726759874e-05, "loss": 3.0106, "step": 71900 }, { "epoch": 2.119205298013245, "grad_norm": 0.022199036553502083, "learning_rate": 1.045212983402829e-05, "loss": 3.0069, "step": 72000 }, { "epoch": 2.119205298013245, "eval_loss": 3.0110578536987305, "eval_runtime": 641.2228, "eval_samples_per_second": 423.87, "eval_steps_per_second": 13.247, "step": 72000 }, { "epoch": 2.12214863870493, "grad_norm": 0.01941896788775921, "learning_rate": 1.0435777941296706e-05, "loss": 3.0056, "step": 72100 }, { "epoch": 2.125091979396615, "grad_norm": 0.013523824512958527, "learning_rate": 1.0419426048565122e-05, "loss": 3.0156, "step": 72200 }, { "epoch": 2.1280353200883004, "grad_norm": 0.013491061516106129, "learning_rate": 1.040307415583354e-05, "loss": 3.0094, "step": 72300 }, { "epoch": 2.130978660779985, "grad_norm": 0.014269677922129631, "learning_rate": 1.0386722263101956e-05, "loss": 3.0081, "step": 72400 }, { "epoch": 2.1339220014716704, "grad_norm": 0.013196039013564587, "learning_rate": 1.037037037037037e-05, "loss": 3.0125, "step": 72500 }, { "epoch": 2.1339220014716704, "eval_loss": 3.0112051963806152, "eval_runtime": 642.1256, "eval_samples_per_second": 423.274, "eval_steps_per_second": 13.228, "step": 72500 }, { "epoch": 2.136865342163355, "grad_norm": 0.012134133838117123, "learning_rate": 1.0354018477638788e-05, "loss": 3.0125, "step": 72600 }, { "epoch": 2.1398086828550404, "grad_norm": 0.021895255893468857, "learning_rate": 1.0337666584907204e-05, "loss": 3.0144, "step": 72700 }, { "epoch": 2.1427520235467257, "grad_norm": 0.022650396451354027, "learning_rate": 1.032131469217562e-05, "loss": 3.0156, "step": 72800 }, { "epoch": 2.1456953642384105, "grad_norm": 0.023670652881264687, "learning_rate": 1.0304962799444036e-05, "loss": 3.0094, "step": 72900 }, { "epoch": 2.1486387049300957, "grad_norm": 0.014287452213466167, "learning_rate": 1.0288610906712453e-05, "loss": 3.0075, "step": 73000 }, { "epoch": 2.1486387049300957, "eval_loss": 3.0112051963806152, "eval_runtime": 625.4494, "eval_samples_per_second": 434.56, "eval_steps_per_second": 13.581, "step": 73000 }, { "epoch": 2.151582045621781, "grad_norm": 0.01823999732732773, "learning_rate": 1.027225901398087e-05, "loss": 3.0119, "step": 73100 }, { "epoch": 2.1545253863134657, "grad_norm": 0.01523875817656517, "learning_rate": 1.0255907121249284e-05, "loss": 3.0088, "step": 73200 }, { "epoch": 2.157468727005151, "grad_norm": 0.016394559293985367, "learning_rate": 1.0239718747445017e-05, "loss": 3.0119, "step": 73300 }, { "epoch": 2.1604120676968357, "grad_norm": 0.021515628322958946, "learning_rate": 1.0223366854713435e-05, "loss": 3.0131, "step": 73400 }, { "epoch": 2.163355408388521, "grad_norm": 0.015119379386305809, "learning_rate": 1.020701496198185e-05, "loss": 3.0094, "step": 73500 }, { "epoch": 2.163355408388521, "eval_loss": 3.0110063552856445, "eval_runtime": 619.3411, "eval_samples_per_second": 438.845, "eval_steps_per_second": 13.715, "step": 73500 }, { "epoch": 2.166298749080206, "grad_norm": 0.02541854791343212, "learning_rate": 1.0190663069250265e-05, "loss": 3.0063, "step": 73600 }, { "epoch": 2.169242089771891, "grad_norm": 0.014420312829315662, "learning_rate": 1.0174311176518683e-05, "loss": 3.0138, "step": 73700 }, { "epoch": 2.172185430463576, "grad_norm": 0.015578224323689938, "learning_rate": 1.0158122802714414e-05, "loss": 3.0094, "step": 73800 }, { "epoch": 2.1751287711552614, "grad_norm": 0.016129247844219208, "learning_rate": 1.0141770909982831e-05, "loss": 3.0144, "step": 73900 }, { "epoch": 2.178072111846946, "grad_norm": 0.016215290874242783, "learning_rate": 1.0125419017251247e-05, "loss": 3.0081, "step": 74000 }, { "epoch": 2.178072111846946, "eval_loss": 3.0108814239501953, "eval_runtime": 632.0161, "eval_samples_per_second": 430.044, "eval_steps_per_second": 13.44, "step": 74000 }, { "epoch": 2.1810154525386314, "grad_norm": 0.012344066984951496, "learning_rate": 1.0109067124519665e-05, "loss": 3.0138, "step": 74100 }, { "epoch": 2.1839587932303166, "grad_norm": 0.010854271240532398, "learning_rate": 1.009271523178808e-05, "loss": 3.0144, "step": 74200 }, { "epoch": 2.1869021339220014, "grad_norm": 0.016823800280690193, "learning_rate": 1.0076363339056497e-05, "loss": 3.0094, "step": 74300 }, { "epoch": 2.1898454746136866, "grad_norm": 0.012931378558278084, "learning_rate": 1.0060011446324913e-05, "loss": 3.0106, "step": 74400 }, { "epoch": 2.1927888153053714, "grad_norm": 0.011403587646782398, "learning_rate": 1.004365955359333e-05, "loss": 3.01, "step": 74500 }, { "epoch": 2.1927888153053714, "eval_loss": 3.011021137237549, "eval_runtime": 624.4749, "eval_samples_per_second": 435.238, "eval_steps_per_second": 13.602, "step": 74500 }, { "epoch": 2.1957321559970566, "grad_norm": 0.014979198575019836, "learning_rate": 1.0027307660861745e-05, "loss": 3.0088, "step": 74600 }, { "epoch": 2.198675496688742, "grad_norm": 0.026431478559970856, "learning_rate": 1.0010955768130161e-05, "loss": 3.0081, "step": 74700 }, { "epoch": 2.2016188373804266, "grad_norm": 0.020403901115059853, "learning_rate": 9.994603875398579e-06, "loss": 3.0094, "step": 74800 }, { "epoch": 2.204562178072112, "grad_norm": 0.014719990082085133, "learning_rate": 9.978251982666995e-06, "loss": 3.01, "step": 74900 }, { "epoch": 2.207505518763797, "grad_norm": 0.013472214341163635, "learning_rate": 9.96190008993541e-06, "loss": 3.0181, "step": 75000 }, { "epoch": 2.207505518763797, "eval_loss": 3.010822057723999, "eval_runtime": 626.9809, "eval_samples_per_second": 433.498, "eval_steps_per_second": 13.547, "step": 75000 }, { "epoch": 2.210448859455482, "grad_norm": 0.019549807533621788, "learning_rate": 9.945548197203827e-06, "loss": 3.0088, "step": 75100 }, { "epoch": 2.213392200147167, "grad_norm": 0.013459124602377415, "learning_rate": 9.929196304472244e-06, "loss": 3.0144, "step": 75200 }, { "epoch": 2.216335540838852, "grad_norm": 0.015060096979141235, "learning_rate": 9.91284441174066e-06, "loss": 3.0131, "step": 75300 }, { "epoch": 2.219278881530537, "grad_norm": 0.017027568072080612, "learning_rate": 9.896492519009076e-06, "loss": 3.01, "step": 75400 }, { "epoch": 2.2222222222222223, "grad_norm": 0.014108316972851753, "learning_rate": 9.880140626277492e-06, "loss": 3.0125, "step": 75500 }, { "epoch": 2.2222222222222223, "eval_loss": 3.0112345218658447, "eval_runtime": 625.8898, "eval_samples_per_second": 434.254, "eval_steps_per_second": 13.571, "step": 75500 }, { "epoch": 2.225165562913907, "grad_norm": 0.01825530454516411, "learning_rate": 9.863788733545909e-06, "loss": 3.0131, "step": 75600 }, { "epoch": 2.2281089036055923, "grad_norm": 0.013419277966022491, "learning_rate": 9.847436840814325e-06, "loss": 3.0125, "step": 75700 }, { "epoch": 2.2310522442972776, "grad_norm": 0.019161123782396317, "learning_rate": 9.83108494808274e-06, "loss": 3.01, "step": 75800 }, { "epoch": 2.2339955849889623, "grad_norm": 0.011380353942513466, "learning_rate": 9.814733055351158e-06, "loss": 3.01, "step": 75900 }, { "epoch": 2.2369389256806476, "grad_norm": 0.013692360371351242, "learning_rate": 9.798381162619574e-06, "loss": 3.0175, "step": 76000 }, { "epoch": 2.2369389256806476, "eval_loss": 3.0112414360046387, "eval_runtime": 633.6088, "eval_samples_per_second": 428.963, "eval_steps_per_second": 13.406, "step": 76000 }, { "epoch": 2.239882266372333, "grad_norm": 0.017813341692090034, "learning_rate": 9.78202926988799e-06, "loss": 3.0094, "step": 76100 }, { "epoch": 2.2428256070640176, "grad_norm": 0.01540736760944128, "learning_rate": 9.765677377156406e-06, "loss": 3.015, "step": 76200 }, { "epoch": 2.245768947755703, "grad_norm": 0.013645162805914879, "learning_rate": 9.749325484424822e-06, "loss": 3.0075, "step": 76300 }, { "epoch": 2.248712288447388, "grad_norm": 0.013211456127464771, "learning_rate": 9.732973591693238e-06, "loss": 3.0125, "step": 76400 }, { "epoch": 2.251655629139073, "grad_norm": 0.011174165643751621, "learning_rate": 9.716621698961656e-06, "loss": 3.0131, "step": 76500 }, { "epoch": 2.251655629139073, "eval_loss": 3.0108726024627686, "eval_runtime": 641.3776, "eval_samples_per_second": 423.767, "eval_steps_per_second": 13.243, "step": 76500 }, { "epoch": 2.254598969830758, "grad_norm": 0.019326943904161453, "learning_rate": 9.700269806230072e-06, "loss": 3.0175, "step": 76600 }, { "epoch": 2.257542310522443, "grad_norm": 0.019195320084691048, "learning_rate": 9.683917913498488e-06, "loss": 3.0063, "step": 76700 }, { "epoch": 2.260485651214128, "grad_norm": 0.019731836393475533, "learning_rate": 9.66772953969422e-06, "loss": 3.0113, "step": 76800 }, { "epoch": 2.2634289919058133, "grad_norm": 0.015995986759662628, "learning_rate": 9.651377646962636e-06, "loss": 3.0106, "step": 76900 }, { "epoch": 2.266372332597498, "grad_norm": 0.01430495921522379, "learning_rate": 9.635025754231054e-06, "loss": 3.0106, "step": 77000 }, { "epoch": 2.266372332597498, "eval_loss": 3.010873556137085, "eval_runtime": 638.6578, "eval_samples_per_second": 425.572, "eval_steps_per_second": 13.3, "step": 77000 }, { "epoch": 2.2693156732891833, "grad_norm": 0.018823616206645966, "learning_rate": 9.61867386149947e-06, "loss": 3.0125, "step": 77100 }, { "epoch": 2.272259013980868, "grad_norm": 0.017121613025665283, "learning_rate": 9.602321968767886e-06, "loss": 3.0163, "step": 77200 }, { "epoch": 2.2752023546725533, "grad_norm": 0.015796372666954994, "learning_rate": 9.585970076036302e-06, "loss": 3.0081, "step": 77300 }, { "epoch": 2.2781456953642385, "grad_norm": 0.01426434051245451, "learning_rate": 9.569618183304718e-06, "loss": 3.0131, "step": 77400 }, { "epoch": 2.2810890360559233, "grad_norm": 0.01874231919646263, "learning_rate": 9.553266290573134e-06, "loss": 3.0119, "step": 77500 }, { "epoch": 2.2810890360559233, "eval_loss": 3.0107483863830566, "eval_runtime": 643.0608, "eval_samples_per_second": 422.658, "eval_steps_per_second": 13.209, "step": 77500 }, { "epoch": 2.2840323767476085, "grad_norm": 0.012653839774429798, "learning_rate": 9.536914397841552e-06, "loss": 3.015, "step": 77600 }, { "epoch": 2.2869757174392937, "grad_norm": 0.011629996821284294, "learning_rate": 9.520562505109968e-06, "loss": 3.0125, "step": 77700 }, { "epoch": 2.2899190581309785, "grad_norm": 0.011719335801899433, "learning_rate": 9.504210612378384e-06, "loss": 3.0094, "step": 77800 }, { "epoch": 2.2928623988226637, "grad_norm": 0.016326269134879112, "learning_rate": 9.4878587196468e-06, "loss": 3.01, "step": 77900 }, { "epoch": 2.295805739514349, "grad_norm": 0.011857389472424984, "learning_rate": 9.471506826915217e-06, "loss": 3.0125, "step": 78000 }, { "epoch": 2.295805739514349, "eval_loss": 3.0106513500213623, "eval_runtime": 624.1895, "eval_samples_per_second": 435.437, "eval_steps_per_second": 13.608, "step": 78000 }, { "epoch": 2.2987490802060337, "grad_norm": 0.02019723318517208, "learning_rate": 9.455154934183632e-06, "loss": 3.0113, "step": 78100 }, { "epoch": 2.301692420897719, "grad_norm": 0.011368060484528542, "learning_rate": 9.43880304145205e-06, "loss": 3.01, "step": 78200 }, { "epoch": 2.304635761589404, "grad_norm": 0.018846547231078148, "learning_rate": 9.422451148720465e-06, "loss": 3.0119, "step": 78300 }, { "epoch": 2.307579102281089, "grad_norm": 0.010928289033472538, "learning_rate": 9.406099255988881e-06, "loss": 3.0131, "step": 78400 }, { "epoch": 2.310522442972774, "grad_norm": 0.009076746180653572, "learning_rate": 9.389747363257297e-06, "loss": 3.0106, "step": 78500 }, { "epoch": 2.310522442972774, "eval_loss": 3.0108940601348877, "eval_runtime": 643.8296, "eval_samples_per_second": 422.154, "eval_steps_per_second": 13.193, "step": 78500 }, { "epoch": 2.313465783664459, "grad_norm": 0.016543183475732803, "learning_rate": 9.373395470525715e-06, "loss": 3.0063, "step": 78600 }, { "epoch": 2.316409124356144, "grad_norm": 0.014919297769665718, "learning_rate": 9.357043577794131e-06, "loss": 3.0113, "step": 78700 }, { "epoch": 2.3193524650478294, "grad_norm": 0.010815060697495937, "learning_rate": 9.340691685062547e-06, "loss": 3.01, "step": 78800 }, { "epoch": 2.322295805739514, "grad_norm": 0.01541946455836296, "learning_rate": 9.324339792330963e-06, "loss": 3.0131, "step": 78900 }, { "epoch": 2.3252391464311994, "grad_norm": 0.011210919357836246, "learning_rate": 9.307987899599379e-06, "loss": 3.0088, "step": 79000 }, { "epoch": 2.3252391464311994, "eval_loss": 3.011805534362793, "eval_runtime": 652.0801, "eval_samples_per_second": 416.812, "eval_steps_per_second": 13.026, "step": 79000 }, { "epoch": 2.3281824871228842, "grad_norm": 0.013247012160718441, "learning_rate": 9.291636006867795e-06, "loss": 3.0088, "step": 79100 }, { "epoch": 2.3311258278145695, "grad_norm": 0.016304271295666695, "learning_rate": 9.275284114136211e-06, "loss": 3.0106, "step": 79200 }, { "epoch": 2.3340691685062547, "grad_norm": 0.016633635386824608, "learning_rate": 9.258932221404629e-06, "loss": 3.0081, "step": 79300 }, { "epoch": 2.3370125091979395, "grad_norm": 0.014432031661272049, "learning_rate": 9.242580328673045e-06, "loss": 3.0144, "step": 79400 }, { "epoch": 2.3399558498896247, "grad_norm": 0.010466613806784153, "learning_rate": 9.22622843594146e-06, "loss": 3.0138, "step": 79500 }, { "epoch": 2.3399558498896247, "eval_loss": 3.0106730461120605, "eval_runtime": 654.188, "eval_samples_per_second": 415.469, "eval_steps_per_second": 12.984, "step": 79500 }, { "epoch": 2.34289919058131, "grad_norm": 0.024127831682562828, "learning_rate": 9.209876543209877e-06, "loss": 3.01, "step": 79600 }, { "epoch": 2.3458425312729947, "grad_norm": 0.021622799336910248, "learning_rate": 9.193524650478295e-06, "loss": 3.01, "step": 79700 }, { "epoch": 2.34878587196468, "grad_norm": 0.012483078055083752, "learning_rate": 9.177172757746709e-06, "loss": 3.0144, "step": 79800 }, { "epoch": 2.351729212656365, "grad_norm": 0.015172810293734074, "learning_rate": 9.160820865015127e-06, "loss": 3.01, "step": 79900 }, { "epoch": 2.35467255334805, "grad_norm": 0.014833550900220871, "learning_rate": 9.144468972283543e-06, "loss": 3.0125, "step": 80000 }, { "epoch": 2.35467255334805, "eval_loss": 3.010429620742798, "eval_runtime": 670.7645, "eval_samples_per_second": 405.202, "eval_steps_per_second": 12.663, "step": 80000 }, { "epoch": 2.357615894039735, "grad_norm": 0.016069484874606133, "learning_rate": 9.128117079551959e-06, "loss": 3.005, "step": 80100 }, { "epoch": 2.3605592347314204, "grad_norm": 0.01306169480085373, "learning_rate": 9.111765186820375e-06, "loss": 3.0106, "step": 80200 }, { "epoch": 2.363502575423105, "grad_norm": 0.012455873191356659, "learning_rate": 9.095413294088792e-06, "loss": 3.0094, "step": 80300 }, { "epoch": 2.3664459161147904, "grad_norm": 0.014523331075906754, "learning_rate": 9.079061401357208e-06, "loss": 3.0131, "step": 80400 }, { "epoch": 2.369389256806475, "grad_norm": 0.010422768071293831, "learning_rate": 9.062709508625624e-06, "loss": 3.0125, "step": 80500 }, { "epoch": 2.369389256806475, "eval_loss": 3.010429620742798, "eval_runtime": 663.905, "eval_samples_per_second": 409.388, "eval_steps_per_second": 12.794, "step": 80500 }, { "epoch": 2.3723325974981604, "grad_norm": 0.01585693284869194, "learning_rate": 9.04635761589404e-06, "loss": 3.0106, "step": 80600 }, { "epoch": 2.3752759381898456, "grad_norm": 0.015781618654727936, "learning_rate": 9.030005723162456e-06, "loss": 3.01, "step": 80700 }, { "epoch": 2.3782192788815304, "grad_norm": 0.025128550827503204, "learning_rate": 9.01381734935819e-06, "loss": 3.0119, "step": 80800 }, { "epoch": 2.3811626195732156, "grad_norm": 0.016455015167593956, "learning_rate": 8.997465456626604e-06, "loss": 3.0088, "step": 80900 }, { "epoch": 2.384105960264901, "grad_norm": 0.012502489611506462, "learning_rate": 8.981113563895022e-06, "loss": 3.0113, "step": 81000 }, { "epoch": 2.384105960264901, "eval_loss": 3.0103044509887695, "eval_runtime": 674.4055, "eval_samples_per_second": 403.014, "eval_steps_per_second": 12.595, "step": 81000 }, { "epoch": 2.3870493009565856, "grad_norm": 0.01613890938460827, "learning_rate": 8.964761671163438e-06, "loss": 3.0094, "step": 81100 }, { "epoch": 2.389992641648271, "grad_norm": 0.010742935352027416, "learning_rate": 8.948409778431854e-06, "loss": 3.0094, "step": 81200 }, { "epoch": 2.3929359823399556, "grad_norm": 0.009545009583234787, "learning_rate": 8.93205788570027e-06, "loss": 3.0119, "step": 81300 }, { "epoch": 2.395879323031641, "grad_norm": 0.010394435375928879, "learning_rate": 8.915869511896002e-06, "loss": 3.0094, "step": 81400 }, { "epoch": 2.398822663723326, "grad_norm": 0.013553887605667114, "learning_rate": 8.89951761916442e-06, "loss": 3.0088, "step": 81500 }, { "epoch": 2.398822663723326, "eval_loss": 3.0103044509887695, "eval_runtime": 678.9192, "eval_samples_per_second": 400.335, "eval_steps_per_second": 12.511, "step": 81500 }, { "epoch": 2.401766004415011, "grad_norm": 0.01598992757499218, "learning_rate": 8.883165726432836e-06, "loss": 3.0106, "step": 81600 }, { "epoch": 2.404709345106696, "grad_norm": 0.016409505158662796, "learning_rate": 8.866813833701252e-06, "loss": 3.0088, "step": 81700 }, { "epoch": 2.4076526857983813, "grad_norm": 0.011591075919568539, "learning_rate": 8.850461940969668e-06, "loss": 3.005, "step": 81800 }, { "epoch": 2.410596026490066, "grad_norm": 0.012571400962769985, "learning_rate": 8.834110048238086e-06, "loss": 3.0113, "step": 81900 }, { "epoch": 2.4135393671817513, "grad_norm": 0.018546104431152344, "learning_rate": 8.8177581555065e-06, "loss": 3.0138, "step": 82000 }, { "epoch": 2.4135393671817513, "eval_loss": 3.010333776473999, "eval_runtime": 680.0532, "eval_samples_per_second": 399.667, "eval_steps_per_second": 12.49, "step": 82000 }, { "epoch": 2.4164827078734366, "grad_norm": 0.018130965530872345, "learning_rate": 8.801406262774916e-06, "loss": 3.0106, "step": 82100 }, { "epoch": 2.4194260485651213, "grad_norm": 0.015340627171099186, "learning_rate": 8.785054370043334e-06, "loss": 3.0094, "step": 82200 }, { "epoch": 2.4223693892568066, "grad_norm": 0.013468703255057335, "learning_rate": 8.76870247731175e-06, "loss": 3.0069, "step": 82300 }, { "epoch": 2.425312729948492, "grad_norm": 0.014740215614438057, "learning_rate": 8.752350584580166e-06, "loss": 3.0106, "step": 82400 }, { "epoch": 2.4282560706401766, "grad_norm": 0.011708319187164307, "learning_rate": 8.735998691848582e-06, "loss": 3.0106, "step": 82500 }, { "epoch": 2.4282560706401766, "eval_loss": 3.0103559494018555, "eval_runtime": 681.9525, "eval_samples_per_second": 398.554, "eval_steps_per_second": 12.455, "step": 82500 }, { "epoch": 2.431199411331862, "grad_norm": 0.017695287242531776, "learning_rate": 8.719646799117e-06, "loss": 3.0156, "step": 82600 }, { "epoch": 2.4341427520235466, "grad_norm": 0.014858684502542019, "learning_rate": 8.703294906385414e-06, "loss": 3.0138, "step": 82700 }, { "epoch": 2.437086092715232, "grad_norm": 0.013320837169885635, "learning_rate": 8.686943013653831e-06, "loss": 3.0113, "step": 82800 }, { "epoch": 2.440029433406917, "grad_norm": 0.010718763805925846, "learning_rate": 8.670591120922247e-06, "loss": 3.01, "step": 82900 }, { "epoch": 2.442972774098602, "grad_norm": 0.01910427212715149, "learning_rate": 8.654239228190663e-06, "loss": 3.0138, "step": 83000 }, { "epoch": 2.442972774098602, "eval_loss": 3.0103559494018555, "eval_runtime": 676.8925, "eval_samples_per_second": 401.533, "eval_steps_per_second": 12.549, "step": 83000 }, { "epoch": 2.445916114790287, "grad_norm": 0.017445407807826996, "learning_rate": 8.63788733545908e-06, "loss": 3.0194, "step": 83100 }, { "epoch": 2.448859455481972, "grad_norm": 0.015620074234902859, "learning_rate": 8.621535442727497e-06, "loss": 3.0075, "step": 83200 }, { "epoch": 2.451802796173657, "grad_norm": 0.014419025741517544, "learning_rate": 8.605183549995913e-06, "loss": 3.0088, "step": 83300 }, { "epoch": 2.4547461368653423, "grad_norm": 0.008925139904022217, "learning_rate": 8.588831657264329e-06, "loss": 3.0081, "step": 83400 }, { "epoch": 2.457689477557027, "grad_norm": 0.018620852380990982, "learning_rate": 8.572479764532745e-06, "loss": 3.0138, "step": 83500 }, { "epoch": 2.457689477557027, "eval_loss": 3.0103559494018555, "eval_runtime": 684.1712, "eval_samples_per_second": 397.262, "eval_steps_per_second": 12.415, "step": 83500 }, { "epoch": 2.4606328182487123, "grad_norm": 0.014130588620901108, "learning_rate": 8.556127871801163e-06, "loss": 3.0081, "step": 83600 }, { "epoch": 2.4635761589403975, "grad_norm": 0.013623060658574104, "learning_rate": 8.539775979069577e-06, "loss": 3.0163, "step": 83700 }, { "epoch": 2.4665194996320823, "grad_norm": 0.013883478008210659, "learning_rate": 8.523424086337995e-06, "loss": 3.0113, "step": 83800 }, { "epoch": 2.4694628403237675, "grad_norm": 0.017998341470956802, "learning_rate": 8.507072193606411e-06, "loss": 3.0063, "step": 83900 }, { "epoch": 2.4724061810154527, "grad_norm": 0.010962032712996006, "learning_rate": 8.490720300874827e-06, "loss": 3.0144, "step": 84000 }, { "epoch": 2.4724061810154527, "eval_loss": 3.0103485584259033, "eval_runtime": 688.3667, "eval_samples_per_second": 394.84, "eval_steps_per_second": 12.339, "step": 84000 }, { "epoch": 2.4753495217071375, "grad_norm": 0.015388740226626396, "learning_rate": 8.474368408143243e-06, "loss": 3.0088, "step": 84100 }, { "epoch": 2.4782928623988227, "grad_norm": 0.013463915325701237, "learning_rate": 8.45801651541166e-06, "loss": 3.0144, "step": 84200 }, { "epoch": 2.481236203090508, "grad_norm": 0.013166294433176517, "learning_rate": 8.441664622680077e-06, "loss": 3.0131, "step": 84300 }, { "epoch": 2.4841795437821927, "grad_norm": 0.01483384519815445, "learning_rate": 8.425312729948493e-06, "loss": 3.0094, "step": 84400 }, { "epoch": 2.487122884473878, "grad_norm": 0.014145870693027973, "learning_rate": 8.408960837216909e-06, "loss": 3.015, "step": 84500 }, { "epoch": 2.487122884473878, "eval_loss": 3.010282039642334, "eval_runtime": 686.5949, "eval_samples_per_second": 395.859, "eval_steps_per_second": 12.371, "step": 84500 }, { "epoch": 2.4900662251655628, "grad_norm": 0.014559496194124222, "learning_rate": 8.392608944485325e-06, "loss": 3.0106, "step": 84600 }, { "epoch": 2.493009565857248, "grad_norm": 0.013291534967720509, "learning_rate": 8.37625705175374e-06, "loss": 3.0119, "step": 84700 }, { "epoch": 2.495952906548933, "grad_norm": 0.017346816137433052, "learning_rate": 8.360068677949473e-06, "loss": 3.0125, "step": 84800 }, { "epoch": 2.498896247240618, "grad_norm": 0.024396775290369987, "learning_rate": 8.34371678521789e-06, "loss": 3.0125, "step": 84900 }, { "epoch": 2.501839587932303, "grad_norm": 0.01017684768885374, "learning_rate": 8.327364892486306e-06, "loss": 3.015, "step": 85000 }, { "epoch": 2.501839587932303, "eval_loss": 3.011319398880005, "eval_runtime": 696.2345, "eval_samples_per_second": 390.379, "eval_steps_per_second": 12.2, "step": 85000 }, { "epoch": 2.504782928623988, "grad_norm": 0.011449419893324375, "learning_rate": 8.311012999754722e-06, "loss": 3.0156, "step": 85100 }, { "epoch": 2.507726269315673, "grad_norm": 0.01031495165079832, "learning_rate": 8.294661107023138e-06, "loss": 3.0194, "step": 85200 }, { "epoch": 2.5106696100073584, "grad_norm": 0.01558097917586565, "learning_rate": 8.278309214291554e-06, "loss": 3.0119, "step": 85300 }, { "epoch": 2.5136129506990432, "grad_norm": 0.012004377320408821, "learning_rate": 8.261957321559972e-06, "loss": 3.0075, "step": 85400 }, { "epoch": 2.5165562913907285, "grad_norm": 0.015103288926184177, "learning_rate": 8.245605428828386e-06, "loss": 3.0156, "step": 85500 }, { "epoch": 2.5165562913907285, "eval_loss": 3.0102527141571045, "eval_runtime": 686.697, "eval_samples_per_second": 395.8, "eval_steps_per_second": 12.369, "step": 85500 }, { "epoch": 2.5194996320824137, "grad_norm": 0.01634531468153, "learning_rate": 8.229253536096804e-06, "loss": 3.0131, "step": 85600 }, { "epoch": 2.5224429727740985, "grad_norm": 0.00930737890303135, "learning_rate": 8.21290164336522e-06, "loss": 3.0044, "step": 85700 }, { "epoch": 2.5253863134657837, "grad_norm": 0.014242563396692276, "learning_rate": 8.196549750633636e-06, "loss": 3.0075, "step": 85800 }, { "epoch": 2.528329654157469, "grad_norm": 0.011730772443115711, "learning_rate": 8.180197857902052e-06, "loss": 3.0113, "step": 85900 }, { "epoch": 2.5312729948491537, "grad_norm": 0.014235168695449829, "learning_rate": 8.16384596517047e-06, "loss": 3.0144, "step": 86000 }, { "epoch": 2.5312729948491537, "eval_loss": 3.0102524757385254, "eval_runtime": 682.6911, "eval_samples_per_second": 398.123, "eval_steps_per_second": 12.442, "step": 86000 }, { "epoch": 2.534216335540839, "grad_norm": 0.017689114436507225, "learning_rate": 8.147657591366202e-06, "loss": 3.0144, "step": 86100 }, { "epoch": 2.537159676232524, "grad_norm": 0.012921694666147232, "learning_rate": 8.131305698634618e-06, "loss": 3.0113, "step": 86200 }, { "epoch": 2.540103016924209, "grad_norm": 0.01138000376522541, "learning_rate": 8.114953805903034e-06, "loss": 3.0163, "step": 86300 }, { "epoch": 2.543046357615894, "grad_norm": 0.012250062078237534, "learning_rate": 8.09860191317145e-06, "loss": 3.0169, "step": 86400 }, { "epoch": 2.5459896983075794, "grad_norm": 0.01626046560704708, "learning_rate": 8.082250020439868e-06, "loss": 3.01, "step": 86500 }, { "epoch": 2.5459896983075794, "eval_loss": 3.0101497173309326, "eval_runtime": 687.8463, "eval_samples_per_second": 395.139, "eval_steps_per_second": 12.349, "step": 86500 }, { "epoch": 2.548933038999264, "grad_norm": 0.009654931724071503, "learning_rate": 8.065898127708282e-06, "loss": 3.01, "step": 86600 }, { "epoch": 2.5518763796909494, "grad_norm": 0.012496327981352806, "learning_rate": 8.0495462349767e-06, "loss": 3.0113, "step": 86700 }, { "epoch": 2.554819720382634, "grad_norm": 0.011382297612726688, "learning_rate": 8.033194342245116e-06, "loss": 3.0138, "step": 86800 }, { "epoch": 2.5577630610743194, "grad_norm": 0.014212162233889103, "learning_rate": 8.016842449513532e-06, "loss": 3.0113, "step": 86900 }, { "epoch": 2.560706401766004, "grad_norm": 0.01954035460948944, "learning_rate": 8.000490556781948e-06, "loss": 3.0113, "step": 87000 }, { "epoch": 2.560706401766004, "eval_loss": 3.0101425647735596, "eval_runtime": 681.6332, "eval_samples_per_second": 398.741, "eval_steps_per_second": 12.461, "step": 87000 }, { "epoch": 2.5636497424576894, "grad_norm": 0.011419149115681648, "learning_rate": 7.984138664050365e-06, "loss": 3.0081, "step": 87100 }, { "epoch": 2.5665930831493746, "grad_norm": 0.017085328698158264, "learning_rate": 7.967786771318781e-06, "loss": 3.0069, "step": 87200 }, { "epoch": 2.5695364238410594, "grad_norm": 0.011345711536705494, "learning_rate": 7.951434878587197e-06, "loss": 3.0069, "step": 87300 }, { "epoch": 2.5724797645327446, "grad_norm": 0.011617106385529041, "learning_rate": 7.935082985855613e-06, "loss": 3.0088, "step": 87400 }, { "epoch": 2.57542310522443, "grad_norm": 0.008239028044044971, "learning_rate": 7.91873109312403e-06, "loss": 3.0094, "step": 87500 }, { "epoch": 2.57542310522443, "eval_loss": 3.0101420879364014, "eval_runtime": 695.3788, "eval_samples_per_second": 390.859, "eval_steps_per_second": 12.215, "step": 87500 }, { "epoch": 2.5783664459161146, "grad_norm": 0.008109316229820251, "learning_rate": 7.902379200392445e-06, "loss": 3.0088, "step": 87600 }, { "epoch": 2.5813097866078, "grad_norm": 0.011825203895568848, "learning_rate": 7.886027307660863e-06, "loss": 3.0119, "step": 87700 }, { "epoch": 2.584253127299485, "grad_norm": 0.00959658995270729, "learning_rate": 7.86967541492928e-06, "loss": 3.01, "step": 87800 }, { "epoch": 2.58719646799117, "grad_norm": 0.014189758338034153, "learning_rate": 7.853323522197695e-06, "loss": 3.0119, "step": 87900 }, { "epoch": 2.590139808682855, "grad_norm": 0.014036625623703003, "learning_rate": 7.836971629466111e-06, "loss": 3.0125, "step": 88000 }, { "epoch": 2.590139808682855, "eval_loss": 3.0101423263549805, "eval_runtime": 687.0847, "eval_samples_per_second": 395.577, "eval_steps_per_second": 12.362, "step": 88000 }, { "epoch": 2.5930831493745403, "grad_norm": 0.011337111704051495, "learning_rate": 7.820619736734527e-06, "loss": 3.0088, "step": 88100 }, { "epoch": 2.596026490066225, "grad_norm": 0.013065935112535954, "learning_rate": 7.804267844002943e-06, "loss": 3.0138, "step": 88200 }, { "epoch": 2.5989698307579103, "grad_norm": 0.015677237883210182, "learning_rate": 7.78791595127136e-06, "loss": 3.01, "step": 88300 }, { "epoch": 2.6019131714495956, "grad_norm": 0.015179571695625782, "learning_rate": 7.771564058539777e-06, "loss": 3.0119, "step": 88400 }, { "epoch": 2.6048565121412803, "grad_norm": 0.014148608781397343, "learning_rate": 7.755212165808193e-06, "loss": 3.0119, "step": 88500 }, { "epoch": 2.6048565121412803, "eval_loss": 3.010193347930908, "eval_runtime": 688.7325, "eval_samples_per_second": 394.631, "eval_steps_per_second": 12.333, "step": 88500 }, { "epoch": 2.6077998528329656, "grad_norm": 0.011794851161539555, "learning_rate": 7.738860273076609e-06, "loss": 3.0063, "step": 88600 }, { "epoch": 2.6107431935246503, "grad_norm": 0.017960846424102783, "learning_rate": 7.722508380345025e-06, "loss": 3.01, "step": 88700 }, { "epoch": 2.6136865342163356, "grad_norm": 0.011980012990534306, "learning_rate": 7.706156487613443e-06, "loss": 3.0125, "step": 88800 }, { "epoch": 2.6166298749080203, "grad_norm": 0.020208753645420074, "learning_rate": 7.689804594881857e-06, "loss": 3.0175, "step": 88900 }, { "epoch": 2.6195732155997056, "grad_norm": 0.01218309998512268, "learning_rate": 7.673452702150275e-06, "loss": 3.0113, "step": 89000 }, { "epoch": 2.6195732155997056, "eval_loss": 3.0118050575256348, "eval_runtime": 682.049, "eval_samples_per_second": 398.498, "eval_steps_per_second": 12.454, "step": 89000 }, { "epoch": 2.622516556291391, "grad_norm": 0.011643076315522194, "learning_rate": 7.65710080941869e-06, "loss": 3.02, "step": 89100 }, { "epoch": 2.6254598969830756, "grad_norm": 0.013384327292442322, "learning_rate": 7.640748916687107e-06, "loss": 3.0194, "step": 89200 }, { "epoch": 2.628403237674761, "grad_norm": 0.008897839114069939, "learning_rate": 7.624397023955524e-06, "loss": 3.0088, "step": 89300 }, { "epoch": 2.631346578366446, "grad_norm": 0.01655294932425022, "learning_rate": 7.6080451312239404e-06, "loss": 3.0144, "step": 89400 }, { "epoch": 2.634289919058131, "grad_norm": 0.012432574294507504, "learning_rate": 7.591693238492356e-06, "loss": 3.0125, "step": 89500 }, { "epoch": 2.634289919058131, "eval_loss": 3.0104501247406006, "eval_runtime": 698.4012, "eval_samples_per_second": 389.167, "eval_steps_per_second": 12.162, "step": 89500 }, { "epoch": 2.637233259749816, "grad_norm": 0.015816742554306984, "learning_rate": 7.5753413457607725e-06, "loss": 3.0144, "step": 89600 }, { "epoch": 2.6401766004415013, "grad_norm": 0.019270578399300575, "learning_rate": 7.5589894530291885e-06, "loss": 3.0163, "step": 89700 }, { "epoch": 2.643119941133186, "grad_norm": 0.007118350360542536, "learning_rate": 7.542637560297605e-06, "loss": 3.0106, "step": 89800 }, { "epoch": 2.6460632818248713, "grad_norm": 0.00850965641438961, "learning_rate": 7.526285667566021e-06, "loss": 3.0131, "step": 89900 }, { "epoch": 2.6490066225165565, "grad_norm": 0.013318294659256935, "learning_rate": 7.509933774834438e-06, "loss": 3.0119, "step": 90000 }, { "epoch": 2.6490066225165565, "eval_loss": 3.0100905895233154, "eval_runtime": 694.5387, "eval_samples_per_second": 391.332, "eval_steps_per_second": 12.23, "step": 90000 }, { "epoch": 2.6519499632082413, "grad_norm": 0.01271519809961319, "learning_rate": 7.493581882102854e-06, "loss": 3.0175, "step": 90100 }, { "epoch": 2.6548933038999265, "grad_norm": 0.009013323113322258, "learning_rate": 7.477229989371271e-06, "loss": 3.0106, "step": 90200 }, { "epoch": 2.6578366445916117, "grad_norm": 0.006833299994468689, "learning_rate": 7.460878096639686e-06, "loss": 3.0138, "step": 90300 }, { "epoch": 2.6607799852832965, "grad_norm": 0.009776272810995579, "learning_rate": 7.444526203908103e-06, "loss": 3.0069, "step": 90400 }, { "epoch": 2.6637233259749817, "grad_norm": 0.008224808610975742, "learning_rate": 7.428174311176519e-06, "loss": 3.0138, "step": 90500 }, { "epoch": 2.6637233259749817, "eval_loss": 3.010009527206421, "eval_runtime": 688.3067, "eval_samples_per_second": 394.875, "eval_steps_per_second": 12.34, "step": 90500 }, { "epoch": 2.6666666666666665, "grad_norm": 0.009688873775303364, "learning_rate": 7.411822418444936e-06, "loss": 3.0044, "step": 90600 }, { "epoch": 2.6696100073583517, "grad_norm": 0.011446884833276272, "learning_rate": 7.395470525713352e-06, "loss": 3.0131, "step": 90700 }, { "epoch": 2.6725533480500365, "grad_norm": 0.015336165204644203, "learning_rate": 7.379118632981769e-06, "loss": 3.01, "step": 90800 }, { "epoch": 2.6754966887417218, "grad_norm": 0.01325420755892992, "learning_rate": 7.362766740250185e-06, "loss": 3.0094, "step": 90900 }, { "epoch": 2.678440029433407, "grad_norm": 0.022510679438710213, "learning_rate": 7.3464148475186e-06, "loss": 3.0094, "step": 91000 }, { "epoch": 2.678440029433407, "eval_loss": 3.0099875926971436, "eval_runtime": 685.9367, "eval_samples_per_second": 396.239, "eval_steps_per_second": 12.383, "step": 91000 }, { "epoch": 2.6813833701250918, "grad_norm": 0.009558782912790775, "learning_rate": 7.330062954787017e-06, "loss": 3.0156, "step": 91100 }, { "epoch": 2.684326710816777, "grad_norm": 0.010735510848462582, "learning_rate": 7.313711062055433e-06, "loss": 3.01, "step": 91200 }, { "epoch": 2.687270051508462, "grad_norm": 0.010145319625735283, "learning_rate": 7.29735916932385e-06, "loss": 3.01, "step": 91300 }, { "epoch": 2.690213392200147, "grad_norm": 0.012136233039200306, "learning_rate": 7.281007276592266e-06, "loss": 3.01, "step": 91400 }, { "epoch": 2.693156732891832, "grad_norm": 0.009324021637439728, "learning_rate": 7.264655383860683e-06, "loss": 3.0075, "step": 91500 }, { "epoch": 2.693156732891832, "eval_loss": 3.0097742080688477, "eval_runtime": 701.1248, "eval_samples_per_second": 387.656, "eval_steps_per_second": 12.115, "step": 91500 }, { "epoch": 2.6961000735835174, "grad_norm": 0.013936453498899937, "learning_rate": 7.248303491129099e-06, "loss": 3.0125, "step": 91600 }, { "epoch": 2.6990434142752022, "grad_norm": 0.016051864251494408, "learning_rate": 7.2319515983975155e-06, "loss": 3.01, "step": 91700 }, { "epoch": 2.7019867549668874, "grad_norm": 0.013233642093837261, "learning_rate": 7.2157632245932475e-06, "loss": 3.0081, "step": 91800 }, { "epoch": 2.7049300956585727, "grad_norm": 0.016109127551317215, "learning_rate": 7.1994113318616635e-06, "loss": 3.01, "step": 91900 }, { "epoch": 2.7078734363502575, "grad_norm": 0.013135915622115135, "learning_rate": 7.18305943913008e-06, "loss": 3.0169, "step": 92000 }, { "epoch": 2.7078734363502575, "eval_loss": 3.0097362995147705, "eval_runtime": 695.4255, "eval_samples_per_second": 390.833, "eval_steps_per_second": 12.214, "step": 92000 }, { "epoch": 2.7108167770419427, "grad_norm": 0.012263975106179714, "learning_rate": 7.1667075463984955e-06, "loss": 3.01, "step": 92100 }, { "epoch": 2.713760117733628, "grad_norm": 0.014599333517253399, "learning_rate": 7.150355653666912e-06, "loss": 3.0125, "step": 92200 }, { "epoch": 2.7167034584253127, "grad_norm": 0.011498123407363892, "learning_rate": 7.134003760935328e-06, "loss": 3.0131, "step": 92300 }, { "epoch": 2.719646799116998, "grad_norm": 0.013148258440196514, "learning_rate": 7.117651868203745e-06, "loss": 3.0138, "step": 92400 }, { "epoch": 2.7225901398086827, "grad_norm": 0.010333675891160965, "learning_rate": 7.101299975472161e-06, "loss": 3.0156, "step": 92500 }, { "epoch": 2.7225901398086827, "eval_loss": 3.009920120239258, "eval_runtime": 697.5008, "eval_samples_per_second": 389.67, "eval_steps_per_second": 12.178, "step": 92500 }, { "epoch": 2.725533480500368, "grad_norm": 0.00897152628749609, "learning_rate": 7.084948082740578e-06, "loss": 3.0113, "step": 92600 }, { "epoch": 2.7284768211920527, "grad_norm": 0.011553160846233368, "learning_rate": 7.068596190008994e-06, "loss": 3.0106, "step": 92700 }, { "epoch": 2.731420161883738, "grad_norm": 0.014587395824491978, "learning_rate": 7.052244297277411e-06, "loss": 3.0125, "step": 92800 }, { "epoch": 2.734363502575423, "grad_norm": 0.010932709090411663, "learning_rate": 7.035892404545826e-06, "loss": 3.0038, "step": 92900 }, { "epoch": 2.737306843267108, "grad_norm": 0.01337679848074913, "learning_rate": 7.019540511814244e-06, "loss": 3.0088, "step": 93000 }, { "epoch": 2.737306843267108, "eval_loss": 3.0099799633026123, "eval_runtime": 695.0777, "eval_samples_per_second": 391.028, "eval_steps_per_second": 12.22, "step": 93000 }, { "epoch": 2.740250183958793, "grad_norm": 0.013354885391891003, "learning_rate": 7.003188619082659e-06, "loss": 3.0081, "step": 93100 }, { "epoch": 2.7431935246504784, "grad_norm": 0.013445881195366383, "learning_rate": 6.986836726351076e-06, "loss": 3.0119, "step": 93200 }, { "epoch": 2.746136865342163, "grad_norm": 0.020468570291996002, "learning_rate": 6.970484833619492e-06, "loss": 3.0138, "step": 93300 }, { "epoch": 2.7490802060338484, "grad_norm": 0.008735520765185356, "learning_rate": 6.954132940887909e-06, "loss": 3.0131, "step": 93400 }, { "epoch": 2.7520235467255336, "grad_norm": 0.015184330753982067, "learning_rate": 6.937781048156325e-06, "loss": 3.0106, "step": 93500 }, { "epoch": 2.7520235467255336, "eval_loss": 3.0099568367004395, "eval_runtime": 690.9915, "eval_samples_per_second": 393.341, "eval_steps_per_second": 12.292, "step": 93500 }, { "epoch": 2.7549668874172184, "grad_norm": 0.0051637194119393826, "learning_rate": 6.921429155424742e-06, "loss": 3.0081, "step": 93600 }, { "epoch": 2.7579102281089036, "grad_norm": 0.0076196142472326756, "learning_rate": 6.905077262693158e-06, "loss": 3.0056, "step": 93700 }, { "epoch": 2.760853568800589, "grad_norm": 0.01283348724246025, "learning_rate": 6.8887253699615745e-06, "loss": 3.0106, "step": 93800 }, { "epoch": 2.7637969094922736, "grad_norm": 0.011290019378066063, "learning_rate": 6.87237347722999e-06, "loss": 3.0119, "step": 93900 }, { "epoch": 2.766740250183959, "grad_norm": 0.011378777213394642, "learning_rate": 6.856021584498406e-06, "loss": 3.0075, "step": 94000 }, { "epoch": 2.766740250183959, "eval_loss": 3.009906530380249, "eval_runtime": 689.2507, "eval_samples_per_second": 394.334, "eval_steps_per_second": 12.324, "step": 94000 }, { "epoch": 2.769683590875644, "grad_norm": 0.008718795143067837, "learning_rate": 6.8396696917668225e-06, "loss": 3.0119, "step": 94100 }, { "epoch": 2.772626931567329, "grad_norm": 0.011570965871214867, "learning_rate": 6.8233177990352385e-06, "loss": 3.0075, "step": 94200 }, { "epoch": 2.775570272259014, "grad_norm": 0.010929002426564693, "learning_rate": 6.806965906303655e-06, "loss": 3.0094, "step": 94300 }, { "epoch": 2.7785136129506993, "grad_norm": 0.01315237395465374, "learning_rate": 6.790614013572071e-06, "loss": 3.0119, "step": 94400 }, { "epoch": 2.781456953642384, "grad_norm": 0.01032552495598793, "learning_rate": 6.774262120840488e-06, "loss": 3.01, "step": 94500 }, { "epoch": 2.781456953642384, "eval_loss": 3.009920120239258, "eval_runtime": 704.4128, "eval_samples_per_second": 385.846, "eval_steps_per_second": 12.058, "step": 94500 }, { "epoch": 2.7844002943340693, "grad_norm": 0.008803426288068295, "learning_rate": 6.757910228108903e-06, "loss": 3.0106, "step": 94600 }, { "epoch": 2.787343635025754, "grad_norm": 0.011199725791811943, "learning_rate": 6.74155833537732e-06, "loss": 3.0131, "step": 94700 }, { "epoch": 2.7902869757174393, "grad_norm": 0.007883629761636257, "learning_rate": 6.725206442645736e-06, "loss": 3.0094, "step": 94800 }, { "epoch": 2.793230316409124, "grad_norm": 0.007931094616651535, "learning_rate": 6.708854549914153e-06, "loss": 3.0075, "step": 94900 }, { "epoch": 2.7961736571008093, "grad_norm": 0.012498273514211178, "learning_rate": 6.692502657182569e-06, "loss": 3.0119, "step": 95000 }, { "epoch": 2.7961736571008093, "eval_loss": 3.0098466873168945, "eval_runtime": 704.101, "eval_samples_per_second": 386.017, "eval_steps_per_second": 12.064, "step": 95000 }, { "epoch": 2.7991169977924946, "grad_norm": 0.007909555919468403, "learning_rate": 6.676150764450986e-06, "loss": 3.0094, "step": 95100 }, { "epoch": 2.8020603384841793, "grad_norm": 0.011281512677669525, "learning_rate": 6.659798871719402e-06, "loss": 3.0138, "step": 95200 }, { "epoch": 2.8050036791758646, "grad_norm": 0.008072929456830025, "learning_rate": 6.643446978987819e-06, "loss": 3.0094, "step": 95300 }, { "epoch": 2.80794701986755, "grad_norm": 0.011675285175442696, "learning_rate": 6.627095086256234e-06, "loss": 3.0125, "step": 95400 }, { "epoch": 2.8108903605592346, "grad_norm": 0.013871339149773121, "learning_rate": 6.610743193524652e-06, "loss": 3.0081, "step": 95500 }, { "epoch": 2.8108903605592346, "eval_loss": 3.0100011825561523, "eval_runtime": 709.1825, "eval_samples_per_second": 383.251, "eval_steps_per_second": 11.977, "step": 95500 }, { "epoch": 2.81383370125092, "grad_norm": 0.01639448292553425, "learning_rate": 6.594391300793067e-06, "loss": 3.0081, "step": 95600 }, { "epoch": 2.816777041942605, "grad_norm": 0.009214093908667564, "learning_rate": 6.578039408061484e-06, "loss": 3.0088, "step": 95700 }, { "epoch": 2.81972038263429, "grad_norm": 0.019932011142373085, "learning_rate": 6.561851034257216e-06, "loss": 3.0113, "step": 95800 }, { "epoch": 2.822663723325975, "grad_norm": 0.010501363314688206, "learning_rate": 6.545662660452949e-06, "loss": 3.0075, "step": 95900 }, { "epoch": 2.8256070640176603, "grad_norm": 0.016032615676522255, "learning_rate": 6.529310767721364e-06, "loss": 3.0138, "step": 96000 }, { "epoch": 2.8256070640176603, "eval_loss": 3.009714365005493, "eval_runtime": 709.7158, "eval_samples_per_second": 382.963, "eval_steps_per_second": 11.968, "step": 96000 }, { "epoch": 2.828550404709345, "grad_norm": 0.012182539328932762, "learning_rate": 6.512958874989781e-06, "loss": 3.0106, "step": 96100 }, { "epoch": 2.8314937454010303, "grad_norm": 0.013241689652204514, "learning_rate": 6.496606982258197e-06, "loss": 3.01, "step": 96200 }, { "epoch": 2.8344370860927155, "grad_norm": 0.009016048163175583, "learning_rate": 6.4802550895266136e-06, "loss": 3.0119, "step": 96300 }, { "epoch": 2.8373804267844003, "grad_norm": 0.011815876699984074, "learning_rate": 6.46390319679503e-06, "loss": 3.0144, "step": 96400 }, { "epoch": 2.8403237674760855, "grad_norm": 0.01301447581499815, "learning_rate": 6.4475513040634464e-06, "loss": 3.0106, "step": 96500 }, { "epoch": 2.8403237674760855, "eval_loss": 3.00992751121521, "eval_runtime": 707.5785, "eval_samples_per_second": 384.12, "eval_steps_per_second": 12.004, "step": 96500 }, { "epoch": 2.8432671081677703, "grad_norm": 0.006966474000364542, "learning_rate": 6.4311994113318625e-06, "loss": 3.0094, "step": 96600 }, { "epoch": 2.8462104488594555, "grad_norm": 0.010580153204500675, "learning_rate": 6.414847518600279e-06, "loss": 3.0131, "step": 96700 }, { "epoch": 2.8491537895511403, "grad_norm": 0.008194738067686558, "learning_rate": 6.3984956258686945e-06, "loss": 3.0088, "step": 96800 }, { "epoch": 2.8520971302428255, "grad_norm": 0.007076977752149105, "learning_rate": 6.382143733137111e-06, "loss": 3.005, "step": 96900 }, { "epoch": 2.8550404709345107, "grad_norm": 0.00946815311908722, "learning_rate": 6.365791840405527e-06, "loss": 3.0156, "step": 97000 }, { "epoch": 2.8550404709345107, "eval_loss": 3.00992751121521, "eval_runtime": 708.3929, "eval_samples_per_second": 383.678, "eval_steps_per_second": 11.991, "step": 97000 }, { "epoch": 2.8579838116261955, "grad_norm": 0.013334361836314201, "learning_rate": 6.349439947673943e-06, "loss": 3.0094, "step": 97100 }, { "epoch": 2.8609271523178808, "grad_norm": 0.015538984909653664, "learning_rate": 6.33308805494236e-06, "loss": 3.0081, "step": 97200 }, { "epoch": 2.863870493009566, "grad_norm": 0.008151818998157978, "learning_rate": 6.316736162210776e-06, "loss": 3.0113, "step": 97300 }, { "epoch": 2.8668138337012508, "grad_norm": 0.009555498138070107, "learning_rate": 6.300384269479193e-06, "loss": 3.0138, "step": 97400 }, { "epoch": 2.869757174392936, "grad_norm": 0.008365914225578308, "learning_rate": 6.284032376747608e-06, "loss": 3.0119, "step": 97500 }, { "epoch": 2.869757174392936, "eval_loss": 3.0095744132995605, "eval_runtime": 710.6498, "eval_samples_per_second": 382.46, "eval_steps_per_second": 11.952, "step": 97500 }, { "epoch": 2.872700515084621, "grad_norm": 0.015025295317173004, "learning_rate": 6.267680484016025e-06, "loss": 3.0125, "step": 97600 }, { "epoch": 2.875643855776306, "grad_norm": 0.009561455808579922, "learning_rate": 6.251328591284441e-06, "loss": 3.0094, "step": 97700 }, { "epoch": 2.878587196467991, "grad_norm": 0.012126476503908634, "learning_rate": 6.234976698552858e-06, "loss": 3.0119, "step": 97800 }, { "epoch": 2.8815305371596764, "grad_norm": 0.017550857737660408, "learning_rate": 6.21878832474859e-06, "loss": 3.0081, "step": 97900 }, { "epoch": 2.8844738778513612, "grad_norm": 0.011998379603028297, "learning_rate": 6.202436432017006e-06, "loss": 3.0106, "step": 98000 }, { "epoch": 2.8844738778513612, "eval_loss": 3.009596347808838, "eval_runtime": 705.4938, "eval_samples_per_second": 385.255, "eval_steps_per_second": 12.04, "step": 98000 }, { "epoch": 2.8874172185430464, "grad_norm": 0.00900402944535017, "learning_rate": 6.186084539285423e-06, "loss": 3.0081, "step": 98100 }, { "epoch": 2.8903605592347317, "grad_norm": 0.013533372431993484, "learning_rate": 6.169732646553839e-06, "loss": 3.0125, "step": 98200 }, { "epoch": 2.8933038999264165, "grad_norm": 0.005965348798781633, "learning_rate": 6.153380753822256e-06, "loss": 3.0075, "step": 98300 }, { "epoch": 2.8962472406181017, "grad_norm": 0.011019539088010788, "learning_rate": 6.137028861090672e-06, "loss": 3.0119, "step": 98400 }, { "epoch": 2.8991905813097865, "grad_norm": 0.012915964238345623, "learning_rate": 6.120676968359089e-06, "loss": 3.0106, "step": 98500 }, { "epoch": 2.8991905813097865, "eval_loss": 3.009647846221924, "eval_runtime": 704.4726, "eval_samples_per_second": 385.813, "eval_steps_per_second": 12.057, "step": 98500 }, { "epoch": 2.9021339220014717, "grad_norm": 0.011877420358359814, "learning_rate": 6.104325075627504e-06, "loss": 3.0081, "step": 98600 }, { "epoch": 2.9050772626931565, "grad_norm": 0.01704784668982029, "learning_rate": 6.087973182895921e-06, "loss": 3.0094, "step": 98700 }, { "epoch": 2.9080206033848417, "grad_norm": 0.008073452860116959, "learning_rate": 6.071621290164337e-06, "loss": 3.0081, "step": 98800 }, { "epoch": 2.910963944076527, "grad_norm": 0.009032673202455044, "learning_rate": 6.0552693974327535e-06, "loss": 3.0144, "step": 98900 }, { "epoch": 2.9139072847682117, "grad_norm": 0.011180829256772995, "learning_rate": 6.0389175047011695e-06, "loss": 3.0094, "step": 99000 }, { "epoch": 2.9139072847682117, "eval_loss": 3.0090813636779785, "eval_runtime": 709.0944, "eval_samples_per_second": 383.299, "eval_steps_per_second": 11.979, "step": 99000 }, { "epoch": 2.916850625459897, "grad_norm": 0.008762011304497719, "learning_rate": 6.022565611969586e-06, "loss": 3.0094, "step": 99100 }, { "epoch": 2.919793966151582, "grad_norm": 0.011785381473600864, "learning_rate": 6.006213719238002e-06, "loss": 3.0094, "step": 99200 }, { "epoch": 2.922737306843267, "grad_norm": 0.006014193408191204, "learning_rate": 5.989861826506419e-06, "loss": 3.0106, "step": 99300 }, { "epoch": 2.925680647534952, "grad_norm": 0.010867521166801453, "learning_rate": 5.973509933774834e-06, "loss": 3.01, "step": 99400 }, { "epoch": 2.9286239882266374, "grad_norm": 0.009099874645471573, "learning_rate": 5.957158041043252e-06, "loss": 3.0113, "step": 99500 }, { "epoch": 2.9286239882266374, "eval_loss": 3.0090813636779785, "eval_runtime": 702.7525, "eval_samples_per_second": 386.758, "eval_steps_per_second": 12.087, "step": 99500 }, { "epoch": 2.931567328918322, "grad_norm": 0.010941165499389172, "learning_rate": 5.940806148311667e-06, "loss": 3.0106, "step": 99600 }, { "epoch": 2.9345106696100074, "grad_norm": 0.013228082098066807, "learning_rate": 5.924454255580084e-06, "loss": 3.0106, "step": 99700 }, { "epoch": 2.9374540103016926, "grad_norm": 0.016434643417596817, "learning_rate": 5.9081023628485e-06, "loss": 3.0094, "step": 99800 }, { "epoch": 2.9403973509933774, "grad_norm": 0.00843011774122715, "learning_rate": 5.891750470116917e-06, "loss": 3.0081, "step": 99900 }, { "epoch": 2.9433406916850626, "grad_norm": 0.008235963061451912, "learning_rate": 5.875398577385333e-06, "loss": 3.01, "step": 100000 }, { "epoch": 2.9433406916850626, "eval_loss": 3.0090813636779785, "eval_runtime": 702.2025, "eval_samples_per_second": 387.061, "eval_steps_per_second": 12.096, "step": 100000 }, { "epoch": 2.946284032376748, "grad_norm": 0.011761014349758625, "learning_rate": 5.85904668465375e-06, "loss": 3.0119, "step": 100100 }, { "epoch": 2.9492273730684326, "grad_norm": 0.010586312972009182, "learning_rate": 5.842694791922166e-06, "loss": 3.0106, "step": 100200 }, { "epoch": 2.952170713760118, "grad_norm": 0.009779071435332298, "learning_rate": 5.826342899190581e-06, "loss": 3.0113, "step": 100300 }, { "epoch": 2.9551140544518026, "grad_norm": 0.010667026042938232, "learning_rate": 5.809991006458998e-06, "loss": 3.0075, "step": 100400 }, { "epoch": 2.958057395143488, "grad_norm": 0.008737172931432724, "learning_rate": 5.793639113727414e-06, "loss": 3.0094, "step": 100500 }, { "epoch": 2.958057395143488, "eval_loss": 3.009773015975952, "eval_runtime": 691.9316, "eval_samples_per_second": 392.806, "eval_steps_per_second": 12.276, "step": 100500 }, { "epoch": 2.9610007358351726, "grad_norm": 0.008686036802828312, "learning_rate": 5.777287220995831e-06, "loss": 3.0119, "step": 100600 }, { "epoch": 2.963944076526858, "grad_norm": 0.012791481800377369, "learning_rate": 5.760935328264247e-06, "loss": 3.0106, "step": 100700 }, { "epoch": 2.966887417218543, "grad_norm": 0.011309823021292686, "learning_rate": 5.744583435532664e-06, "loss": 3.0088, "step": 100800 }, { "epoch": 2.969830757910228, "grad_norm": 0.005023526027798653, "learning_rate": 5.72823154280108e-06, "loss": 3.015, "step": 100900 }, { "epoch": 2.972774098601913, "grad_norm": 0.01013882551342249, "learning_rate": 5.7118796500694965e-06, "loss": 3.0106, "step": 101000 }, { "epoch": 2.972774098601913, "eval_loss": 3.009611129760742, "eval_runtime": 1335.9889, "eval_samples_per_second": 203.441, "eval_steps_per_second": 6.358, "step": 101000 }, { "epoch": 2.9757174392935983, "grad_norm": 0.010422018356621265, "learning_rate": 5.695527757337912e-06, "loss": 3.0075, "step": 101100 }, { "epoch": 2.978660779985283, "grad_norm": 0.0090619632974267, "learning_rate": 5.6791758646063285e-06, "loss": 3.0188, "step": 101200 }, { "epoch": 2.9816041206769683, "grad_norm": 0.007781198713928461, "learning_rate": 5.6628239718747445e-06, "loss": 3.0088, "step": 101300 }, { "epoch": 2.9845474613686536, "grad_norm": 0.0057840547524392605, "learning_rate": 5.646472079143161e-06, "loss": 3.0081, "step": 101400 }, { "epoch": 2.9874908020603383, "grad_norm": 0.006866885349154472, "learning_rate": 5.630120186411577e-06, "loss": 3.0075, "step": 101500 }, { "epoch": 2.9874908020603383, "eval_loss": 3.0096917152404785, "eval_runtime": 1223.4238, "eval_samples_per_second": 222.159, "eval_steps_per_second": 6.943, "step": 101500 }, { "epoch": 2.9904341427520236, "grad_norm": 0.009056687355041504, "learning_rate": 5.613768293679994e-06, "loss": 3.0119, "step": 101600 }, { "epoch": 2.993377483443709, "grad_norm": 0.008237130008637905, "learning_rate": 5.59741640094841e-06, "loss": 3.01, "step": 101700 }, { "epoch": 2.9963208241353936, "grad_norm": 0.011571983806788921, "learning_rate": 5.581064508216827e-06, "loss": 3.0075, "step": 101800 }, { "epoch": 2.999264164827079, "grad_norm": 0.006202177610248327, "learning_rate": 5.564712615485242e-06, "loss": 3.0094, "step": 101900 }, { "epoch": 3.0022075055187636, "grad_norm": 0.012070335447788239, "learning_rate": 5.548687760608291e-06, "loss": 3.0119, "step": 102000 }, { "epoch": 3.0022075055187636, "eval_loss": 3.009706735610962, "eval_runtime": 666.3021, "eval_samples_per_second": 407.916, "eval_steps_per_second": 12.748, "step": 102000 }, { "epoch": 3.005150846210449, "grad_norm": 0.007269890047609806, "learning_rate": 5.532335867876707e-06, "loss": 3.0113, "step": 102100 }, { "epoch": 3.008094186902134, "grad_norm": 0.013370104134082794, "learning_rate": 5.515983975145124e-06, "loss": 3.0088, "step": 102200 }, { "epoch": 3.011037527593819, "grad_norm": 0.008380554616451263, "learning_rate": 5.499632082413539e-06, "loss": 3.0106, "step": 102300 }, { "epoch": 3.013980868285504, "grad_norm": 0.013190552592277527, "learning_rate": 5.483280189681957e-06, "loss": 3.0113, "step": 102400 }, { "epoch": 3.0169242089771893, "grad_norm": 0.01525513082742691, "learning_rate": 5.466928296950372e-06, "loss": 3.015, "step": 102500 }, { "epoch": 3.0169242089771893, "eval_loss": 3.0096848011016846, "eval_runtime": 720.1514, "eval_samples_per_second": 377.414, "eval_steps_per_second": 11.795, "step": 102500 }, { "epoch": 3.019867549668874, "grad_norm": 0.007331838831305504, "learning_rate": 5.450576404218789e-06, "loss": 3.0088, "step": 102600 }, { "epoch": 3.0228108903605593, "grad_norm": 0.00869076233357191, "learning_rate": 5.434224511487205e-06, "loss": 3.0088, "step": 102700 }, { "epoch": 3.0257542310522445, "grad_norm": 0.010117026045918465, "learning_rate": 5.417872618755622e-06, "loss": 3.0106, "step": 102800 }, { "epoch": 3.0286975717439293, "grad_norm": 0.008123036473989487, "learning_rate": 5.401520726024038e-06, "loss": 3.0113, "step": 102900 }, { "epoch": 3.0316409124356145, "grad_norm": 0.01067271176725626, "learning_rate": 5.385168833292455e-06, "loss": 3.01, "step": 103000 }, { "epoch": 3.0316409124356145, "eval_loss": 3.009390354156494, "eval_runtime": 730.6547, "eval_samples_per_second": 371.988, "eval_steps_per_second": 11.625, "step": 103000 }, { "epoch": 3.0345842531272993, "grad_norm": 0.011704429052770138, "learning_rate": 5.368816940560871e-06, "loss": 3.0113, "step": 103100 }, { "epoch": 3.0375275938189845, "grad_norm": 0.007271961774677038, "learning_rate": 5.3524650478292876e-06, "loss": 3.0125, "step": 103200 }, { "epoch": 3.0404709345106697, "grad_norm": 0.012178131379187107, "learning_rate": 5.33627667402502e-06, "loss": 3.0056, "step": 103300 }, { "epoch": 3.0434142752023545, "grad_norm": 0.009277158416807652, "learning_rate": 5.319924781293435e-06, "loss": 3.01, "step": 103400 }, { "epoch": 3.0463576158940397, "grad_norm": 0.0175321027636528, "learning_rate": 5.303572888561851e-06, "loss": 3.01, "step": 103500 }, { "epoch": 3.0463576158940397, "eval_loss": 3.009390354156494, "eval_runtime": 714.6235, "eval_samples_per_second": 380.333, "eval_steps_per_second": 11.886, "step": 103500 }, { "epoch": 3.049300956585725, "grad_norm": 0.005140234250575304, "learning_rate": 5.287220995830268e-06, "loss": 3.01, "step": 103600 }, { "epoch": 3.0522442972774098, "grad_norm": 0.01453347783535719, "learning_rate": 5.270869103098684e-06, "loss": 3.01, "step": 103700 }, { "epoch": 3.055187637969095, "grad_norm": 0.010958492755889893, "learning_rate": 5.2545172103671005e-06, "loss": 3.0075, "step": 103800 }, { "epoch": 3.05813097866078, "grad_norm": 0.007108242250978947, "learning_rate": 5.2381653176355165e-06, "loss": 3.0063, "step": 103900 }, { "epoch": 3.061074319352465, "grad_norm": 0.006081312894821167, "learning_rate": 5.221813424903933e-06, "loss": 3.015, "step": 104000 }, { "epoch": 3.061074319352465, "eval_loss": 3.0096330642700195, "eval_runtime": 714.1082, "eval_samples_per_second": 380.608, "eval_steps_per_second": 11.895, "step": 104000 }, { "epoch": 3.06401766004415, "grad_norm": 0.007296455092728138, "learning_rate": 5.205461532172349e-06, "loss": 3.0063, "step": 104100 }, { "epoch": 3.066961000735835, "grad_norm": 0.007822861894965172, "learning_rate": 5.189109639440766e-06, "loss": 3.0119, "step": 104200 }, { "epoch": 3.0699043414275202, "grad_norm": 0.005588240921497345, "learning_rate": 5.172757746709181e-06, "loss": 3.0088, "step": 104300 }, { "epoch": 3.0728476821192054, "grad_norm": 0.007386285811662674, "learning_rate": 5.156405853977598e-06, "loss": 3.0113, "step": 104400 }, { "epoch": 3.0757910228108902, "grad_norm": 0.00789538025856018, "learning_rate": 5.140053961246014e-06, "loss": 3.01, "step": 104500 }, { "epoch": 3.0757910228108902, "eval_loss": 3.0094640254974365, "eval_runtime": 720.7895, "eval_samples_per_second": 377.08, "eval_steps_per_second": 11.784, "step": 104500 }, { "epoch": 3.0787343635025755, "grad_norm": 0.006783245597034693, "learning_rate": 5.123702068514431e-06, "loss": 3.0081, "step": 104600 }, { "epoch": 3.0816777041942607, "grad_norm": 0.009039152413606644, "learning_rate": 5.107350175782847e-06, "loss": 3.0094, "step": 104700 }, { "epoch": 3.0846210448859455, "grad_norm": 0.006649684626609087, "learning_rate": 5.090998283051264e-06, "loss": 3.0075, "step": 104800 }, { "epoch": 3.0875643855776307, "grad_norm": 0.011639785021543503, "learning_rate": 5.07464639031968e-06, "loss": 3.0113, "step": 104900 }, { "epoch": 3.0905077262693155, "grad_norm": 0.01296067051589489, "learning_rate": 5.058294497588097e-06, "loss": 3.0131, "step": 105000 }, { "epoch": 3.0905077262693155, "eval_loss": 3.0094637870788574, "eval_runtime": 717.5743, "eval_samples_per_second": 378.769, "eval_steps_per_second": 11.837, "step": 105000 }, { "epoch": 3.0934510669610007, "grad_norm": 0.01049660425633192, "learning_rate": 5.041942604856512e-06, "loss": 3.0131, "step": 105100 }, { "epoch": 3.096394407652686, "grad_norm": 0.014541316777467728, "learning_rate": 5.025590712124929e-06, "loss": 3.0131, "step": 105200 }, { "epoch": 3.0993377483443707, "grad_norm": 0.006947593297809362, "learning_rate": 5.009238819393345e-06, "loss": 3.0075, "step": 105300 }, { "epoch": 3.102281089036056, "grad_norm": 0.006751209497451782, "learning_rate": 4.992886926661762e-06, "loss": 3.0119, "step": 105400 }, { "epoch": 3.105224429727741, "grad_norm": 0.009482125751674175, "learning_rate": 4.976535033930178e-06, "loss": 3.0094, "step": 105500 }, { "epoch": 3.105224429727741, "eval_loss": 3.0091915130615234, "eval_runtime": 705.4567, "eval_samples_per_second": 385.275, "eval_steps_per_second": 12.04, "step": 105500 }, { "epoch": 3.108167770419426, "grad_norm": 0.011227046139538288, "learning_rate": 4.960183141198594e-06, "loss": 3.0069, "step": 105600 }, { "epoch": 3.111111111111111, "grad_norm": 0.008456013165414333, "learning_rate": 4.943831248467011e-06, "loss": 3.0063, "step": 105700 }, { "epoch": 3.1140544518027964, "grad_norm": 0.012018946930766106, "learning_rate": 4.927479355735427e-06, "loss": 3.0094, "step": 105800 }, { "epoch": 3.116997792494481, "grad_norm": 0.009483412839472294, "learning_rate": 4.911127463003843e-06, "loss": 3.01, "step": 105900 }, { "epoch": 3.1199411331861664, "grad_norm": 0.013960530050098896, "learning_rate": 4.8947755702722595e-06, "loss": 3.0113, "step": 106000 }, { "epoch": 3.1199411331861664, "eval_loss": 3.009669542312622, "eval_runtime": 727.0363, "eval_samples_per_second": 373.84, "eval_steps_per_second": 11.683, "step": 106000 }, { "epoch": 3.122884473877851, "grad_norm": 0.009077128954231739, "learning_rate": 4.8784236775406755e-06, "loss": 3.0056, "step": 106100 }, { "epoch": 3.1258278145695364, "grad_norm": 0.008909621275961399, "learning_rate": 4.862071784809092e-06, "loss": 3.01, "step": 106200 }, { "epoch": 3.1287711552612216, "grad_norm": 0.008803825825452805, "learning_rate": 4.845719892077508e-06, "loss": 3.0081, "step": 106300 }, { "epoch": 3.1317144959529064, "grad_norm": 0.009417102672159672, "learning_rate": 4.829367999345924e-06, "loss": 3.0106, "step": 106400 }, { "epoch": 3.1346578366445916, "grad_norm": 0.011447887867689133, "learning_rate": 4.813016106614341e-06, "loss": 3.01, "step": 106500 }, { "epoch": 3.1346578366445916, "eval_loss": 3.009618043899536, "eval_runtime": 738.9371, "eval_samples_per_second": 367.819, "eval_steps_per_second": 11.495, "step": 106500 }, { "epoch": 3.137601177336277, "grad_norm": 0.004400637932121754, "learning_rate": 4.796664213882757e-06, "loss": 3.0069, "step": 106600 }, { "epoch": 3.1405445180279616, "grad_norm": 0.008428971283137798, "learning_rate": 4.780312321151174e-06, "loss": 3.0119, "step": 106700 }, { "epoch": 3.143487858719647, "grad_norm": 0.008337216451764107, "learning_rate": 4.76396042841959e-06, "loss": 3.0081, "step": 106800 }, { "epoch": 3.1464311994113316, "grad_norm": 0.011110854335129261, "learning_rate": 4.747608535688006e-06, "loss": 3.0075, "step": 106900 }, { "epoch": 3.149374540103017, "grad_norm": 0.004748369101434946, "learning_rate": 4.731256642956423e-06, "loss": 3.0081, "step": 107000 }, { "epoch": 3.149374540103017, "eval_loss": 3.009662389755249, "eval_runtime": 713.1389, "eval_samples_per_second": 381.125, "eval_steps_per_second": 11.911, "step": 107000 }, { "epoch": 3.152317880794702, "grad_norm": 0.009798150509595871, "learning_rate": 4.714904750224839e-06, "loss": 3.0075, "step": 107100 }, { "epoch": 3.155261221486387, "grad_norm": 0.010354849509894848, "learning_rate": 4.698552857493255e-06, "loss": 3.0081, "step": 107200 }, { "epoch": 3.158204562178072, "grad_norm": 0.0084195826202631, "learning_rate": 4.682200964761672e-06, "loss": 3.0125, "step": 107300 }, { "epoch": 3.1611479028697573, "grad_norm": 0.008329696953296661, "learning_rate": 4.665849072030088e-06, "loss": 3.0094, "step": 107400 }, { "epoch": 3.164091243561442, "grad_norm": 0.0063116648234426975, "learning_rate": 4.649497179298505e-06, "loss": 3.0094, "step": 107500 }, { "epoch": 3.164091243561442, "eval_loss": 3.0091617107391357, "eval_runtime": 710.678, "eval_samples_per_second": 382.445, "eval_steps_per_second": 11.952, "step": 107500 }, { "epoch": 3.1670345842531273, "grad_norm": 0.006763426586985588, "learning_rate": 4.633145286566921e-06, "loss": 3.0175, "step": 107600 }, { "epoch": 3.1699779249448126, "grad_norm": 0.007907229475677013, "learning_rate": 4.616793393835337e-06, "loss": 3.01, "step": 107700 }, { "epoch": 3.1729212656364973, "grad_norm": 0.013606593012809753, "learning_rate": 4.600441501103753e-06, "loss": 3.0113, "step": 107800 }, { "epoch": 3.1758646063281826, "grad_norm": 0.0110632861033082, "learning_rate": 4.58408960837217e-06, "loss": 3.0094, "step": 107900 }, { "epoch": 3.1788079470198674, "grad_norm": 0.008390252478420734, "learning_rate": 4.567737715640586e-06, "loss": 3.0125, "step": 108000 }, { "epoch": 3.1788079470198674, "eval_loss": 3.0091471672058105, "eval_runtime": 730.307, "eval_samples_per_second": 372.165, "eval_steps_per_second": 11.631, "step": 108000 }, { "epoch": 3.1817512877115526, "grad_norm": 0.010934549383819103, "learning_rate": 4.551385822909002e-06, "loss": 3.0069, "step": 108100 }, { "epoch": 3.184694628403238, "grad_norm": 0.007799511309713125, "learning_rate": 4.5350339301774185e-06, "loss": 3.0119, "step": 108200 }, { "epoch": 3.1876379690949226, "grad_norm": 0.0077744838781654835, "learning_rate": 4.5186820374458345e-06, "loss": 3.0144, "step": 108300 }, { "epoch": 3.190581309786608, "grad_norm": 0.008387045934796333, "learning_rate": 4.5023301447142505e-06, "loss": 3.0075, "step": 108400 }, { "epoch": 3.193524650478293, "grad_norm": 0.0089939059689641, "learning_rate": 4.485978251982667e-06, "loss": 3.0094, "step": 108500 }, { "epoch": 3.193524650478293, "eval_loss": 3.009676933288574, "eval_runtime": 734.5777, "eval_samples_per_second": 370.002, "eval_steps_per_second": 11.563, "step": 108500 }, { "epoch": 3.196467991169978, "grad_norm": 0.01461584959179163, "learning_rate": 4.469626359251083e-06, "loss": 3.0106, "step": 108600 }, { "epoch": 3.199411331861663, "grad_norm": 0.010952042415738106, "learning_rate": 4.4532744665195e-06, "loss": 3.0144, "step": 108700 }, { "epoch": 3.202354672553348, "grad_norm": 0.0057230484671890736, "learning_rate": 4.436922573787916e-06, "loss": 3.0075, "step": 108800 }, { "epoch": 3.205298013245033, "grad_norm": 0.006332908757030964, "learning_rate": 4.420570681056332e-06, "loss": 3.0156, "step": 108900 }, { "epoch": 3.2082413539367183, "grad_norm": 0.005978718865662813, "learning_rate": 4.404218788324749e-06, "loss": 3.0044, "step": 109000 }, { "epoch": 3.2082413539367183, "eval_loss": 3.0094854831695557, "eval_runtime": 711.1601, "eval_samples_per_second": 382.185, "eval_steps_per_second": 11.944, "step": 109000 }, { "epoch": 3.211184694628403, "grad_norm": 0.0072373999282717705, "learning_rate": 4.387866895593165e-06, "loss": 3.01, "step": 109100 }, { "epoch": 3.2141280353200883, "grad_norm": 0.006627545226365328, "learning_rate": 4.371515002861582e-06, "loss": 3.0106, "step": 109200 }, { "epoch": 3.2170713760117735, "grad_norm": 0.009683563373982906, "learning_rate": 4.355326629057314e-06, "loss": 3.0081, "step": 109300 }, { "epoch": 3.2200147167034583, "grad_norm": 0.009509073570370674, "learning_rate": 4.33897473632573e-06, "loss": 3.0069, "step": 109400 }, { "epoch": 3.2229580573951435, "grad_norm": 0.00827412586659193, "learning_rate": 4.322622843594146e-06, "loss": 3.01, "step": 109500 }, { "epoch": 3.2229580573951435, "eval_loss": 3.009603261947632, "eval_runtime": 686.0413, "eval_samples_per_second": 396.179, "eval_steps_per_second": 12.381, "step": 109500 }, { "epoch": 3.2259013980868287, "grad_norm": 0.007418467663228512, "learning_rate": 4.306270950862563e-06, "loss": 3.01, "step": 109600 }, { "epoch": 3.2288447387785135, "grad_norm": 0.01257388200610876, "learning_rate": 4.289919058130979e-06, "loss": 3.0125, "step": 109700 }, { "epoch": 3.2317880794701987, "grad_norm": 0.007958817295730114, "learning_rate": 4.273567165399396e-06, "loss": 3.0069, "step": 109800 }, { "epoch": 3.2347314201618835, "grad_norm": 0.009971094317734241, "learning_rate": 4.257215272667812e-06, "loss": 3.0081, "step": 109900 }, { "epoch": 3.2376747608535688, "grad_norm": 0.011098182760179043, "learning_rate": 4.240863379936228e-06, "loss": 3.0088, "step": 110000 }, { "epoch": 3.2376747608535688, "eval_loss": 3.009706497192383, "eval_runtime": 711.7139, "eval_samples_per_second": 381.888, "eval_steps_per_second": 11.935, "step": 110000 }, { "epoch": 3.240618101545254, "grad_norm": 0.012215577065944672, "learning_rate": 4.224511487204645e-06, "loss": 3.0119, "step": 110100 }, { "epoch": 3.2435614422369388, "grad_norm": 0.006662637460976839, "learning_rate": 4.208159594473061e-06, "loss": 3.0131, "step": 110200 }, { "epoch": 3.246504782928624, "grad_norm": 0.008719694800674915, "learning_rate": 4.1918077017414775e-06, "loss": 3.0119, "step": 110300 }, { "epoch": 3.249448123620309, "grad_norm": 0.009266557171940804, "learning_rate": 4.1754558090098935e-06, "loss": 3.0094, "step": 110400 }, { "epoch": 3.252391464311994, "grad_norm": 0.006458790507167578, "learning_rate": 4.1591039162783095e-06, "loss": 3.0094, "step": 110500 }, { "epoch": 3.252391464311994, "eval_loss": 3.009603261947632, "eval_runtime": 704.5216, "eval_samples_per_second": 385.787, "eval_steps_per_second": 12.056, "step": 110500 }, { "epoch": 3.255334805003679, "grad_norm": 0.008837465196847916, "learning_rate": 4.142752023546726e-06, "loss": 3.0144, "step": 110600 }, { "epoch": 3.258278145695364, "grad_norm": 0.006608805153518915, "learning_rate": 4.126400130815142e-06, "loss": 3.0069, "step": 110700 }, { "epoch": 3.2612214863870492, "grad_norm": 0.005039707757532597, "learning_rate": 4.1100482380835584e-06, "loss": 3.0131, "step": 110800 }, { "epoch": 3.2641648270787345, "grad_norm": 0.00620513828471303, "learning_rate": 4.0936963453519744e-06, "loss": 3.0081, "step": 110900 }, { "epoch": 3.2671081677704192, "grad_norm": 0.009890182875096798, "learning_rate": 4.077344452620391e-06, "loss": 3.01, "step": 111000 }, { "epoch": 3.2671081677704192, "eval_loss": 3.0095739364624023, "eval_runtime": 711.7925, "eval_samples_per_second": 381.846, "eval_steps_per_second": 11.933, "step": 111000 }, { "epoch": 3.2700515084621045, "grad_norm": 0.005919859278947115, "learning_rate": 4.060992559888807e-06, "loss": 3.01, "step": 111100 }, { "epoch": 3.2729948491537897, "grad_norm": 0.010464128106832504, "learning_rate": 4.044640667157223e-06, "loss": 3.01, "step": 111200 }, { "epoch": 3.2759381898454745, "grad_norm": 0.007603708188980818, "learning_rate": 4.02828877442564e-06, "loss": 3.0125, "step": 111300 }, { "epoch": 3.2788815305371597, "grad_norm": 0.004336678888648748, "learning_rate": 4.012100400621372e-06, "loss": 3.0113, "step": 111400 }, { "epoch": 3.281824871228845, "grad_norm": 0.005941751413047314, "learning_rate": 3.995748507889788e-06, "loss": 3.0088, "step": 111500 }, { "epoch": 3.281824871228845, "eval_loss": 3.009544610977173, "eval_runtime": 717.7994, "eval_samples_per_second": 378.65, "eval_steps_per_second": 11.833, "step": 111500 }, { "epoch": 3.2847682119205297, "grad_norm": 0.00981513038277626, "learning_rate": 3.979396615158205e-06, "loss": 3.0131, "step": 111600 }, { "epoch": 3.287711552612215, "grad_norm": 0.008071414195001125, "learning_rate": 3.963044722426621e-06, "loss": 3.0125, "step": 111700 }, { "epoch": 3.2906548933039, "grad_norm": 0.011878515593707561, "learning_rate": 3.946692829695037e-06, "loss": 3.01, "step": 111800 }, { "epoch": 3.293598233995585, "grad_norm": 0.007522930856794119, "learning_rate": 3.930340936963454e-06, "loss": 3.0113, "step": 111900 }, { "epoch": 3.29654157468727, "grad_norm": 0.012534476816654205, "learning_rate": 3.91398904423187e-06, "loss": 3.0044, "step": 112000 }, { "epoch": 3.29654157468727, "eval_loss": 3.009544610977173, "eval_runtime": 820.3926, "eval_samples_per_second": 331.299, "eval_steps_per_second": 10.354, "step": 112000 }, { "epoch": 3.299484915378955, "grad_norm": 0.01274307444691658, "learning_rate": 3.897637151500287e-06, "loss": 3.0144, "step": 112100 }, { "epoch": 3.30242825607064, "grad_norm": 0.006973479874432087, "learning_rate": 3.881285258768703e-06, "loss": 3.0081, "step": 112200 }, { "epoch": 3.3053715967623254, "grad_norm": 0.008529232814908028, "learning_rate": 3.864933366037119e-06, "loss": 3.0106, "step": 112300 }, { "epoch": 3.30831493745401, "grad_norm": 0.009087673388421535, "learning_rate": 3.848581473305536e-06, "loss": 3.0094, "step": 112400 }, { "epoch": 3.3112582781456954, "grad_norm": 0.01095573604106903, "learning_rate": 3.832229580573952e-06, "loss": 3.005, "step": 112500 }, { "epoch": 3.3112582781456954, "eval_loss": 3.0095443725585938, "eval_runtime": 1440.5018, "eval_samples_per_second": 188.681, "eval_steps_per_second": 5.897, "step": 112500 }, { "epoch": 3.31420161883738, "grad_norm": 0.009205692447721958, "learning_rate": 3.815877687842368e-06, "loss": 3.0131, "step": 112600 }, { "epoch": 3.3171449595290654, "grad_norm": 0.008724776096642017, "learning_rate": 3.7995257951107846e-06, "loss": 3.0081, "step": 112700 }, { "epoch": 3.3200883002207506, "grad_norm": 0.014002680778503418, "learning_rate": 3.7831739023792006e-06, "loss": 3.0094, "step": 112800 }, { "epoch": 3.3230316409124354, "grad_norm": 0.005569032859057188, "learning_rate": 3.766822009647617e-06, "loss": 3.0075, "step": 112900 }, { "epoch": 3.3259749816041206, "grad_norm": 0.006643925327807665, "learning_rate": 3.7504701169160335e-06, "loss": 3.0113, "step": 113000 }, { "epoch": 3.3259749816041206, "eval_loss": 3.009493112564087, "eval_runtime": 1030.7941, "eval_samples_per_second": 263.675, "eval_steps_per_second": 8.24, "step": 113000 }, { "epoch": 3.328918322295806, "grad_norm": 0.00807524286210537, "learning_rate": 3.7342817431117655e-06, "loss": 3.0081, "step": 113100 }, { "epoch": 3.3318616629874906, "grad_norm": 0.01236091647297144, "learning_rate": 3.717929850380182e-06, "loss": 3.0094, "step": 113200 }, { "epoch": 3.334805003679176, "grad_norm": 0.007302865851670504, "learning_rate": 3.7015779576485984e-06, "loss": 3.0081, "step": 113300 }, { "epoch": 3.337748344370861, "grad_norm": 0.005653740838170052, "learning_rate": 3.685226064917015e-06, "loss": 3.0106, "step": 113400 }, { "epoch": 3.340691685062546, "grad_norm": 0.004335132427513599, "learning_rate": 3.668874172185431e-06, "loss": 3.0169, "step": 113500 }, { "epoch": 3.340691685062546, "eval_loss": 3.009492874145508, "eval_runtime": 695.0866, "eval_samples_per_second": 391.023, "eval_steps_per_second": 12.22, "step": 113500 }, { "epoch": 3.343635025754231, "grad_norm": 0.011461712419986725, "learning_rate": 3.6525222794538472e-06, "loss": 3.0056, "step": 113600 }, { "epoch": 3.3465783664459163, "grad_norm": 0.007389253471046686, "learning_rate": 3.6361703867222637e-06, "loss": 3.0081, "step": 113700 }, { "epoch": 3.349521707137601, "grad_norm": 0.010446279309689999, "learning_rate": 3.61981849399068e-06, "loss": 3.0069, "step": 113800 }, { "epoch": 3.3524650478292863, "grad_norm": 0.010799568146467209, "learning_rate": 3.603466601259096e-06, "loss": 3.0094, "step": 113900 }, { "epoch": 3.355408388520971, "grad_norm": 0.012215420603752136, "learning_rate": 3.587114708527512e-06, "loss": 3.0031, "step": 114000 }, { "epoch": 3.355408388520971, "eval_loss": 3.009492874145508, "eval_runtime": 722.7554, "eval_samples_per_second": 376.054, "eval_steps_per_second": 11.752, "step": 114000 }, { "epoch": 3.3583517292126563, "grad_norm": 0.006888228934258223, "learning_rate": 3.5707628157959286e-06, "loss": 3.0069, "step": 114100 }, { "epoch": 3.3612950699043416, "grad_norm": 0.0073467399924993515, "learning_rate": 3.5544109230643446e-06, "loss": 3.0075, "step": 114200 }, { "epoch": 3.3642384105960264, "grad_norm": 0.0061506531201303005, "learning_rate": 3.538059030332761e-06, "loss": 3.015, "step": 114300 }, { "epoch": 3.3671817512877116, "grad_norm": 0.008562905713915825, "learning_rate": 3.5217071376011774e-06, "loss": 3.0081, "step": 114400 }, { "epoch": 3.370125091979397, "grad_norm": 0.006173011846840382, "learning_rate": 3.505355244869594e-06, "loss": 3.0094, "step": 114500 }, { "epoch": 3.370125091979397, "eval_loss": 3.009493112564087, "eval_runtime": 706.6255, "eval_samples_per_second": 384.638, "eval_steps_per_second": 12.021, "step": 114500 }, { "epoch": 3.3730684326710816, "grad_norm": 0.012165830470621586, "learning_rate": 3.4890033521380103e-06, "loss": 3.0056, "step": 114600 }, { "epoch": 3.376011773362767, "grad_norm": 0.005966350436210632, "learning_rate": 3.4726514594064263e-06, "loss": 3.0081, "step": 114700 }, { "epoch": 3.3789551140544516, "grad_norm": 0.006823147181421518, "learning_rate": 3.4562995666748428e-06, "loss": 3.0119, "step": 114800 }, { "epoch": 3.381898454746137, "grad_norm": 0.008843391202390194, "learning_rate": 3.439947673943259e-06, "loss": 3.0075, "step": 114900 }, { "epoch": 3.384841795437822, "grad_norm": 0.012642895802855492, "learning_rate": 3.4235957812116756e-06, "loss": 3.0063, "step": 115000 }, { "epoch": 3.384841795437822, "eval_loss": 3.0097503662109375, "eval_runtime": 674.764, "eval_samples_per_second": 402.8, "eval_steps_per_second": 12.588, "step": 115000 }, { "epoch": 3.387785136129507, "grad_norm": 0.008255718275904655, "learning_rate": 3.4072438884800916e-06, "loss": 3.0144, "step": 115100 }, { "epoch": 3.390728476821192, "grad_norm": 0.009167641401290894, "learning_rate": 3.390891995748508e-06, "loss": 3.0138, "step": 115200 }, { "epoch": 3.3936718175128773, "grad_norm": 0.007253487128764391, "learning_rate": 3.3745401030169245e-06, "loss": 3.0081, "step": 115300 }, { "epoch": 3.396615158204562, "grad_norm": 0.0071833510883152485, "learning_rate": 3.358188210285341e-06, "loss": 3.0113, "step": 115400 }, { "epoch": 3.3995584988962473, "grad_norm": 0.004905087407678366, "learning_rate": 3.341836317553757e-06, "loss": 3.0138, "step": 115500 }, { "epoch": 3.3995584988962473, "eval_loss": 3.0097577571868896, "eval_runtime": 714.7055, "eval_samples_per_second": 380.29, "eval_steps_per_second": 11.885, "step": 115500 }, { "epoch": 3.4025018395879325, "grad_norm": 0.0061522894538939, "learning_rate": 3.3254844248221734e-06, "loss": 3.0081, "step": 115600 }, { "epoch": 3.4054451802796173, "grad_norm": 0.005075466353446245, "learning_rate": 3.30913253209059e-06, "loss": 3.0106, "step": 115700 }, { "epoch": 3.4083885209713025, "grad_norm": 0.004356270655989647, "learning_rate": 3.2927806393590062e-06, "loss": 3.0088, "step": 115800 }, { "epoch": 3.4113318616629873, "grad_norm": 0.005263475235551596, "learning_rate": 3.2764287466274227e-06, "loss": 3.0106, "step": 115900 }, { "epoch": 3.4142752023546725, "grad_norm": 0.006237916648387909, "learning_rate": 3.2600768538958387e-06, "loss": 3.0156, "step": 116000 }, { "epoch": 3.4142752023546725, "eval_loss": 3.00950026512146, "eval_runtime": 718.3873, "eval_samples_per_second": 378.34, "eval_steps_per_second": 11.824, "step": 116000 }, { "epoch": 3.4172185430463577, "grad_norm": 0.010489786975085735, "learning_rate": 3.243724961164255e-06, "loss": 3.0119, "step": 116100 }, { "epoch": 3.4201618837380425, "grad_norm": 0.006893331650644541, "learning_rate": 3.2273730684326716e-06, "loss": 3.01, "step": 116200 }, { "epoch": 3.4231052244297278, "grad_norm": 0.0070856367237865925, "learning_rate": 3.211021175701088e-06, "loss": 3.0144, "step": 116300 }, { "epoch": 3.426048565121413, "grad_norm": 0.009969787672162056, "learning_rate": 3.194669282969504e-06, "loss": 3.0131, "step": 116400 }, { "epoch": 3.4289919058130978, "grad_norm": 0.011481888592243195, "learning_rate": 3.1783173902379204e-06, "loss": 3.0131, "step": 116500 }, { "epoch": 3.4289919058130978, "eval_loss": 3.009721040725708, "eval_runtime": 711.3814, "eval_samples_per_second": 382.067, "eval_steps_per_second": 11.94, "step": 116500 }, { "epoch": 3.431935246504783, "grad_norm": 0.006305059418082237, "learning_rate": 3.161965497506337e-06, "loss": 3.0088, "step": 116600 }, { "epoch": 3.4348785871964678, "grad_norm": 0.010218881070613861, "learning_rate": 3.1456136047747533e-06, "loss": 3.0113, "step": 116700 }, { "epoch": 3.437821927888153, "grad_norm": 0.008223234675824642, "learning_rate": 3.129261712043169e-06, "loss": 3.0044, "step": 116800 }, { "epoch": 3.440765268579838, "grad_norm": 0.005670448299497366, "learning_rate": 3.1129098193115853e-06, "loss": 3.01, "step": 116900 }, { "epoch": 3.443708609271523, "grad_norm": 0.006958055309951305, "learning_rate": 3.0965579265800018e-06, "loss": 3.0069, "step": 117000 }, { "epoch": 3.443708609271523, "eval_loss": 3.009397268295288, "eval_runtime": 721.7065, "eval_samples_per_second": 376.6, "eval_steps_per_second": 11.769, "step": 117000 }, { "epoch": 3.4466519499632082, "grad_norm": 0.007549801375716925, "learning_rate": 3.080369552775734e-06, "loss": 3.0081, "step": 117100 }, { "epoch": 3.4495952906548935, "grad_norm": 0.004502241965383291, "learning_rate": 3.0640176600441502e-06, "loss": 3.0125, "step": 117200 }, { "epoch": 3.4525386313465782, "grad_norm": 0.007807764690369368, "learning_rate": 3.0476657673125667e-06, "loss": 3.0069, "step": 117300 }, { "epoch": 3.4554819720382635, "grad_norm": 0.007696804124861956, "learning_rate": 3.0313138745809827e-06, "loss": 3.0063, "step": 117400 }, { "epoch": 3.4584253127299487, "grad_norm": 0.015226827934384346, "learning_rate": 3.014961981849399e-06, "loss": 3.0044, "step": 117500 }, { "epoch": 3.4584253127299487, "eval_loss": 3.0095224380493164, "eval_runtime": 720.5736, "eval_samples_per_second": 377.193, "eval_steps_per_second": 11.788, "step": 117500 }, { "epoch": 3.4613686534216335, "grad_norm": 0.010318039916455746, "learning_rate": 2.9986100891178155e-06, "loss": 3.0119, "step": 117600 }, { "epoch": 3.4643119941133187, "grad_norm": 0.0058851661160588264, "learning_rate": 2.982258196386232e-06, "loss": 3.0081, "step": 117700 }, { "epoch": 3.467255334805004, "grad_norm": 0.006086741574108601, "learning_rate": 2.965906303654648e-06, "loss": 3.0081, "step": 117800 }, { "epoch": 3.4701986754966887, "grad_norm": 0.014253015629947186, "learning_rate": 2.9495544109230644e-06, "loss": 3.0106, "step": 117900 }, { "epoch": 3.473142016188374, "grad_norm": 0.007352728862315416, "learning_rate": 2.933202518191481e-06, "loss": 3.0125, "step": 118000 }, { "epoch": 3.473142016188374, "eval_loss": 3.0095224380493164, "eval_runtime": 721.8039, "eval_samples_per_second": 376.55, "eval_steps_per_second": 11.768, "step": 118000 }, { "epoch": 3.4760853568800587, "grad_norm": 0.0040458994917571545, "learning_rate": 2.9168506254598973e-06, "loss": 3.0138, "step": 118100 }, { "epoch": 3.479028697571744, "grad_norm": 0.008429568260908127, "learning_rate": 2.9004987327283133e-06, "loss": 3.0106, "step": 118200 }, { "epoch": 3.481972038263429, "grad_norm": 0.010222744196653366, "learning_rate": 2.8841468399967297e-06, "loss": 3.0144, "step": 118300 }, { "epoch": 3.484915378955114, "grad_norm": 0.006831009406596422, "learning_rate": 2.867794947265146e-06, "loss": 3.0081, "step": 118400 }, { "epoch": 3.487858719646799, "grad_norm": 0.0050921314395964146, "learning_rate": 2.8514430545335626e-06, "loss": 3.01, "step": 118500 }, { "epoch": 3.487858719646799, "eval_loss": 3.0095224380493164, "eval_runtime": 733.4473, "eval_samples_per_second": 370.572, "eval_steps_per_second": 11.581, "step": 118500 }, { "epoch": 3.490802060338484, "grad_norm": 0.007430725265294313, "learning_rate": 2.835091161801979e-06, "loss": 3.0075, "step": 118600 }, { "epoch": 3.493745401030169, "grad_norm": 0.008209146559238434, "learning_rate": 2.818739269070395e-06, "loss": 3.0056, "step": 118700 }, { "epoch": 3.4966887417218544, "grad_norm": 0.0056533776223659515, "learning_rate": 2.8023873763388115e-06, "loss": 3.0069, "step": 118800 }, { "epoch": 3.499632082413539, "grad_norm": 0.012093271128833294, "learning_rate": 2.786035483607228e-06, "loss": 3.0094, "step": 118900 }, { "epoch": 3.5025754231052244, "grad_norm": 0.008876970037817955, "learning_rate": 2.7696835908756444e-06, "loss": 3.0119, "step": 119000 }, { "epoch": 3.5025754231052244, "eval_loss": 3.0095224380493164, "eval_runtime": 729.7516, "eval_samples_per_second": 372.449, "eval_steps_per_second": 11.64, "step": 119000 }, { "epoch": 3.5055187637969096, "grad_norm": 0.012612666003406048, "learning_rate": 2.7534952170713764e-06, "loss": 3.0038, "step": 119100 }, { "epoch": 3.5084621044885944, "grad_norm": 0.007383661810308695, "learning_rate": 2.737143324339793e-06, "loss": 3.025, "step": 119200 }, { "epoch": 3.5114054451802796, "grad_norm": 0.00930376909673214, "learning_rate": 2.720791431608209e-06, "loss": 3.0081, "step": 119300 }, { "epoch": 3.514348785871965, "grad_norm": 0.007331425789743662, "learning_rate": 2.7044395388766253e-06, "loss": 3.0119, "step": 119400 }, { "epoch": 3.5172921265636496, "grad_norm": 0.006421847268939018, "learning_rate": 2.6880876461450417e-06, "loss": 3.005, "step": 119500 }, { "epoch": 3.5172921265636496, "eval_loss": 3.0095224380493164, "eval_runtime": 714.9827, "eval_samples_per_second": 380.142, "eval_steps_per_second": 11.88, "step": 119500 }, { "epoch": 3.520235467255335, "grad_norm": 0.007157259155064821, "learning_rate": 2.671735753413458e-06, "loss": 3.01, "step": 119600 }, { "epoch": 3.52317880794702, "grad_norm": 0.005451077129691839, "learning_rate": 2.6553838606818746e-06, "loss": 3.0025, "step": 119700 }, { "epoch": 3.526122148638705, "grad_norm": 0.007249257992953062, "learning_rate": 2.6390319679502906e-06, "loss": 3.0088, "step": 119800 }, { "epoch": 3.52906548933039, "grad_norm": 0.005024211946874857, "learning_rate": 2.622680075218707e-06, "loss": 3.0106, "step": 119900 }, { "epoch": 3.5320088300220753, "grad_norm": 0.008244294673204422, "learning_rate": 2.606328182487123e-06, "loss": 3.0138, "step": 120000 }, { "epoch": 3.5320088300220753, "eval_loss": 3.0095224380493164, "eval_runtime": 724.0218, "eval_samples_per_second": 375.396, "eval_steps_per_second": 11.732, "step": 120000 }, { "epoch": 3.53495217071376, "grad_norm": 0.004973077680915594, "learning_rate": 2.589976289755539e-06, "loss": 3.0056, "step": 120100 }, { "epoch": 3.5378955114054453, "grad_norm": 0.00688886409625411, "learning_rate": 2.5736243970239555e-06, "loss": 3.0088, "step": 120200 }, { "epoch": 3.54083885209713, "grad_norm": 0.012771316803991795, "learning_rate": 2.557272504292372e-06, "loss": 3.0125, "step": 120300 }, { "epoch": 3.5437821927888153, "grad_norm": 0.005449370015412569, "learning_rate": 2.5409206115607883e-06, "loss": 3.0125, "step": 120400 }, { "epoch": 3.5467255334805, "grad_norm": 0.006475921254605055, "learning_rate": 2.5245687188292043e-06, "loss": 3.0056, "step": 120500 }, { "epoch": 3.5467255334805, "eval_loss": 3.0095224380493164, "eval_runtime": 731.564, "eval_samples_per_second": 371.526, "eval_steps_per_second": 11.611, "step": 120500 }, { "epoch": 3.5496688741721854, "grad_norm": 0.00998611282557249, "learning_rate": 2.5082168260976208e-06, "loss": 3.0131, "step": 120600 }, { "epoch": 3.5526122148638706, "grad_norm": 0.009240238927304745, "learning_rate": 2.4918649333660372e-06, "loss": 3.0119, "step": 120700 }, { "epoch": 3.5555555555555554, "grad_norm": 0.005599851720035076, "learning_rate": 2.4755130406344537e-06, "loss": 3.0094, "step": 120800 }, { "epoch": 3.5584988962472406, "grad_norm": 0.010330124758183956, "learning_rate": 2.4591611479028697e-06, "loss": 3.0106, "step": 120900 }, { "epoch": 3.561442236938926, "grad_norm": 0.00612435769289732, "learning_rate": 2.442809255171286e-06, "loss": 3.0113, "step": 121000 }, { "epoch": 3.561442236938926, "eval_loss": 3.0095224380493164, "eval_runtime": 686.4441, "eval_samples_per_second": 395.946, "eval_steps_per_second": 12.374, "step": 121000 }, { "epoch": 3.5643855776306106, "grad_norm": 0.006482547149062157, "learning_rate": 2.4266208813670186e-06, "loss": 3.0106, "step": 121100 }, { "epoch": 3.567328918322296, "grad_norm": 0.005151326768100262, "learning_rate": 2.4102689886354346e-06, "loss": 3.0156, "step": 121200 }, { "epoch": 3.570272259013981, "grad_norm": 0.011095167137682438, "learning_rate": 2.393917095903851e-06, "loss": 3.0069, "step": 121300 }, { "epoch": 3.573215599705666, "grad_norm": 0.008724207989871502, "learning_rate": 2.3775652031722674e-06, "loss": 3.0125, "step": 121400 }, { "epoch": 3.576158940397351, "grad_norm": 0.006806240417063236, "learning_rate": 2.361213310440684e-06, "loss": 3.0069, "step": 121500 }, { "epoch": 3.576158940397351, "eval_loss": 3.0094854831695557, "eval_runtime": 726.9654, "eval_samples_per_second": 373.876, "eval_steps_per_second": 11.684, "step": 121500 }, { "epoch": 3.5791022810890363, "grad_norm": 0.014759100042283535, "learning_rate": 2.3448614177091e-06, "loss": 3.01, "step": 121600 }, { "epoch": 3.582045621780721, "grad_norm": 0.007006676867604256, "learning_rate": 2.3285095249775163e-06, "loss": 3.0119, "step": 121700 }, { "epoch": 3.5849889624724063, "grad_norm": 0.007907293736934662, "learning_rate": 2.3121576322459327e-06, "loss": 3.0088, "step": 121800 }, { "epoch": 3.5879323031640915, "grad_norm": 0.005841423291712999, "learning_rate": 2.295805739514349e-06, "loss": 3.0119, "step": 121900 }, { "epoch": 3.5908756438557763, "grad_norm": 0.008063334971666336, "learning_rate": 2.279453846782765e-06, "loss": 3.0069, "step": 122000 }, { "epoch": 3.5908756438557763, "eval_loss": 3.0094854831695557, "eval_runtime": 708.8856, "eval_samples_per_second": 383.412, "eval_steps_per_second": 11.982, "step": 122000 }, { "epoch": 3.5938189845474615, "grad_norm": 0.008008514530956745, "learning_rate": 2.2631019540511816e-06, "loss": 3.0069, "step": 122100 }, { "epoch": 3.5967623252391463, "grad_norm": 0.009769214317202568, "learning_rate": 2.246750061319598e-06, "loss": 3.0138, "step": 122200 }, { "epoch": 3.5997056659308315, "grad_norm": 0.005510885734111071, "learning_rate": 2.230398168588014e-06, "loss": 3.01, "step": 122300 }, { "epoch": 3.6026490066225163, "grad_norm": 0.004009539261460304, "learning_rate": 2.2140462758564305e-06, "loss": 3.0106, "step": 122400 }, { "epoch": 3.6055923473142015, "grad_norm": 0.0056317588314414024, "learning_rate": 2.197694383124847e-06, "loss": 3.0113, "step": 122500 }, { "epoch": 3.6055923473142015, "eval_loss": 3.009507656097412, "eval_runtime": 706.0076, "eval_samples_per_second": 384.975, "eval_steps_per_second": 12.031, "step": 122500 }, { "epoch": 3.6085356880058868, "grad_norm": 0.0074787321500480175, "learning_rate": 2.181342490393263e-06, "loss": 3.01, "step": 122600 }, { "epoch": 3.6114790286975715, "grad_norm": 0.0036889142356812954, "learning_rate": 2.1649905976616794e-06, "loss": 3.005, "step": 122700 }, { "epoch": 3.6144223693892568, "grad_norm": 0.004288042895495892, "learning_rate": 2.148638704930096e-06, "loss": 3.0069, "step": 122800 }, { "epoch": 3.617365710080942, "grad_norm": 0.004678983706980944, "learning_rate": 2.1322868121985122e-06, "loss": 3.0094, "step": 122900 }, { "epoch": 3.6203090507726268, "grad_norm": 0.006774542853236198, "learning_rate": 2.1159349194669283e-06, "loss": 3.0119, "step": 123000 }, { "epoch": 3.6203090507726268, "eval_loss": 3.0095295906066895, "eval_runtime": 716.852, "eval_samples_per_second": 379.151, "eval_steps_per_second": 11.849, "step": 123000 }, { "epoch": 3.623252391464312, "grad_norm": 0.006680713500827551, "learning_rate": 2.0995830267353447e-06, "loss": 3.0056, "step": 123100 }, { "epoch": 3.626195732155997, "grad_norm": 0.004762918688356876, "learning_rate": 2.083231134003761e-06, "loss": 3.0075, "step": 123200 }, { "epoch": 3.629139072847682, "grad_norm": 0.01290228869765997, "learning_rate": 2.067042760199493e-06, "loss": 3.0106, "step": 123300 }, { "epoch": 3.6320824135393672, "grad_norm": 0.00804790947586298, "learning_rate": 2.0506908674679096e-06, "loss": 3.005, "step": 123400 }, { "epoch": 3.6350257542310525, "grad_norm": 0.0049733552150428295, "learning_rate": 2.034338974736326e-06, "loss": 3.0081, "step": 123500 }, { "epoch": 3.6350257542310525, "eval_loss": 3.009507656097412, "eval_runtime": 702.3186, "eval_samples_per_second": 386.997, "eval_steps_per_second": 12.094, "step": 123500 }, { "epoch": 3.6379690949227372, "grad_norm": 0.007307144813239574, "learning_rate": 2.0179870820047425e-06, "loss": 3.02, "step": 123600 }, { "epoch": 3.6409124356144225, "grad_norm": 0.004135403316468, "learning_rate": 2.0016351892731585e-06, "loss": 3.0094, "step": 123700 }, { "epoch": 3.6438557763061077, "grad_norm": 0.0045571899972856045, "learning_rate": 1.985283296541575e-06, "loss": 3.0119, "step": 123800 }, { "epoch": 3.6467991169977925, "grad_norm": 0.010028069838881493, "learning_rate": 1.968931403809991e-06, "loss": 3.0106, "step": 123900 }, { "epoch": 3.6497424576894777, "grad_norm": 0.0068875872530043125, "learning_rate": 1.9525795110784074e-06, "loss": 3.0125, "step": 124000 }, { "epoch": 3.6497424576894777, "eval_loss": 3.009507656097412, "eval_runtime": 700.339, "eval_samples_per_second": 388.091, "eval_steps_per_second": 12.128, "step": 124000 }, { "epoch": 3.6526857983811625, "grad_norm": 0.004337831400334835, "learning_rate": 1.936227618346824e-06, "loss": 3.0125, "step": 124100 }, { "epoch": 3.6556291390728477, "grad_norm": 0.004582313355058432, "learning_rate": 1.9198757256152402e-06, "loss": 3.0188, "step": 124200 }, { "epoch": 3.6585724797645325, "grad_norm": 0.004025149624794722, "learning_rate": 1.9035238328836564e-06, "loss": 3.01, "step": 124300 }, { "epoch": 3.6615158204562177, "grad_norm": 0.008733101189136505, "learning_rate": 1.8871719401520727e-06, "loss": 3.0088, "step": 124400 }, { "epoch": 3.664459161147903, "grad_norm": 0.00574888288974762, "learning_rate": 1.870820047420489e-06, "loss": 3.0169, "step": 124500 }, { "epoch": 3.664459161147903, "eval_loss": 3.0095295906066895, "eval_runtime": 696.3431, "eval_samples_per_second": 390.318, "eval_steps_per_second": 12.198, "step": 124500 }, { "epoch": 3.6674025018395877, "grad_norm": 0.007303401827812195, "learning_rate": 1.8544681546889053e-06, "loss": 3.0113, "step": 124600 }, { "epoch": 3.670345842531273, "grad_norm": 0.007154214195907116, "learning_rate": 1.8381162619573218e-06, "loss": 3.0063, "step": 124700 }, { "epoch": 3.673289183222958, "grad_norm": 0.006601954810321331, "learning_rate": 1.821764369225738e-06, "loss": 3.0094, "step": 124800 }, { "epoch": 3.676232523914643, "grad_norm": 0.004953761585056782, "learning_rate": 1.8054124764941544e-06, "loss": 3.0038, "step": 124900 }, { "epoch": 3.679175864606328, "grad_norm": 0.005694480147212744, "learning_rate": 1.7890605837625708e-06, "loss": 3.0106, "step": 125000 }, { "epoch": 3.679175864606328, "eval_loss": 3.0090808868408203, "eval_runtime": 709.0885, "eval_samples_per_second": 383.302, "eval_steps_per_second": 11.979, "step": 125000 }, { "epoch": 3.6821192052980134, "grad_norm": 0.005826183594763279, "learning_rate": 1.772708691030987e-06, "loss": 3.005, "step": 125100 }, { "epoch": 3.685062545989698, "grad_norm": 0.006321778055280447, "learning_rate": 1.7563567982994035e-06, "loss": 3.0081, "step": 125200 }, { "epoch": 3.6880058866813834, "grad_norm": 0.005246689077466726, "learning_rate": 1.7400049055678195e-06, "loss": 3.0075, "step": 125300 }, { "epoch": 3.6909492273730686, "grad_norm": 0.0031371659133583307, "learning_rate": 1.7236530128362357e-06, "loss": 3.0131, "step": 125400 }, { "epoch": 3.6938925680647534, "grad_norm": 0.006758968811482191, "learning_rate": 1.7073011201046522e-06, "loss": 3.0075, "step": 125500 }, { "epoch": 3.6938925680647534, "eval_loss": 3.0090808868408203, "eval_runtime": 693.061, "eval_samples_per_second": 392.166, "eval_steps_per_second": 12.256, "step": 125500 }, { "epoch": 3.6968359087564386, "grad_norm": 0.011111920699477196, "learning_rate": 1.6911127463003842e-06, "loss": 3.0131, "step": 125600 }, { "epoch": 3.699779249448124, "grad_norm": 0.004824851639568806, "learning_rate": 1.6747608535688006e-06, "loss": 3.01, "step": 125700 }, { "epoch": 3.7027225901398086, "grad_norm": 0.008264412172138691, "learning_rate": 1.658408960837217e-06, "loss": 3.0075, "step": 125800 }, { "epoch": 3.705665930831494, "grad_norm": 0.003976735752075911, "learning_rate": 1.6420570681056333e-06, "loss": 3.0113, "step": 125900 }, { "epoch": 3.7086092715231787, "grad_norm": 0.008425184525549412, "learning_rate": 1.6257051753740497e-06, "loss": 3.0094, "step": 126000 }, { "epoch": 3.7086092715231787, "eval_loss": 3.0090808868408203, "eval_runtime": 680.0046, "eval_samples_per_second": 399.696, "eval_steps_per_second": 12.491, "step": 126000 }, { "epoch": 3.711552612214864, "grad_norm": 0.004391680005937815, "learning_rate": 1.609353282642466e-06, "loss": 3.0081, "step": 126100 }, { "epoch": 3.7144959529065487, "grad_norm": 0.004432505462318659, "learning_rate": 1.5930013899108824e-06, "loss": 3.0119, "step": 126200 }, { "epoch": 3.717439293598234, "grad_norm": 0.005312929395586252, "learning_rate": 1.5766494971792986e-06, "loss": 3.0088, "step": 126300 }, { "epoch": 3.720382634289919, "grad_norm": 0.010315795429050922, "learning_rate": 1.560297604447715e-06, "loss": 3.0063, "step": 126400 }, { "epoch": 3.723325974981604, "grad_norm": 0.008698856458067894, "learning_rate": 1.5439457117161313e-06, "loss": 3.0081, "step": 126500 }, { "epoch": 3.723325974981604, "eval_loss": 3.0090808868408203, "eval_runtime": 666.6557, "eval_samples_per_second": 407.699, "eval_steps_per_second": 12.741, "step": 126500 }, { "epoch": 3.726269315673289, "grad_norm": 0.005269891116768122, "learning_rate": 1.5275938189845477e-06, "loss": 3.0125, "step": 126600 }, { "epoch": 3.7292126563649743, "grad_norm": 0.007758379448205233, "learning_rate": 1.511241926252964e-06, "loss": 3.0125, "step": 126700 }, { "epoch": 3.732155997056659, "grad_norm": 0.008820072747766972, "learning_rate": 1.4948900335213804e-06, "loss": 3.0131, "step": 126800 }, { "epoch": 3.7350993377483444, "grad_norm": 0.005608238745480776, "learning_rate": 1.4785381407897964e-06, "loss": 3.0106, "step": 126900 }, { "epoch": 3.7380426784400296, "grad_norm": 0.0036508748307824135, "learning_rate": 1.4621862480582128e-06, "loss": 3.0088, "step": 127000 }, { "epoch": 3.7380426784400296, "eval_loss": 3.0090808868408203, "eval_runtime": 660.3656, "eval_samples_per_second": 411.583, "eval_steps_per_second": 12.863, "step": 127000 }, { "epoch": 3.7409860191317144, "grad_norm": 0.006226410623639822, "learning_rate": 1.445834355326629e-06, "loss": 3.0113, "step": 127100 }, { "epoch": 3.7439293598233996, "grad_norm": 0.005639576353132725, "learning_rate": 1.4294824625950455e-06, "loss": 3.0125, "step": 127200 }, { "epoch": 3.746872700515085, "grad_norm": 0.007343409117311239, "learning_rate": 1.4131305698634617e-06, "loss": 3.0094, "step": 127300 }, { "epoch": 3.7498160412067696, "grad_norm": 0.004069926217198372, "learning_rate": 1.3967786771318781e-06, "loss": 3.0069, "step": 127400 }, { "epoch": 3.752759381898455, "grad_norm": 0.004760729614645243, "learning_rate": 1.3804267844002943e-06, "loss": 3.0088, "step": 127500 }, { "epoch": 3.752759381898455, "eval_loss": 3.0090808868408203, "eval_runtime": 647.1193, "eval_samples_per_second": 420.008, "eval_steps_per_second": 13.126, "step": 127500 }, { "epoch": 3.75570272259014, "grad_norm": 0.0034565567038953304, "learning_rate": 1.3640748916687108e-06, "loss": 3.0163, "step": 127600 }, { "epoch": 3.758646063281825, "grad_norm": 0.00694636907428503, "learning_rate": 1.3477229989371272e-06, "loss": 3.0094, "step": 127700 }, { "epoch": 3.76158940397351, "grad_norm": 0.01046943012624979, "learning_rate": 1.3315346251328592e-06, "loss": 3.0069, "step": 127800 }, { "epoch": 3.764532744665195, "grad_norm": 0.00863370019942522, "learning_rate": 1.3151827324012757e-06, "loss": 3.0063, "step": 127900 }, { "epoch": 3.76747608535688, "grad_norm": 0.0049459547735750675, "learning_rate": 1.298830839669692e-06, "loss": 3.0094, "step": 128000 }, { "epoch": 3.76747608535688, "eval_loss": 3.0090808868408203, "eval_runtime": 645.2225, "eval_samples_per_second": 421.242, "eval_steps_per_second": 13.164, "step": 128000 }, { "epoch": 3.7704194260485653, "grad_norm": 0.0048855566419661045, "learning_rate": 1.2824789469381083e-06, "loss": 3.01, "step": 128100 }, { "epoch": 3.77336276674025, "grad_norm": 0.005270547699183226, "learning_rate": 1.2661270542065246e-06, "loss": 3.015, "step": 128200 }, { "epoch": 3.7763061074319353, "grad_norm": 0.004688181448727846, "learning_rate": 1.2497751614749408e-06, "loss": 3.0163, "step": 128300 }, { "epoch": 3.77924944812362, "grad_norm": 0.0038302906323224306, "learning_rate": 1.2334232687433572e-06, "loss": 3.0106, "step": 128400 }, { "epoch": 3.7821927888153053, "grad_norm": 0.008955448865890503, "learning_rate": 1.2170713760117734e-06, "loss": 3.0113, "step": 128500 }, { "epoch": 3.7821927888153053, "eval_loss": 3.0090808868408203, "eval_runtime": 648.6086, "eval_samples_per_second": 419.043, "eval_steps_per_second": 13.096, "step": 128500 }, { "epoch": 3.7851361295069905, "grad_norm": 0.007574366871267557, "learning_rate": 1.2007194832801899e-06, "loss": 3.0069, "step": 128600 }, { "epoch": 3.7880794701986753, "grad_norm": 0.005190542433410883, "learning_rate": 1.184367590548606e-06, "loss": 3.0113, "step": 128700 }, { "epoch": 3.7910228108903605, "grad_norm": 0.00397200882434845, "learning_rate": 1.1680156978170225e-06, "loss": 3.0063, "step": 128800 }, { "epoch": 3.7939661515820458, "grad_norm": 0.005643351934850216, "learning_rate": 1.1516638050854387e-06, "loss": 3.0088, "step": 128900 }, { "epoch": 3.7969094922737305, "grad_norm": 0.0047252788208425045, "learning_rate": 1.135311912353855e-06, "loss": 3.0019, "step": 129000 }, { "epoch": 3.7969094922737305, "eval_loss": 3.0090808868408203, "eval_runtime": 641.2312, "eval_samples_per_second": 423.864, "eval_steps_per_second": 13.246, "step": 129000 }, { "epoch": 3.7998528329654158, "grad_norm": 0.005014285445213318, "learning_rate": 1.1189600196222714e-06, "loss": 3.0094, "step": 129100 }, { "epoch": 3.802796173657101, "grad_norm": 0.00693289702758193, "learning_rate": 1.1026081268906876e-06, "loss": 3.0038, "step": 129200 }, { "epoch": 3.8057395143487858, "grad_norm": 0.0036394798662513494, "learning_rate": 1.086256234159104e-06, "loss": 3.0044, "step": 129300 }, { "epoch": 3.808682855040471, "grad_norm": 0.005163101013749838, "learning_rate": 1.0699043414275203e-06, "loss": 3.0088, "step": 129400 }, { "epoch": 3.811626195732156, "grad_norm": 0.0048975031822919846, "learning_rate": 1.0535524486959367e-06, "loss": 3.0113, "step": 129500 }, { "epoch": 3.811626195732156, "eval_loss": 3.0090882778167725, "eval_runtime": 637.8948, "eval_samples_per_second": 426.081, "eval_steps_per_second": 13.316, "step": 129500 }, { "epoch": 3.814569536423841, "grad_norm": 0.0058024791069328785, "learning_rate": 1.037200555964353e-06, "loss": 3.0094, "step": 129600 }, { "epoch": 3.8175128771155262, "grad_norm": 0.007251008879393339, "learning_rate": 1.0208486632327692e-06, "loss": 3.0088, "step": 129700 }, { "epoch": 3.8204562178072115, "grad_norm": 0.006669306196272373, "learning_rate": 1.0044967705011856e-06, "loss": 3.0113, "step": 129800 }, { "epoch": 3.8233995584988962, "grad_norm": 0.005515568424016237, "learning_rate": 9.881448777696018e-07, "loss": 3.0094, "step": 129900 }, { "epoch": 3.8263428991905815, "grad_norm": 0.005022202152758837, "learning_rate": 9.717929850380183e-07, "loss": 3.0069, "step": 130000 }, { "epoch": 3.8263428991905815, "eval_loss": 3.0090882778167725, "eval_runtime": 640.2425, "eval_samples_per_second": 424.519, "eval_steps_per_second": 13.267, "step": 130000 }, { "epoch": 3.8292862398822662, "grad_norm": 0.006356573663651943, "learning_rate": 9.556046112337505e-07, "loss": 3.0113, "step": 130100 }, { "epoch": 3.8322295805739515, "grad_norm": 0.003855107817798853, "learning_rate": 9.392527185021667e-07, "loss": 3.0081, "step": 130200 }, { "epoch": 3.8351729212656362, "grad_norm": 0.005638912785798311, "learning_rate": 9.22900825770583e-07, "loss": 3.0125, "step": 130300 }, { "epoch": 3.8381162619573215, "grad_norm": 0.0029610078781843185, "learning_rate": 9.065489330389994e-07, "loss": 3.0156, "step": 130400 }, { "epoch": 3.8410596026490067, "grad_norm": 0.004076396115124226, "learning_rate": 8.901970403074156e-07, "loss": 3.0069, "step": 130500 }, { "epoch": 3.8410596026490067, "eval_loss": 3.0090882778167725, "eval_runtime": 630.1265, "eval_samples_per_second": 431.334, "eval_steps_per_second": 13.48, "step": 130500 }, { "epoch": 3.8440029433406915, "grad_norm": 0.004032805096358061, "learning_rate": 8.738451475758319e-07, "loss": 3.0131, "step": 130600 }, { "epoch": 3.8469462840323767, "grad_norm": 0.00947653315961361, "learning_rate": 8.574932548442483e-07, "loss": 3.0131, "step": 130700 }, { "epoch": 3.849889624724062, "grad_norm": 0.003898181486874819, "learning_rate": 8.411413621126646e-07, "loss": 3.005, "step": 130800 }, { "epoch": 3.8528329654157467, "grad_norm": 0.004422783385962248, "learning_rate": 8.247894693810809e-07, "loss": 3.0106, "step": 130900 }, { "epoch": 3.855776306107432, "grad_norm": 0.0044276174157857895, "learning_rate": 8.084375766494972e-07, "loss": 3.0119, "step": 131000 }, { "epoch": 3.855776306107432, "eval_loss": 3.0088894367218018, "eval_runtime": 614.9598, "eval_samples_per_second": 441.972, "eval_steps_per_second": 13.812, "step": 131000 }, { "epoch": 3.858719646799117, "grad_norm": 0.007576746866106987, "learning_rate": 7.920856839179137e-07, "loss": 3.0081, "step": 131100 }, { "epoch": 3.861662987490802, "grad_norm": 0.0033155162818729877, "learning_rate": 7.757337911863298e-07, "loss": 3.0088, "step": 131200 }, { "epoch": 3.864606328182487, "grad_norm": 0.004428409039974213, "learning_rate": 7.593818984547461e-07, "loss": 3.0075, "step": 131300 }, { "epoch": 3.8675496688741724, "grad_norm": 0.004841793328523636, "learning_rate": 7.430300057231626e-07, "loss": 3.0056, "step": 131400 } ], "logging_steps": 100, "max_steps": 135900, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }